From 5de7ed3d5667c377b70d5b327ab2405b386611c2 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Tue, 12 Aug 2025 12:25:02 -0300 Subject: [PATCH] WIP: update stable-diffusion.cpp to 5900ef6605c6 (new API) (#1669) * Update stable-diffusion.cpp to 5900ef6605c6 (new API) * Clean up pending LoRA code and simplify LoRA changes to upstream * Move VAE tiling disabling for TAESD to sdtype_adapter.cpp * Move auxiliary ctx functions to sdtype_adapter.cpp * Use ref_images parameter for Kontext images * Drop clip skip workaround (fixed upstream) * Workaround for flash attention with img2img leejet/stable-diffusion.cpp#756 * Workaround for Chroma with flash attention, debug prints * Disable forcing CLIP weights to F32 for reduced memory usage --- otherarch/sdcpp/clip.hpp | 44 +- otherarch/sdcpp/common.hpp | 14 +- otherarch/sdcpp/conditioner.hpp | 73 +- otherarch/sdcpp/control.hpp | 15 +- otherarch/sdcpp/denoiser.hpp | 133 ++- otherarch/sdcpp/diffusion_model.hpp | 26 +- otherarch/sdcpp/esrgan.hpp | 17 +- otherarch/sdcpp/flux.hpp | 55 +- otherarch/sdcpp/ggml_extend.hpp | 147 ++- otherarch/sdcpp/gits_noise.inl | 2 +- otherarch/sdcpp/lora.hpp | 8 +- otherarch/sdcpp/main.cpp | 1159 +++++++++----------- otherarch/sdcpp/mmdit.hpp | 16 +- otherarch/sdcpp/model.cpp | 233 +++- otherarch/sdcpp/model.h | 33 +- otherarch/sdcpp/pmid.hpp | 7 +- otherarch/sdcpp/sdtype_adapter.cpp | 222 ++-- otherarch/sdcpp/stable-diffusion.cpp | 1508 ++++++++++++++------------ otherarch/sdcpp/stable-diffusion.h | 288 +++-- otherarch/sdcpp/t5.hpp | 22 +- otherarch/sdcpp/tae.hpp | 15 +- otherarch/sdcpp/unet.hpp | 21 +- otherarch/sdcpp/upscaler.cpp | 20 +- otherarch/sdcpp/util.cpp | 27 +- otherarch/sdcpp/util.h | 3 + otherarch/sdcpp/vae.hpp | 17 +- 26 files changed, 2255 insertions(+), 1870 deletions(-) diff --git a/otherarch/sdcpp/clip.hpp b/otherarch/sdcpp/clip.hpp index cdc27e727..0696220f9 100644 --- a/otherarch/sdcpp/clip.hpp +++ b/otherarch/sdcpp/clip.hpp @@ -545,9 +545,15 @@ protected: int64_t vocab_size; 
int64_t num_positions; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type token_wtype = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32; - enum ggml_type position_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + enum ggml_type token_wtype = GGML_TYPE_F32; + #if 1 + // kcpp reduce memory usage (reverts https://github.com/leejet/stable-diffusion.cpp/pull/601) + auto tensor_type = tensor_types.find(prefix + "token_embedding.weight"); + if (tensor_type != tensor_types.end()) + token_wtype = tensor_type->second; + #endif + enum ggml_type position_wtype = GGML_TYPE_F32; params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size); params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions); @@ -594,10 +600,10 @@ protected: int64_t image_size; int64_t num_patches; int64_t num_positions; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type patch_wtype = GGML_TYPE_F16; // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16; - enum ggml_type class_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32; - enum ggml_type position_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? 
tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + enum ggml_type patch_wtype = GGML_TYPE_F16; + enum ggml_type class_wtype = GGML_TYPE_F32; + enum ggml_type position_wtype = GGML_TYPE_F32; params["patch_embedding.weight"] = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim); params["class_embedding"] = ggml_new_tensor_1d(ctx, class_wtype, embed_dim); @@ -657,9 +663,9 @@ enum CLIPVersion { class CLIPTextModel : public GGMLBlock { protected: - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { if (version == OPEN_CLIP_VIT_BIGG_14) { - enum ggml_type wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32; + enum ggml_type wtype = GGML_TYPE_F32; params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size); } } @@ -678,8 +684,8 @@ public: bool with_final_ln = true; CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, - int clip_skip_value = -1, - bool with_final_ln = true) + bool with_final_ln = true, + int clip_skip_value = -1) : version(version), with_final_ln(with_final_ln) { if (version == OPEN_CLIP_VIT_H_14) { hidden_size = 1024; @@ -701,7 +707,7 @@ public: void set_clip_skip(int skip) { if (skip <= 0) { - return; + skip = -1; } clip_skip = skip; } @@ -805,8 +811,8 @@ protected: int64_t out_features; bool transpose_weight; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? 
tensor_types[prefix + "weight"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); if (transpose_weight) { params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features); } else { @@ -868,12 +874,12 @@ struct CLIPTextModelRunner : public GGMLRunner { CLIPTextModel model; CLIPTextModelRunner(ggml_backend_t backend, - std::map& tensor_types, + const String2GGMLType& tensor_types, const std::string prefix, CLIPVersion version = OPENAI_CLIP_VIT_L_14, - int clip_skip_value = 1, - bool with_final_ln = true) - : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) { + bool with_final_ln = true, + int clip_skip_value = -1) + : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) { model.init(params_ctx, tensor_types, prefix); } @@ -949,4 +955,4 @@ struct CLIPTextModelRunner : public GGMLRunner { } }; -#endif // __CLIP_HPP__ \ No newline at end of file +#endif // __CLIP_HPP__ diff --git a/otherarch/sdcpp/common.hpp b/otherarch/sdcpp/common.hpp index 32250d763..3a1307767 100644 --- a/otherarch/sdcpp/common.hpp +++ b/otherarch/sdcpp/common.hpp @@ -56,8 +56,8 @@ public: // x: [N, channels, h, w] auto conv = std::dynamic_pointer_cast(blocks["conv"]); - x = ggml_upscale(ctx, x, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2] - x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] + x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2] + x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] return x; } }; @@ -182,9 +182,9 @@ protected: int64_t dim_in; int64_t dim_out; - void init_params(struct ggml_context* ctx, std::map& tensor_types, std::string prefix = "") { - enum ggml_type wtype = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? 
tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32; - enum ggml_type bias_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { + enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32); + enum ggml_type bias_wtype = GGML_TYPE_F32; params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2); params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2); } @@ -440,9 +440,9 @@ public: class AlphaBlender : public GGMLBlock { protected: - void init_params(struct ggml_context* ctx, std::map& tensor_types, std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix - enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.ypes.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32; + enum ggml_type wtype = GGML_TYPE_F32; params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); } diff --git a/otherarch/sdcpp/conditioner.hpp b/otherarch/sdcpp/conditioner.hpp index 4005fadf7..6a51dce81 100644 --- a/otherarch/sdcpp/conditioner.hpp +++ b/otherarch/sdcpp/conditioner.hpp @@ -57,29 +57,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector readed_embeddings; FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, - std::map& tensor_types, + const String2GGMLType& tensor_types, const std::string& embd_dir, SDVersion version = VERSION_SD1, PMVersion pv = PM_VERSION_1, int clip_skip = -1) : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 
0 : 49407), embd_dir(embd_dir) { + if (sd_version_is_sd1(version)) { + text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14); + } else if (sd_version_is_sd2(version)) { + text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14); + } else if (sd_version_is_sdxl(version)) { + text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + text_model2 = std::make_shared(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + } + set_clip_skip(clip_skip); + } + + void set_clip_skip(int clip_skip) { if (clip_skip <= 0) { clip_skip = 1; if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) { clip_skip = 2; } } - if (sd_version_is_sd1(version)) { - text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip); - } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip); - } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false); - text_model2 = std::make_shared(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false); - } - } - - void set_clip_skip(int clip_skip) { text_model->set_clip_skip(clip_skip); if (sd_version_is_sdxl(version)) { text_model2->set_clip_skip(clip_skip); @@ -458,8 +459,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { if (sd_version_is_sdxl(version)) { text_model2->compute(n_threads, input_ids2, - 0, - NULL, + num_custom_embeddings, + token_embed_custom.data(), max_token_idx, false, &chunk_hidden_states2, work_ctx); @@ -469,8 +470,8 @@ struct 
FrozenCLIPEmbedderWithCustomWords : public Conditioner { if (chunk_idx == 0) { text_model2->compute(n_threads, input_ids2, - 0, - NULL, + num_custom_embeddings, + token_embed_custom.data(), max_token_idx, true, &pooled, @@ -617,7 +618,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { struct FrozenCLIPVisionEmbedder : public GGMLRunner { CLIPVisionModelProjection vision_model; - FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map& tensor_types) + FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) { vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer"); } @@ -662,18 +663,19 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr t5; SD3CLIPEmbedder(ggml_backend_t backend, - std::map& tensor_types, - int clip_skip = -1) + const String2GGMLType& tensor_types = {}, + int clip_skip = -1) : clip_g_tokenizer(0) { - if (clip_skip <= 0) { - clip_skip = 2; - } - clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false); - clip_g = std::make_shared(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false); + clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + clip_g = std::make_shared(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); + set_clip_skip(clip_skip); } void set_clip_skip(int clip_skip) { + if (clip_skip <= 0) { + clip_skip = 2; + } clip_l->set_clip_skip(clip_skip); clip_g->set_clip_skip(clip_skip); } @@ -1008,16 +1010,17 @@ struct FluxCLIPEmbedder : public Conditioner { size_t chunk_len = 256; FluxCLIPEmbedder(ggml_backend_t backend, - std::map& tensor_types, - int clip_skip = 
-1) { - if (clip_skip <= 0) { - clip_skip = 2; - } - clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true); + const String2GGMLType& tensor_types = {}, + int clip_skip = -1) { + clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); + set_clip_skip(clip_skip); } void set_clip_skip(int clip_skip) { + if (clip_skip <= 0) { + clip_skip = 2; + } clip_l->set_clip_skip(clip_skip); } @@ -1228,10 +1231,10 @@ struct PixArtCLIPEmbedder : public Conditioner { int mask_pad = 1; PixArtCLIPEmbedder(ggml_backend_t backend, - std::map& tensor_types, - int clip_skip = -1, - bool use_mask = false, - int mask_pad = 1) + const String2GGMLType& tensor_types = {}, + int clip_skip = -1, + bool use_mask = false, + int mask_pad = 1) : use_mask(use_mask), mask_pad(mask_pad) { t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); } @@ -1422,4 +1425,4 @@ struct PixArtCLIPEmbedder : public Conditioner { } }; -#endif \ No newline at end of file +#endif diff --git a/otherarch/sdcpp/control.hpp b/otherarch/sdcpp/control.hpp index 23b75feff..63fe70455 100644 --- a/otherarch/sdcpp/control.hpp +++ b/otherarch/sdcpp/control.hpp @@ -317,12 +317,23 @@ struct ControlNet : public GGMLRunner { bool guided_hint_cached = false; ControlNet(ggml_backend_t backend, - std::map& tensor_types, - SDVersion version = VERSION_SD1) + const String2GGMLType& tensor_types = {}, + SDVersion version = VERSION_SD1) : GGMLRunner(backend), control_net(version) { control_net.init(params_ctx, tensor_types, ""); } + void enable_conv2d_direct() { + std::vector blocks; + control_net.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + ~ControlNet() { 
free_control_ctx(); } diff --git a/otherarch/sdcpp/denoiser.hpp b/otherarch/sdcpp/denoiser.hpp index 307423f40..d4bcec590 100644 --- a/otherarch/sdcpp/denoiser.hpp +++ b/otherarch/sdcpp/denoiser.hpp @@ -168,24 +168,21 @@ struct AYSSchedule : SigmaSchedule { std::vector inputs; std::vector results(n + 1); - switch (version) { - case VERSION_SD2: /* fallthrough */ - LOG_WARN("AYS not designed for SD2.X models"); - case VERSION_SD1: - LOG_INFO("AYS using SD1.5 noise levels"); - inputs = noise_levels[0]; - break; - case VERSION_SDXL: - LOG_INFO("AYS using SDXL noise levels"); - inputs = noise_levels[1]; - break; - case VERSION_SVD: - LOG_INFO("AYS using SVD noise levels"); - inputs = noise_levels[2]; - break; - default: - LOG_ERROR("Version not compatable with AYS scheduler"); - return results; + if (sd_version_is_sd2((SDVersion)version)) { + LOG_WARN("AYS not designed for SD2.X models"); + } /* fallthrough */ + else if (sd_version_is_sd1((SDVersion)version)) { + LOG_INFO("AYS using SD1.5 noise levels"); + inputs = noise_levels[0]; + } else if (sd_version_is_sdxl((SDVersion)version)) { + LOG_INFO("AYS using SDXL noise levels"); + inputs = noise_levels[1]; + } else if (version == VERSION_SVD) { + LOG_INFO("AYS using SVD noise levels"); + inputs = noise_levels[2]; + } else { + LOG_ERROR("Version not compatible with AYS scheduler"); + return results; } /* Stretches those pre-calculated reference levels out to the desired @@ -346,6 +343,32 @@ struct CompVisVDenoiser : public CompVisDenoiser { } }; +struct EDMVDenoiser : public CompVisVDenoiser { + float min_sigma = 0.002; + float max_sigma = 120.0; + + EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0) + : min_sigma(min_sigma), max_sigma(max_sigma) { + schedule = std::make_shared(); + } + + float t_to_sigma(float t) { + return std::exp(t * 4 / (float)TIMESTEPS); + } + + float sigma_to_t(float s) { + return 0.25 * std::log(s); + } + + float sigma_min() { + return min_sigma; + } + + float sigma_max() { + return 
max_sigma; + } +}; + float time_snr_shift(float alpha, float t) { if (alpha == 1.0f) { return t; @@ -1019,7 +1042,7 @@ static void sample_k_diffusion(sample_method_t method, // also needed to invert the behavior of CompVisDenoiser // (k-diffusion's LMSDiscreteScheduler) float beta_start = 0.00085f; - float beta_end = 0.0120f; + float beta_end = 0.0120f; std::vector alphas_cumprod; std::vector compvis_sigmas; @@ -1030,8 +1053,9 @@ static void sample_k_diffusion(sample_method_t method, (i == 0 ? 1.0f : alphas_cumprod[i - 1]) * (1.0f - std::pow(sqrtf(beta_start) + - (sqrtf(beta_end) - sqrtf(beta_start)) * - ((float)i / (TIMESTEPS - 1)), 2)); + (sqrtf(beta_end) - sqrtf(beta_start)) * + ((float)i / (TIMESTEPS - 1)), + 2)); compvis_sigmas[i] = std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); @@ -1061,7 +1085,8 @@ static void sample_k_diffusion(sample_method_t method, // - pred_prev_sample -> "x_t-1" int timestep = roundf(TIMESTEPS - - i * ((float)TIMESTEPS / steps)) - 1; + i * ((float)TIMESTEPS / steps)) - + 1; // 1. get previous step value (=t-1) int prev_timestep = timestep - TIMESTEPS / steps; // The sigma here is chosen to cause the @@ -1086,10 +1111,9 @@ static void sample_k_diffusion(sample_method_t method, float* vec_x = (float*)x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] *= std::sqrt(sigma * sigma + 1) / - sigma; + sigma; } - } - else { + } else { // For the subsequent steps after the first one, // at this point x = latents or x = sample, and // needs to be prescaled with x <- sample / c_in @@ -1127,9 +1151,8 @@ static void sample_k_diffusion(sample_method_t method, float alpha_prod_t = alphas_cumprod[timestep]; // Note final_alpha_cumprod = alphas_cumprod[0] due to // trailing timestep spacing - float alpha_prod_t_prev = prev_timestep >= 0 ? - alphas_cumprod[prev_timestep] : alphas_cumprod[0]; - float beta_prod_t = 1 - alpha_prod_t; + float alpha_prod_t_prev = prev_timestep >= 0 ? 
alphas_cumprod[prev_timestep] : alphas_cumprod[0]; + float beta_prod_t = 1 - alpha_prod_t; // 3. compute predicted original sample from predicted // noise also called "predicted x_0" of formula (12) // from https://arxiv.org/pdf/2010.02502.pdf @@ -1145,7 +1168,7 @@ static void sample_k_diffusion(sample_method_t method, vec_pred_original_sample[j] = (vec_x[j] / std::sqrt(sigma * sigma + 1) - std::sqrt(beta_prod_t) * - vec_model_output[j]) * + vec_model_output[j]) * (1 / std::sqrt(alpha_prod_t)); } } @@ -1159,8 +1182,8 @@ static void sample_k_diffusion(sample_method_t method, // sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) * // sqrt(1 - alpha_t/alpha_t-1) float beta_prod_t_prev = 1 - alpha_prod_t_prev; - float variance = (beta_prod_t_prev / beta_prod_t) * - (1 - alpha_prod_t / alpha_prod_t_prev); + float variance = (beta_prod_t_prev / beta_prod_t) * + (1 - alpha_prod_t / alpha_prod_t_prev); float std_dev_t = eta * std::sqrt(variance); // 6. compute "direction pointing to x_t" of formula // (12) from https://arxiv.org/pdf/2010.02502.pdf @@ -1179,8 +1202,8 @@ static void sample_k_diffusion(sample_method_t method, std::pow(std_dev_t, 2)) * vec_model_output[j]; vec_x[j] = std::sqrt(alpha_prod_t_prev) * - vec_pred_original_sample[j] + - pred_sample_direction; + vec_pred_original_sample[j] + + pred_sample_direction; } } if (eta > 0) { @@ -1208,7 +1231,7 @@ static void sample_k_diffusion(sample_method_t method, // by Semi-Linear Consistency Function with Trajectory // Mapping", arXiv:2402.19159 [cs.CV] float beta_start = 0.00085f; - float beta_end = 0.0120f; + float beta_end = 0.0120f; std::vector alphas_cumprod; std::vector compvis_sigmas; @@ -1219,8 +1242,9 @@ static void sample_k_diffusion(sample_method_t method, (i == 0 ? 
1.0f : alphas_cumprod[i - 1]) * (1.0f - std::pow(sqrtf(beta_start) + - (sqrtf(beta_end) - sqrtf(beta_start)) * - ((float)i / (TIMESTEPS - 1)), 2)); + (sqrtf(beta_end) - sqrtf(beta_start)) * + ((float)i / (TIMESTEPS - 1)), + 2)); compvis_sigmas[i] = std::sqrt((1 - alphas_cumprod[i]) / alphas_cumprod[i]); @@ -1235,13 +1259,10 @@ static void sample_k_diffusion(sample_method_t method, for (int i = 0; i < steps; i++) { // Analytic form for TCD timesteps int timestep = TIMESTEPS - 1 - - (TIMESTEPS / original_steps) * - (int)floor(i * ((float)original_steps / steps)); + (TIMESTEPS / original_steps) * + (int)floor(i * ((float)original_steps / steps)); // 1. get previous step value - int prev_timestep = i >= steps - 1 ? 0 : - TIMESTEPS - 1 - (TIMESTEPS / original_steps) * - (int)floor((i + 1) * - ((float)original_steps / steps)); + int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps)); // Here timestep_s is tau_n' in Algorithm 4. The _s // notation appears to be that from C. Lu, // "DPM-Solver: A Fast ODE Solver for Diffusion @@ -1258,10 +1279,9 @@ static void sample_k_diffusion(sample_method_t method, float* vec_x = (float*)x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] *= std::sqrt(sigma * sigma + 1) / - sigma; + sigma; } - } - else { + } else { float* vec_x = (float*)x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] *= std::sqrt(sigma * sigma + 1); @@ -1294,15 +1314,14 @@ static void sample_k_diffusion(sample_method_t method, // DPM-Solver. In fact, we have alpha_{t_n} = // \sqrt{\hat{alpha_n}}, [...]" float alpha_prod_t = alphas_cumprod[timestep]; - float beta_prod_t = 1 - alpha_prod_t; + float beta_prod_t = 1 - alpha_prod_t; // Note final_alpha_cumprod = alphas_cumprod[0] since // TCD is always "trailing" - float alpha_prod_t_prev = prev_timestep >= 0 ? 
- alphas_cumprod[prev_timestep] : alphas_cumprod[0]; + float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0]; // The subscript _s are the only portion in this // section (2) unique to TCD float alpha_prod_s = alphas_cumprod[timestep_s]; - float beta_prod_s = 1 - alpha_prod_s; + float beta_prod_s = 1 - alpha_prod_s; // 3. Compute the predicted noised sample x_s based on // the model parameterization // @@ -1317,7 +1336,7 @@ static void sample_k_diffusion(sample_method_t method, vec_pred_original_sample[j] = (vec_x[j] / std::sqrt(sigma * sigma + 1) - std::sqrt(beta_prod_t) * - vec_model_output[j]) * + vec_model_output[j]) * (1 / std::sqrt(alpha_prod_t)); } } @@ -1339,9 +1358,9 @@ static void sample_k_diffusion(sample_method_t method, // pred_epsilon = model_output vec_x[j] = std::sqrt(alpha_prod_s) * - vec_pred_original_sample[j] + + vec_pred_original_sample[j] + std::sqrt(beta_prod_s) * - vec_model_output[j]; + vec_model_output[j]; } } // 4. Sample and inject noise z ~ N(0, I) for @@ -1357,7 +1376,7 @@ static void sample_k_diffusion(sample_method_t method, // In this case, x is still pred_noised_sample, // continue in-place ggml_tensor_set_f32_randn(noise, rng); - float* vec_x = (float*)x->data; + float* vec_x = (float*)x->data; float* vec_noise = (float*)noise->data; for (int j = 0; j < ggml_nelements(x); j++) { // Corresponding to (35) in Zheng et @@ -1366,10 +1385,10 @@ static void sample_k_diffusion(sample_method_t method, vec_x[j] = std::sqrt(alpha_prod_t_prev / alpha_prod_s) * - vec_x[j] + + vec_x[j] + std::sqrt(1 - alpha_prod_t_prev / - alpha_prod_s) * - vec_noise[j]; + alpha_prod_s) * + vec_noise[j]; } } } @@ -1381,4 +1400,4 @@ static void sample_k_diffusion(sample_method_t method, } } -#endif // __DENOISER_HPP__ \ No newline at end of file +#endif // __DENOISER_HPP__ diff --git a/otherarch/sdcpp/diffusion_model.hpp b/otherarch/sdcpp/diffusion_model.hpp index 65680b8d9..787a4fa79 100644 --- 
a/otherarch/sdcpp/diffusion_model.hpp +++ b/otherarch/sdcpp/diffusion_model.hpp @@ -13,7 +13,7 @@ struct DiffusionModel { struct ggml_tensor* c_concat, struct ggml_tensor* y, struct ggml_tensor* guidance, - std::vector ref_latents = {}, + std::vector ref_latents = {}, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, @@ -32,9 +32,9 @@ struct UNetModel : public DiffusionModel { UNetModelRunner unet; UNetModel(ggml_backend_t backend, - std::map& tensor_types, - SDVersion version = VERSION_SD1, - bool flash_attn = false) + const String2GGMLType& tensor_types = {}, + SDVersion version = VERSION_SD1, + bool flash_attn = false) : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) { } @@ -69,7 +69,7 @@ struct UNetModel : public DiffusionModel { struct ggml_tensor* c_concat, struct ggml_tensor* y, struct ggml_tensor* guidance, - std::vector ref_latents = {}, + std::vector ref_latents = {}, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, @@ -85,7 +85,7 @@ struct MMDiTModel : public DiffusionModel { MMDiTRunner mmdit; MMDiTModel(ggml_backend_t backend, - std::map& tensor_types) + const String2GGMLType& tensor_types = {}) : mmdit(backend, tensor_types, "model.diffusion_model") { } @@ -120,7 +120,7 @@ struct MMDiTModel : public DiffusionModel { struct ggml_tensor* c_concat, struct ggml_tensor* y, struct ggml_tensor* guidance, - std::vector ref_latents = {}, + std::vector ref_latents = {}, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, @@ -135,10 +135,10 @@ struct FluxModel : public DiffusionModel { Flux::FluxRunner flux; FluxModel(ggml_backend_t backend, - std::map& tensor_types, - SDVersion version = VERSION_FLUX, - bool flash_attn = false, - bool use_mask = false) + const String2GGMLType& tensor_types = {}, + SDVersion version = VERSION_FLUX, + bool flash_attn = false, + bool use_mask = false) : flux(backend, tensor_types, "model.diffusion_model", 
version, flash_attn, use_mask) { } @@ -173,7 +173,7 @@ struct FluxModel : public DiffusionModel { struct ggml_tensor* c_concat, struct ggml_tensor* y, struct ggml_tensor* guidance, - std::vector ref_latents = {}, + std::vector ref_latents = {}, int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f, @@ -184,4 +184,4 @@ struct FluxModel : public DiffusionModel { } }; -#endif \ No newline at end of file +#endif diff --git a/otherarch/sdcpp/esrgan.hpp b/otherarch/sdcpp/esrgan.hpp index 5fbc6c509..3e41a8871 100644 --- a/otherarch/sdcpp/esrgan.hpp +++ b/otherarch/sdcpp/esrgan.hpp @@ -130,8 +130,8 @@ public: body_feat = conv_body->forward(ctx, body_feat); feat = ggml_add(ctx, feat, body_feat); // upsample - feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST))); - feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST))); + feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST))); + feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST))); auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat))); return out; } @@ -142,11 +142,22 @@ struct ESRGAN : public GGMLRunner { int scale = 4; int tile_size = 128; // avoid cuda OOM for 4gb VRAM - ESRGAN(ggml_backend_t backend, std::map& tensor_types) + ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) : GGMLRunner(backend) { rrdb_net.init(params_ctx, tensor_types, ""); } + void enable_conv2d_direct() { + std::vector blocks; + rrdb_net.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string get_desc() { return "esrgan"; } diff --git a/otherarch/sdcpp/flux.hpp b/otherarch/sdcpp/flux.hpp index a16125102..40838f2fa 100644 --- a/otherarch/sdcpp/flux.hpp +++ 
b/otherarch/sdcpp/flux.hpp @@ -35,8 +35,8 @@ namespace Flux { int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "scale") != tensor_types.end()) ? tensor_types[prefix + "scale"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + ggml_type wtype = GGML_TYPE_F32; params["scale"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -512,7 +512,8 @@ namespace Flux { LastLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels, - bool prune_mod = false) : prune_mod(prune_mod) { + bool prune_mod = false) + : prune_mod(prune_mod) { blocks["norm_final"] = std::shared_ptr(new LayerNorm(hidden_size, 1e-06f, false)); blocks["linear"] = std::shared_ptr(new Linear(hidden_size, patch_size * patch_size * out_channels)); if (!prune_mod) { @@ -723,7 +724,7 @@ namespace Flux { auto txt_ids = gen_txt_ids(bs, context_len); auto img_ids = gen_img_ids(h, w, patch_size, bs); - auto ids = concat_ids(txt_ids, img_ids, bs); + auto ids = concat_ids(txt_ids, img_ids, bs); uint64_t curr_h_offset = 0; uint64_t curr_w_offset = 0; for (ggml_tensor* ref : ref_latents) { @@ -736,7 +737,7 @@ namespace Flux { } auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset); - ids = concat_ids(ids, ref_ids, bs); + ids = concat_ids(ids, ref_ids, bs); curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset); curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset); @@ -744,7 +745,6 @@ namespace Flux { return ids; } - // Generate positional embeddings std::vector gen_pe(int h, int w, int patch_size, int bs, int context_len, std::vector ref_latents, int theta, const std::vector& axes_dim) { std::vector> ids = gen_ids(h, w, patch_size, bs, context_len, ref_latents); @@ -872,8 +872,8 @@ namespace Flux { struct ggml_tensor* y, 
struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, - std::vector skip_layers = {}) { + struct ggml_tensor* mod_index_arange = NULL, + std::vector skip_layers = {}) { auto img_in = std::dynamic_pointer_cast(blocks["img_in"]); auto txt_in = std::dynamic_pointer_cast(blocks["txt_in"]); auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); @@ -962,7 +962,6 @@ namespace Flux { struct ggml_tensor* process_img(struct ggml_context* ctx, struct ggml_tensor* x) { - int64_t W = x->ne[0]; int64_t H = x->ne[1]; int64_t patch_size = 2; @@ -983,9 +982,9 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, + struct ggml_tensor* mod_index_arange = NULL, std::vector ref_latents = {}, - std::vector skip_layers = {}) { + std::vector skip_layers = {}) { // Forward pass of DiT. // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) // timestep: (N,) tensor of diffusion timesteps @@ -1005,7 +1004,7 @@ namespace Flux { int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - auto img = process_img(ctx, x); + auto img = process_img(ctx, x); uint64_t img_tokens = img->ne[1]; if (c_concat != NULL) { @@ -1013,7 +1012,7 @@ namespace Flux { ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); masked = process_img(ctx, masked); - mask = process_img(ctx, mask); + mask = process_img(ctx, mask); img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0); } @@ -1027,9 +1026,9 @@ namespace Flux { auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size] if (out->ne[1] > img_tokens) { - out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // 
[num_tokens, N, C * patch_size * patch_size] + out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size] out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0); - out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size] + out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size] } // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2) @@ -1040,8 +1039,6 @@ namespace Flux { }; struct FluxRunner : public GGMLRunner { - static std::map empty_tensor_types; - public: FluxParams flux_params; Flux flux; @@ -1051,11 +1048,11 @@ namespace Flux { bool use_mask = false; FluxRunner(ggml_backend_t backend, - std::map& tensor_types = empty_tensor_types, - const std::string prefix = "", - SDVersion version = VERSION_FLUX, - bool flash_attn = false, - bool use_mask = false) + const String2GGMLType& tensor_types = {}, + const std::string prefix = "", + SDVersion version = VERSION_FLUX, + bool flash_attn = false, + bool use_mask = false) : GGMLRunner(backend), use_mask(use_mask) { flux_params.flash_attn = flash_attn; flux_params.guidance_embed = false; @@ -1120,7 +1117,7 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, std::vector ref_latents = {}, - std::vector skip_layers = {}) { + std::vector skip_layers = {}) { GGML_ASSERT(x->ne[3] == 1); struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); @@ -1139,8 +1136,8 @@ namespace Flux { } // ggml_arange is not working on some backends, precompute it - mod_index_arange_vec = arange(0, 344); - mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size()); + mod_index_arange_vec = arange(0, 344); + mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size()); set_backend_tensor_data(mod_index_arange, 
mod_index_arange_vec.data()); } y = to_backend(y); @@ -1187,9 +1184,9 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, std::vector ref_latents = {}, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL, - std::vector skip_layers = std::vector()) { + struct ggml_tensor** output = NULL, + struct ggml_context* output_ctx = NULL, + std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] // context: [N, max_position, hidden_size] @@ -1277,4 +1274,4 @@ namespace Flux { } // namespace Flux -#endif // __FLUX_HPP__ \ No newline at end of file +#endif // __FLUX_HPP__ diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp index 838c271c2..9a0b1f8d3 100644 --- a/otherarch/sdcpp/ggml_extend.hpp +++ b/otherarch/sdcpp/ggml_extend.hpp @@ -40,6 +40,10 @@ #include "ggml-vulkan.h" #endif +#ifdef SD_USE_OPENCL +#include "ggml-opencl.h" +#endif + #ifdef SD_USE_SYCL #include "ggml-sycl.h" #endif @@ -53,6 +57,8 @@ #define __STATIC_INLINE__ static inline #endif +static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128"); + // n-mode trensor-matrix product // example: 2-mode product // A: [ne03, k, ne01, ne00] @@ -109,13 +115,13 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct // [ne03,ne02,ne01,ne00] x [ne13,ne12,ne11,ne10] => [ne03*ne13,ne02*ne12,ne01*ne11,ne00*ne10] __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b) { return ggml_mul(ctx, - ggml_upscale_ext(ctx, + ggml_interpolate(ctx, a, a->ne[0] * b->ne[0], a->ne[1] * b->ne[1], a->ne[2] * b->ne[2], a->ne[3] * b->ne[3], - ggml_scale_mode::GGML_SCALE_MODE_NEAREST), + GGML_SCALE_MODE_NEAREST), b); } @@ -811,6 +817,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, return x; } +__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx, + struct 
ggml_tensor* x, + struct ggml_tensor* w, + struct ggml_tensor* b, + int s0 = 1, + int s1 = 1, + int p0 = 0, + int p1 = 0, + int d0 = 1, + int d1 = 1) { + x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); + if (b != NULL) { + b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); + // b = ggml_repeat(ctx, b, x); + x = ggml_add(ctx, x, b); + } + return x; +} + // w: [OC,IC, KD, 1 * 1] // x: [N, IC, IH, IW] // b: [OC,] @@ -945,18 +970,33 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* float scale = (1.0f / sqrt((float)d_head)); + int kv_pad = 0; // if (flash_attn) { // LOG_DEBUG("attention_ext L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N); // } - // is there anything oddly shaped?? ping Green-Sky if you can trip this assert + // is there anything oddly shaped?? ping Green-Sky if you can trip this assert GGML_ASSERT(((L_k % 256 == 0) && L_q == L_k) || !(L_k % 256 == 0)); bool can_use_flash_attn = true; + can_use_flash_attn = can_use_flash_attn && (d_head == 64 || + d_head == 80 || + d_head == 96 || + d_head == 112 || + d_head == 128 || + d_head == 256); +// kcpp disable kv_pad (leejet/stable-diffusion.cpp#756) +#if 1 can_use_flash_attn = can_use_flash_attn && L_k % 256 == 0; - can_use_flash_attn = can_use_flash_attn && d_head % 64 == 0; // double check - - // cuda max d_head seems to be 256, cpu does seem to work with 512 - can_use_flash_attn = can_use_flash_attn && d_head <= 256; // double check +#else + if (can_use_flash_attn && L_k % 256 != 0) { + // TODO(Green-Sky): might be worth just padding by default + if (L_k == 77 || L_k == 4208 || L_k == 3952) { + kv_pad = GGML_PAD(L_k, 256) - L_k; + } else { + can_use_flash_attn = false; + } + } +#endif if (mask != nullptr) { // TODO(Green-Sky): figure out if we can bend t5 to work too @@ -969,11 +1009,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* ggml_tensor* kqv = nullptr; // GGML_ASSERT((flash_attn && 
can_use_flash_attn) || !flash_attn); if (can_use_flash_attn && flash_attn) { - // LOG_DEBUG("using flash attention"); + // LOG_DEBUG(" uses flash attention"); + if (kv_pad != 0) { + // LOG_DEBUG(" padding k and v dim1 by %d", kv_pad); + k = ggml_pad(ctx, k, 0, kv_pad, 0, 0); + } k = ggml_cast(ctx, k, GGML_TYPE_F16); v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3)); // [N, n_head, L_k, d_head] v = ggml_reshape_3d(ctx, v, d_head, L_k, n_head * N); // [N * n_head, L_k, d_head] + if (kv_pad != 0) { + v = ggml_pad(ctx, v, 0, kv_pad, 0, 0); + } v = ggml_cast(ctx, v, GGML_TYPE_F16); if (mask != nullptr) { @@ -1181,6 +1228,8 @@ __STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) { #define MAX_PARAMS_TENSOR_NUM 32768 #define MAX_GRAPH_SIZE 32768 +typedef std::map String2GGMLType; + struct GGMLRunner { protected: typedef std::function get_graph_cb_t; @@ -1365,13 +1414,7 @@ public: ggml_backend_cpu_set_n_threads(backend, n_threads); } -// #ifdef SD_USE_METAL -// if (ggml_backend_is_metal(backend)) { -// ggml_backend_metal_set_n_cb(backend, n_threads); -// } -// #endif ggml_backend_graph_compute(backend, gf); - #ifdef GGML_PERF ggml_graph_print(gf); #endif @@ -1398,17 +1441,25 @@ protected: GGMLBlockMap blocks; ParameterMap params; - void init_blocks(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { + ggml_type get_type(const std::string& name, const String2GGMLType& tensor_types, ggml_type default_type) { + auto iter = tensor_types.find(name); + if (iter != tensor_types.end()) { + return iter->second; + } + return default_type; + } + + void init_blocks(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { for (auto& pair : blocks) { auto& block = pair.second; block->init(ctx, tensor_types, prefix + pair.first); } } - virtual void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") {} + virtual void init_params(struct ggml_context* ctx, const 
String2GGMLType& tensor_types = {}, const std::string prefix = "") {} public: - void init(struct ggml_context* ctx, std::map& tensor_types, std::string prefix = "") { + void init(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { if (prefix.size() > 0) { prefix = prefix + "."; } @@ -1455,6 +1506,19 @@ public: tensors[prefix + pair.first] = pair.second; } } + + virtual std::string get_desc() { + return "GGMLBlock"; + } + + void get_all_blocks(std::vector& result) { + result.push_back(this); + for (auto& block_iter : blocks) { + if (block_iter.second) { + block_iter.second->get_all_blocks(result); + } + } + } }; class UnaryBlock : public GGMLBlock { @@ -1469,8 +1533,8 @@ protected: bool bias; bool force_f32; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type wtype = (tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); if (in_features % ggml_blck_size(wtype) != 0 || force_f32) { wtype = GGML_TYPE_F32; } @@ -1505,8 +1569,8 @@ class Embedding : public UnaryBlock { protected: int64_t embedding_dim; int64_t num_embeddings; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type wtype = (tensor_types.find(prefix + "weight") != tensor_types.end()) ? 
tensor_types[prefix + "weight"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") { + enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); params["weight"] = ggml_new_tensor_2d(ctx, wtype, embedding_dim, num_embeddings); } @@ -1544,12 +1608,13 @@ protected: std::pair padding; std::pair dilation; bool bias; + bool direct = false; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type wtype = GGML_TYPE_F16; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") { + enum ggml_type wtype = GGML_TYPE_F16; params["weight"] = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels); if (bias) { - enum ggml_type wtype = GGML_TYPE_F32; // (tensor_types.find(prefix + "bias") != tensor_types.end()) ? 
tensor_types[prefix + "bias"] : GGML_TYPE_F32; + enum ggml_type wtype = GGML_TYPE_F32; params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_channels); } } @@ -1570,13 +1635,25 @@ public: dilation(dilation), bias(bias) {} + void enable_direct() { + direct = true; + } + + std::string get_desc() { + return "Conv2d"; + } + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; struct ggml_tensor* b = NULL; if (bias) { b = params["bias"]; } - return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + if (direct) { + return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + } else { + return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + } } }; @@ -1590,11 +1667,11 @@ protected: int64_t dilation; bool bias; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type wtype = GGML_TYPE_F16; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") { + enum ggml_type wtype = GGML_TYPE_F16; params["weight"] = ggml_new_tensor_4d(ctx, wtype, 1, kernel_size, in_channels, out_channels); // 5d => 4d if (bias) { - enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "bias") != tensor_types.end()) ? 
tensor_types[prefix + "bias"] : GGML_TYPE_F32; + enum ggml_type wtype = GGML_TYPE_F32; params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_channels); } } @@ -1634,12 +1711,12 @@ protected: bool elementwise_affine; bool bias; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { if (elementwise_affine) { - enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.ypes.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32; + enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, normalized_shape); if (bias) { - enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.ypes.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32; + enum ggml_type wtype = GGML_TYPE_F32; params["bias"] = ggml_new_tensor_1d(ctx, wtype, normalized_shape); } } @@ -1676,10 +1753,10 @@ protected: float eps; bool affine; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { if (affine) { - enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32; - enum ggml_type bias_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "bias") != tensor_types.end()) ? 
tensor_types[prefix + "bias"] : GGML_TYPE_F32; + enum ggml_type wtype = GGML_TYPE_F32; + enum ggml_type bias_wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, num_channels); params["bias"] = ggml_new_tensor_1d(ctx, bias_wtype, num_channels); } @@ -1760,4 +1837,4 @@ public: } }; -#endif // __GGML_EXTEND__HPP__ \ No newline at end of file +#endif // __GGML_EXTEND__HPP__ diff --git a/otherarch/sdcpp/gits_noise.inl b/otherarch/sdcpp/gits_noise.inl index 5c32d8bd5..7a10ff76f 100644 --- a/otherarch/sdcpp/gits_noise.inl +++ b/otherarch/sdcpp/gits_noise.inl @@ -346,4 +346,4 @@ const std::vector>*> GITS_NOISE = { &GITS_NOISE_1_50 }; -#endif // GITS_NOISE_INL \ No newline at end of file +#endif // GITS_NOISE_INL diff --git a/otherarch/sdcpp/lora.hpp b/otherarch/sdcpp/lora.hpp index f7f46ea48..35f5aacd1 100644 --- a/otherarch/sdcpp/lora.hpp +++ b/otherarch/sdcpp/lora.hpp @@ -3,7 +3,7 @@ #include "ggml_extend.hpp" -#define LORA_GRAPH_SIZE 20480 +#define LORA_GRAPH_BASE_SIZE 10240 struct LoraModel : public GGMLRunner { enum lora_t { @@ -238,7 +238,8 @@ struct LoraModel : public GGMLRunner { } struct ggml_cgraph* build_lora_graph(std::map model_tensors, SDVersion version) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false); + size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10; + struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false); zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1); set_backend_tensor_data(zero_index, zero_index_vec.data()); @@ -291,7 +292,6 @@ struct LoraModel : public GGMLRunner { std::string hada_2_down_name = ""; std::string hada_2_up_name = ""; - hada_1_down_name = fk + ".hada_w1_b"; hada_1_up_name = fk + ".hada_w1_a"; hada_1_mid_name = fk + ".hada_t1"; @@ -843,4 +843,4 @@ struct LoraModel : public GGMLRunner { } }; -#endif // __LORA_HPP__ \ No newline at end of file +#endif // __LORA_HPP__ diff --git a/otherarch/sdcpp/main.cpp 
b/otherarch/sdcpp/main.cpp index 55e9591d3..ec04dfde3 100644 --- a/otherarch/sdcpp/main.cpp +++ b/otherarch/sdcpp/main.cpp @@ -1,73 +1,49 @@ #include #include #include +#include #include +#include #include +#include #include #include // #include "preprocessing.hpp" -#include "flux.hpp" #include "stable-diffusion.h" #define STB_IMAGE_IMPLEMENTATION +#define STB_IMAGE_STATIC #include "stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION +#define STB_IMAGE_WRITE_STATIC #include "stb_image_write.h" #define STB_IMAGE_RESIZE_IMPLEMENTATION +#define STB_IMAGE_RESIZE_STATIC #include "stb_image_resize.h" -const char* rng_type_to_str[] = { - "std_default", - "cuda", -}; - -// Names of the sampler method, same order as enum sample_method in stable-diffusion.h -const char* sample_method_str[] = { - "euler_a", - "euler", - "heun", - "dpm2", - "dpm++2s_a", - "dpm++2m", - "dpm++2mv2", - "ipndm", - "ipndm_v", - "lcm", - "ddim_trailing", - "tcd", -}; - -// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h -const char* schedule_str[] = { - "default", - "discrete", - "karras", - "exponential", - "ays", - "gits", -}; +#define SAFE_STR(s) ((s) ? (s) : "") +#define BOOL_STR(b) ((b) ? 
"true" : "false") const char* modes_str[] = { - "txt2img", - "img2img", - "img2vid", + "img_gen", + "vid_gen", "convert", }; +#define SD_ALL_MODES_STR "img_gen, vid_gen, convert" enum SDMode { - TXT2IMG, - IMG2IMG, - IMG2VID, + IMG_GEN, + VID_GEN, CONVERT, MODE_COUNT }; struct SDParams { int n_threads = -1; - SDMode mode = TXT2IMG; + SDMode mode = IMG_GEN; std::string model_path; std::string clip_l_path; std::string clip_g_path; @@ -76,30 +52,31 @@ struct SDParams { std::string vae_path; std::string taesd_path; std::string esrgan_path; - std::string controlnet_path; - std::string embeddings_path; - std::string stacked_id_embeddings_path; + std::string control_net_path; + std::string embedding_dir; + std::string stacked_id_embed_dir; std::string input_id_images_path; sd_type_t wtype = SD_TYPE_COUNT; + std::string tensor_type_rules; std::string lora_model_dir; std::string output_path = "output.png"; std::string input_path; std::string mask_path; std::string control_image_path; - - std::vector kontext_image_paths; + std::vector ref_image_paths; std::string prompt; std::string negative_prompt; - float min_cfg = 1.0f; - float cfg_scale = 7.0f; - float guidance = 3.5f; - float eta = 0.f; - float style_ratio = 20.f; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; - int batch_count = 1; + float min_cfg = 1.0f; + float cfg_scale = 7.0f; + float img_cfg_scale = INFINITY; + float guidance = 3.5f; + float eta = 0.f; + float style_ratio = 20.f; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; + int batch_count = 1; int video_frames = 6; int motion_bucket_id = 127; @@ -120,6 +97,8 @@ struct SDParams { bool clip_on_cpu = false; bool vae_on_cpu = false; bool diffusion_flash_attn = false; + bool diffusion_conv_direct = false; + bool vae_conv_direct = false; bool canny_preprocess = false; bool color = false; int upscale_repeats = 1; @@ -129,9 +108,9 @@ struct SDParams { float skip_layer_start = 
0.01f; float skip_layer_end = 0.2f; - bool chroma_use_dit_mask = true; - bool chroma_use_t5_mask = false; - int chroma_t5_mask_pad = 1; + bool chroma_use_dit_mask = true; + bool chroma_use_t5_mask = false; + int chroma_t5_mask_pad = 1; }; void print_params(SDParams params) { @@ -147,9 +126,9 @@ void print_params(SDParams params) { printf(" vae_path: %s\n", params.vae_path.c_str()); printf(" taesd_path: %s\n", params.taesd_path.c_str()); printf(" esrgan_path: %s\n", params.esrgan_path.c_str()); - printf(" controlnet_path: %s\n", params.controlnet_path.c_str()); - printf(" embeddings_path: %s\n", params.embeddings_path.c_str()); - printf(" stacked_id_embeddings_path: %s\n", params.stacked_id_embeddings_path.c_str()); + printf(" control_net_path: %s\n", params.control_net_path.c_str()); + printf(" embedding_dir: %s\n", params.embedding_dir.c_str()); + printf(" stacked_id_embed_dir: %s\n", params.stacked_id_embed_dir.c_str()); printf(" input_id_images_path: %s\n", params.input_id_images_path.c_str()); printf(" style ratio: %.2f\n", params.style_ratio); printf(" normalize input image : %s\n", params.normalize_input ? "true" : "false"); @@ -157,26 +136,33 @@ void print_params(SDParams params) { printf(" init_img: %s\n", params.input_path.c_str()); printf(" mask_img: %s\n", params.mask_path.c_str()); printf(" control_image: %s\n", params.control_image_path.c_str()); + printf(" ref_images_paths:\n"); + for (auto& path : params.ref_image_paths) { + printf(" %s\n", path.c_str()); + }; printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false"); printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false"); printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false"); printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false"); + printf(" diffusion Conv2d direct:%s\n", params.diffusion_conv_direct ? "true" : "false"); + printf(" vae Conv2d direct:%s\n", params.vae_conv_direct ? 
"true" : "false"); printf(" strength(control): %.2f\n", params.control_strength); printf(" prompt: %s\n", params.prompt.c_str()); printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); printf(" min_cfg: %.2f\n", params.min_cfg); printf(" cfg_scale: %.2f\n", params.cfg_scale); + printf(" img_cfg_scale: %.2f\n", params.img_cfg_scale); printf(" slg_scale: %.2f\n", params.slg_scale); printf(" guidance: %.2f\n", params.guidance); printf(" eta: %.2f\n", params.eta); printf(" clip_skip: %d\n", params.clip_skip); printf(" width: %d\n", params.width); printf(" height: %d\n", params.height); - printf(" sample_method: %s\n", sample_method_str[params.sample_method]); - printf(" schedule: %s\n", schedule_str[params.schedule]); + printf(" sample_method: %s\n", sd_sample_method_name(params.sample_method)); + printf(" schedule: %s\n", sd_schedule_name(params.schedule)); printf(" sample_steps: %d\n", params.sample_steps); printf(" strength(img2img): %.2f\n", params.strength); - printf(" rng: %s\n", rng_type_to_str[params.rng_type]); + printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); printf(" seed: %ld\n", params.seed); printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false"); @@ -191,14 +177,14 @@ void print_usage(int argc, const char* argv[]) { printf("\n"); printf("arguments:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n"); + printf(" -M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen\n"); printf(" -t, --threads N number of threads to use during computation (default: -1)\n"); printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n"); printf(" -m, --model [MODEL] path to full model\n"); printf(" --diffusion-model path to the standalone diffusion model\n"); printf(" --clip_l path to the clip-l text encoder\n"); printf(" --clip_g path to the clip-g text encoder\n"); - printf(" --t5xxl path to the the t5xxl text encoder\n"); + printf(" --t5xxl path to the t5xxl text encoder\n"); printf(" --vae [VAE] path to vae\n"); printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); printf(" --control-net [CONTROL_PATH] path to control net model\n"); @@ -210,15 +196,18 @@ void print_usage(int argc, const char* argv[]) { printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); printf(" If not specified, the default is the type of the weight file\n"); + printf(" --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); printf(" --lora-model-dir [DIR] lora model directory\n"); printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n"); printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); printf(" --control-image [IMAGE] path to image condition, control net\n"); + printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n"); printf(" -o, --output 
OUTPUT path to write result image to (default: ./output.png)\n"); printf(" -p, --prompt [PROMPT] the prompt to render\n"); printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); - printf(" --guidance SCALE guidance scale for img2img (default: 3.5)\n"); + printf(" --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n"); + printf(" --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)\n"); printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); @@ -227,7 +216,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n"); printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n"); printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"); - printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%%)\n"); + printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20)\n"); printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"); printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); @@ -247,420 +236,359 @@ void print_usage(int argc, const char* argv[]) { printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n"); printf(" Might lower quality, since it implies converting k and v to f16.\n"); printf(" This might crash if it is not supported by the backend.\n"); + printf(" --diffusion-conv-direct use Conv2d direct in the diffusion model"); + printf(" This might crash if it 
is not supported by the backend.\n"); + printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)"); + printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); - printf(" --color Colors the logging tags according to level\n"); + printf(" --color colors the logging tags according to level\n"); printf(" --chroma-disable-dit-mask disable dit mask for chroma\n"); printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n"); printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n"); printf(" -v, --verbose print extra info\n"); - printf(" -ki, --kontext_img [PATH] Reference image for Flux Kontext models (can be used multiple times) \n"); } -void parse_args(int argc, const char** argv, SDParams& params) { +struct StringOption { + std::string short_name; + std::string long_name; + std::string desc; + std::string* target; +}; + +struct IntOption { + std::string short_name; + std::string long_name; + std::string desc; + int* target; +}; + +struct FloatOption { + std::string short_name; + std::string long_name; + std::string desc; + float* target; +}; + +struct BoolOption { + std::string short_name; + std::string long_name; + std::string desc; + bool keep_true; + bool* target; +}; + +struct ManualOption { + std::string short_name; + std::string long_name; + std::string desc; + std::function cb; +}; + +struct ArgOptions { + std::vector string_options; + std::vector int_options; + std::vector float_options; + std::vector bool_options; + std::vector manual_options; +}; + +bool parse_options(int argc, const char** argv, ArgOptions& options) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { arg = argv[i]; - if (arg == "-t" || arg == "--threads") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.n_threads = std::stoi(argv[i]); - } else if (arg 
== "-M" || arg == "--mode") { - if (++i >= argc) { - invalid_arg = true; - break; - } - const char* mode_selected = argv[i]; - int mode_found = -1; - for (int d = 0; d < MODE_COUNT; d++) { - if (!strcmp(mode_selected, modes_str[d])) { - mode_found = d; - } - } - if (mode_found == -1) { - fprintf(stderr, - "error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n", - mode_selected); - exit(1); - } - params.mode = (SDMode)mode_found; - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.model_path = argv[i]; - } else if (arg == "--clip_l") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.clip_l_path = argv[i]; - } else if (arg == "--clip_g") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.clip_g_path = argv[i]; - } else if (arg == "--t5xxl") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.t5xxl_path = argv[i]; - } else if (arg == "--diffusion-model") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.diffusion_model_path = argv[i]; - } else if (arg == "--vae") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.vae_path = argv[i]; - } else if (arg == "--taesd") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.taesd_path = argv[i]; - } else if (arg == "--control-net") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.controlnet_path = argv[i]; - } else if (arg == "--upscale-model") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.esrgan_path = argv[i]; - } else if (arg == "--embd-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.embeddings_path = argv[i]; - } else if (arg == "--stacked-id-embd-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.stacked_id_embeddings_path = argv[i]; - } else if (arg == "--input-id-images-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - 
params.input_id_images_path = argv[i]; - } else if (arg == "--type") { - if (++i >= argc) { - invalid_arg = true; - break; - } - std::string type = argv[i]; - bool found = false; - std::string valid_types = ""; - for (size_t i = 0; i < SD_TYPE_COUNT; i++) { - auto trait = ggml_get_type_traits((ggml_type)i); - std::string name(trait->type_name); - if (name == "f32" || trait->to_float && trait->type_size) { - if (i) - valid_types += ", "; - valid_types += name; - if (type == name) { - if (ggml_quantize_requires_imatrix((ggml_type)i)) { - printf("\033[35;1m[WARNING]\033[0m: type %s requires imatrix to work properly. A dummy imatrix will be used, expect poor quality.\n", trait->type_name); - } - params.wtype = (enum sd_type_t)i; - found = true; - break; - } - } - } - if (!found) { - fprintf(stderr, "error: invalid weight format %s, must be one of [%s]\n", - type.c_str(), - valid_types.c_str()); - exit(1); - } - } else if (arg == "--lora-model-dir") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.lora_model_dir = argv[i]; - } else if (arg == "-i" || arg == "--init-img") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.input_path = argv[i]; - } else if (arg == "--mask") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.mask_path = argv[i]; - } else if (arg == "--control-image") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.control_image_path = argv[i]; - } else if (arg == "-o" || arg == "--output") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.output_path = argv[i]; - } else if (arg == "-p" || arg == "--prompt") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.prompt = argv[i]; - } else if (arg == "--upscale-repeats") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.upscale_repeats = std::stoi(argv[i]); - if (params.upscale_repeats < 1) { - fprintf(stderr, "error: upscale multiplier must be at least 1\n"); - exit(1); - } - } else if (arg == 
"-n" || arg == "--negative-prompt") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.negative_prompt = argv[i]; - } else if (arg == "--cfg-scale") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.cfg_scale = std::stof(argv[i]); - } else if (arg == "--guidance") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.guidance = std::stof(argv[i]); - } else if (arg == "--eta") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.eta = std::stof(argv[i]); - } else if (arg == "--strength") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.strength = std::stof(argv[i]); - } else if (arg == "--style-ratio") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.style_ratio = std::stof(argv[i]); - } else if (arg == "--control-strength") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.control_strength = std::stof(argv[i]); - } else if (arg == "-H" || arg == "--height") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.height = std::stoi(argv[i]); - } else if (arg == "-W" || arg == "--width") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.width = std::stoi(argv[i]); - } else if (arg == "--steps") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.sample_steps = std::stoi(argv[i]); - } else if (arg == "--clip-skip") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.clip_skip = std::stoi(argv[i]); - } else if (arg == "--vae-tiling") { - params.vae_tiling = true; - } else if (arg == "--control-net-cpu") { - params.control_net_cpu = true; - } else if (arg == "--normalize-input") { - params.normalize_input = true; - } else if (arg == "--clip-on-cpu") { - params.clip_on_cpu = true; // will slow down get_learned_condiotion but necessary for low MEM GPUs - } else if (arg == "--vae-on-cpu") { - params.vae_on_cpu = true; // will slow down latent decoding but necessary for low MEM GPUs - } else if (arg == 
"--diffusion-fa") { - params.diffusion_flash_attn = true; // can reduce MEM significantly - } else if (arg == "--canny") { - params.canny_preprocess = true; - } else if (arg == "-b" || arg == "--batch-count") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.batch_count = std::stoi(argv[i]); - } else if (arg == "--rng") { - if (++i >= argc) { - invalid_arg = true; - break; - } - std::string rng_type_str = argv[i]; - if (rng_type_str == "std_default") { - params.rng_type = STD_DEFAULT_RNG; - } else if (rng_type_str == "cuda") { - params.rng_type = CUDA_RNG; - } else { - invalid_arg = true; - break; - } - } else if (arg == "--schedule") { - if (++i >= argc) { - invalid_arg = true; - break; - } - const char* schedule_selected = argv[i]; - int schedule_found = -1; - for (int d = 0; d < N_SCHEDULES; d++) { - if (!strcmp(schedule_selected, schedule_str[d])) { - schedule_found = d; - } - } - if (schedule_found == -1) { - invalid_arg = true; - break; - } - params.schedule = (schedule_t)schedule_found; - } else if (arg == "-s" || arg == "--seed") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.seed = std::stoll(argv[i]); - } else if (arg == "--sampling-method") { - if (++i >= argc) { - invalid_arg = true; - break; - } - const char* sample_method_selected = argv[i]; - int sample_method_found = -1; - for (int m = 0; m < N_SAMPLE_METHODS; m++) { - if (!strcmp(sample_method_selected, sample_method_str[m])) { - sample_method_found = m; - } - } - if (sample_method_found == -1) { - invalid_arg = true; - break; - } - params.sample_method = (sample_method_t)sample_method_found; - } else if (arg == "-h" || arg == "--help") { - print_usage(argc, argv); - exit(0); - } else if (arg == "-v" || arg == "--verbose") { - params.verbose = true; - } else if (arg == "--color") { - params.color = true; - } else if (arg == "--slg-scale") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.slg_scale = std::stof(argv[i]); - } else if (arg == 
"--skip-layers") { - if (++i >= argc) { - invalid_arg = true; - break; - } - if (argv[i][0] != '[') { - invalid_arg = true; - break; - } - std::string layers_str = argv[i]; - while (layers_str.back() != ']') { + for (auto& option : options.string_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { if (++i >= argc) { invalid_arg = true; break; } - layers_str += " " + std::string(argv[i]); + *option.target = std::string(argv[i]); } - layers_str = layers_str.substr(1, layers_str.size() - 2); + } + if (invalid_arg) { + break; + } - std::regex regex("[, ]+"); - std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); - std::sregex_token_iterator end; - std::vector tokens(iter, end); - std::vector layers; - for (const auto& token : tokens) { - try { - layers.push_back(std::stoi(token)); - } catch (const std::invalid_argument& e) { + for (auto& option : options.int_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { + if (++i >= argc) { invalid_arg = true; break; } + *option.target = std::stoi(argv[i]); } - params.skip_layers = layers; + } + if (invalid_arg) { + break; + } - if (invalid_arg) { - break; + for (auto& option : options.float_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { + if (++i >= argc) { + invalid_arg = true; + break; + } + *option.target = std::stof(argv[i]); } - } else if (arg == "--skip-layer-start") { - if (++i >= argc) { - invalid_arg = true; - break; + } + if (invalid_arg) { + break; + } + + for (auto& option : options.bool_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { + if (option.keep_true) { + *option.target = true; + } else { + *option.target = false; + } } - 
params.skip_layer_start = std::stof(argv[i]); - } else if (arg == "--skip-layer-end") { - if (++i >= argc) { - invalid_arg = true; - break; + } + if (invalid_arg) { + break; + } + + for (auto& option : options.manual_options) { + if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { + int ret = option.cb(argc, argv, i); + if (ret < 0) { + invalid_arg = true; + break; + } + i += ret; } - params.skip_layer_end = std::stof(argv[i]); - } else if (arg == "-ki" || arg == "--kontext-img") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.kontext_image_paths.push_back(argv[i]); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - print_usage(argc, argv); - exit(1); + } + if (invalid_arg) { + break; } } if (invalid_arg) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + return false; + } + return true; +} + +void parse_args(int argc, const char** argv, SDParams& params) { + ArgOptions options; + options.string_options = { + {"-m", "--model", "", &params.model_path}, + {"", "--clip_l", "", &params.clip_l_path}, + {"", "--clip_g", "", &params.clip_g_path}, + {"", "--t5xxl", "", &params.t5xxl_path}, + {"", "--diffusion-model", "", &params.diffusion_model_path}, + {"", "--vae", "", &params.vae_path}, + {"", "--taesd", "", &params.taesd_path}, + {"", "--control-net", "", &params.control_net_path}, + {"", "--embd-dir", "", &params.embedding_dir}, + {"", "--stacked-id-embd-dir", "", &params.stacked_id_embed_dir}, + {"", "--lora-model-dir", "", &params.lora_model_dir}, + {"-i", "--init-img", "", &params.input_path}, + {"", "--tensor-type-rules", "", &params.tensor_type_rules}, + {"", "--input-id-images-dir", "", &params.input_id_images_path}, + {"", "--mask", "", &params.mask_path}, + {"", "--control-image", "", &params.control_image_path}, + {"-o", "--output", "", &params.output_path}, + {"-p", "--prompt", "", &params.prompt}, + {"-n", "--negative-prompt", "", &params.negative_prompt}, + + {"", "--upscale-model",
"", &params.esrgan_path}, + }; + + options.int_options = { + {"-t", "--threads", "", &params.n_threads}, + {"", "--upscale-repeats", "", &params.upscale_repeats}, + {"-H", "--height", "", &params.height}, + {"-W", "--width", "", &params.width}, + {"", "--steps", "", &params.sample_steps}, + {"", "--clip-skip", "", &params.clip_skip}, + {"-b", "--batch-count", "", &params.batch_count}, + {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad}, + }; + + options.float_options = { + {"", "--cfg-scale", "", &params.cfg_scale}, + {"", "--img-cfg-scale", "", &params.img_cfg_scale}, + {"", "--guidance", "", &params.guidance}, + {"", "--eta", "", &params.eta}, + {"", "--strength", "", &params.strength}, + {"", "--style-ratio", "", &params.style_ratio}, + {"", "--control-strength", "", &params.control_strength}, + {"", "--slg-scale", "", &params.slg_scale}, + {"", "--skip-layer-start", "", &params.skip_layer_start}, + {"", "--skip-layer-end", "", &params.skip_layer_end}, + + }; + + options.bool_options = { + {"", "--vae-tiling", "", true, &params.vae_tiling}, + {"", "--control-net-cpu", "", true, &params.control_net_cpu}, + {"", "--normalize-input", "", true, &params.normalize_input}, + {"", "--clip-on-cpu", "", true, &params.clip_on_cpu}, + {"", "--vae-on-cpu", "", true, &params.vae_on_cpu}, + {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn}, + {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct}, + {"", "--vae-conv-direct", "", true, &params.vae_conv_direct}, + {"", "--canny", "", true, &params.canny_preprocess}, + {"-v", "--verbose", "", true, &params.verbose}, + {"", "--color", "", true, &params.color}, + {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask}, + {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask}, + }; + + auto on_mode_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* mode = argv[index]; + if (mode != NULL) { + int mode_found = -1; + for (int i = 0; i < MODE_COUNT; i++) { + if (!strcmp(mode, modes_str[i])) { + mode_found = i; + } + } + if
(mode_found == -1) { + fprintf(stderr, + "error: invalid mode %s, must be one of [%s]\n", + mode, SD_ALL_MODES_STR); + exit(1); + } + params.mode = (SDMode)mode_found; + } + return 1; + }; + + auto on_type_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.wtype = str_to_sd_type(arg); + if (params.wtype == SD_TYPE_COUNT) { + fprintf(stderr, "error: invalid weight format %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_rng_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.rng_type = str_to_rng_type(arg); + if (params.rng_type == RNG_TYPE_COUNT) { + fprintf(stderr, "error: invalid rng type %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_schedule_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.schedule = str_to_schedule(arg); + if (params.schedule == SCHEDULE_COUNT) { + fprintf(stderr, "error: invalid schedule %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_sample_method_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.sample_method = str_to_sample_method(arg); + if (params.sample_method == SAMPLE_METHOD_COUNT) { + fprintf(stderr, "error: invalid sample method %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_seed_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + params.seed = std::stoll(argv[index]); + return 1; + }; + + auto on_help_arg = [&](int argc, const char** argv, int index) { + print_usage(argc, argv); + exit(0); + return 0; + }; + + auto on_skip_layers_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string layers_str = argv[index]; + if (layers_str[0] != '[' || 
layers_str[layers_str.size() - 1] != ']') { + return -1; + } + + layers_str = layers_str.substr(1, layers_str.size() - 2); + + std::regex regex("[, ]+"); + std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); + std::sregex_token_iterator end; + std::vector<std::string> tokens(iter, end); + std::vector<int> layers; + for (const auto& token : tokens) { + try { + layers.push_back(std::stoi(token)); + } catch (const std::invalid_argument& e) { + return -1; + } + } + params.skip_layers = layers; + return 1; + }; + + auto on_ref_image_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + params.ref_image_paths.push_back(argv[index]); + return 1; + }; + + options.manual_options = { + {"-M", "--mode", "", on_mode_arg}, + {"", "--type", "", on_type_arg}, + {"", "--rng", "", on_rng_arg}, + {"-s", "--seed", "", on_seed_arg}, + {"", "--sampling-method", "", on_sample_method_arg}, + {"", "--schedule", "", on_schedule_arg}, + {"", "--skip-layers", "", on_skip_layers_arg}, + {"-r", "--ref-image", "", on_ref_image_arg}, + {"-h", "--help", "", on_help_arg}, + }; + + if (!parse_options(argc, argv, options)) { print_usage(argc, argv); exit(1); } + if (params.n_threads <= 0) { - params.n_threads = sd_get_num_physical_cores(); + params.n_threads = get_num_physical_cores(); } - if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) { + if (params.mode != CONVERT && params.mode != VID_GEN && params.prompt.length() == 0) { fprintf(stderr, "error: the following arguments are required: prompt\n"); print_usage(argc, argv); exit(1); @@ -672,25 +600,19 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } - if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) { - fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n"); - print_usage(argc, argv); - exit(1); - } - if (params.output_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: output_path\n"); print_usage(argc, argv); exit(1); } - if (params.width <= 0 || params.width % 64 != 0) { - fprintf(stderr, "error: the width must be a multiple of 64\n"); + if (params.width <= 0) { + fprintf(stderr, "error: the width must be greater than 0\n"); exit(1); } - if (params.height <= 0 || params.height % 64 != 0) { - fprintf(stderr, "error: the height must be a multiple of 64\n"); + if (params.height <= 0) { + fprintf(stderr, "error: the height must be greater than 0\n"); exit(1); } @@ -704,6 +626,15 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } + if (params.mode != CONVERT && params.tensor_type_rules.size() > 0) { + fprintf(stderr, "warning: --tensor-type-rules is currently supported only for conversion\n"); + } + + if (params.upscale_repeats < 1) { + fprintf(stderr, "error: upscale multiplier must be at least 1\n"); + exit(1); + } + if (params.seed < 0) { srand((int)time(NULL)); params.seed = rand(); @@ -714,6 +645,10 @@ void parse_args(int argc, const char** argv, SDParams& params) { params.output_path = "output.gguf"; } } + + if (!isfinite(params.img_cfg_scale)) { + params.img_cfg_scale = params.cfg_scale; + } } static std::string sd_basename(const std::string& path) { @@ -750,12 +685,26 @@ std::string get_image_params(SDParams params, int64_t seed) { parameter_string += "Seed: " + std::to_string(seed) + ", "; parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", "; parameter_string += "Model: " + sd_basename(params.model_path) + ", "; - parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", "; - parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]); - if (params.schedule == KARRAS) { - parameter_string += " karras"; + parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", "; + parameter_string += "Sampler: " + 
std::string(sd_sample_method_name(params.sample_method)); + if (params.schedule != DEFAULT) { + parameter_string += " " + std::string(sd_schedule_name(params.schedule)); } parameter_string += ", "; + for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path}) { + if (!te.empty()) { + parameter_string += "TE: " + sd_basename(te) + ", "; + } + } + if (!params.diffusion_model_path.empty()) { + parameter_string += "Unet: " + sd_basename(params.diffusion_model_path) + ", "; + } + if (!params.vae_path.empty()) { + parameter_string += "VAE: " + sd_basename(params.vae_path) + ", "; + } + if (params.clip_skip != -1) { + parameter_string += "Clip skip: " + std::to_string(params.clip_skip) + ", "; + } parameter_string += "Version: stable-diffusion.cpp"; return parameter_string; } @@ -808,6 +757,18 @@ int main(int argc, const char* argv[]) { parse_args(argc, argv, params); + sd_guidance_params_t guidance_params = {params.cfg_scale, + params.img_cfg_scale, + params.min_cfg, + params.guidance, + { + params.skip_layers.data(), + params.skip_layers.size(), + params.skip_layer_start, + params.skip_layer_end, + params.slg_scale, + }}; + sd_set_log_callback(sd_log_cb, (void*)&params); if (params.verbose) { @@ -816,7 +777,7 @@ int main(int argc, const char* argv[]) { } if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype); + bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_type_rules.c_str()); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", @@ -833,49 +794,18 @@ int main(int argc, const char* argv[]) { } } - if (params.mode == IMG2VID) { + if (params.mode == VID_GEN) { fprintf(stderr, "SVD support is broken, do not use it!!!\n"); return 1; } - bool vae_decode_only = true; - - std::vector<sd_image_t> kontext_imgs; - for (auto& path : params.kontext_image_paths) { - vae_decode_only =
false; - int c = 0; - int width = 0; - int height = 0; - uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3); - if (image_buffer == NULL) { - fprintf(stderr, "load image from '%s' failed\n", path.c_str()); - return 1; - } - if (c < 3) { - fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c); - free(image_buffer); - return 1; - } - if (width <= 0) { - fprintf(stderr, "error: the width of image must be greater than 0\n"); - free(image_buffer); - return 1; - } - if (height <= 0) { - fprintf(stderr, "error: the height of image must be greater than 0\n"); - free(image_buffer); - return 1; - } - kontext_imgs.push_back({(uint32_t)width, - (uint32_t)height, - 3, - image_buffer}); - } + bool vae_decode_only = true; uint8_t* input_image_buffer = NULL; uint8_t* control_image_buffer = NULL; uint8_t* mask_image_buffer = NULL; + std::vector<sd_image_t> ref_images; - if (params.mode == IMG2IMG || params.mode == IMG2VID) { + if (params.input_path.size() > 0) { vae_decode_only = false; int c = 0; @@ -925,41 +855,83 @@ int main(int argc, const char* argv[]) { free(input_image_buffer); input_image_buffer = resized_image_buffer; } + } else if (params.ref_image_paths.size() > 0) { + vae_decode_only = false; + for (auto& path : params.ref_image_paths) { + int c = 0; + int width = 0; + int height = 0; + uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3); + if (image_buffer == NULL) { + fprintf(stderr, "load image from '%s' failed\n", path.c_str()); + return 1; + } + if (c < 3) { + fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c); + free(image_buffer); + return 1; + } + if (width <= 0) { + fprintf(stderr, "error: the width of image must be greater than 0\n"); + free(image_buffer); + return 1; + } + if (height <= 0) { + fprintf(stderr, "error: the height of image must be greater than 0\n"); + free(image_buffer); + return 1; + }
ref_images.push_back({(uint32_t)width, + (uint32_t)height, + 3, + image_buffer}); + } } - sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(), - params.clip_l_path.c_str(), - params.clip_g_path.c_str(), - params.t5xxl_path.c_str(), - params.diffusion_model_path.c_str(), - params.vae_path.c_str(), - params.taesd_path.c_str(), - params.controlnet_path.c_str(), - params.lora_model_dir.c_str(), - params.embeddings_path.c_str(), - params.stacked_id_embeddings_path.c_str(), - vae_decode_only, - params.vae_tiling, - true, - params.n_threads, - params.wtype, - params.rng_type, - params.schedule, - params.clip_on_cpu, - params.control_net_cpu, - params.vae_on_cpu, - params.diffusion_flash_attn, - params.chroma_use_dit_mask, - params.chroma_use_t5_mask, - params.chroma_t5_mask_pad); + sd_ctx_params_t sd_ctx_params = { + params.model_path.c_str(), + params.clip_l_path.c_str(), + params.clip_g_path.c_str(), + params.t5xxl_path.c_str(), + params.diffusion_model_path.c_str(), + params.vae_path.c_str(), + params.taesd_path.c_str(), + params.control_net_path.c_str(), + params.lora_model_dir.c_str(), + params.embedding_dir.c_str(), + params.stacked_id_embed_dir.c_str(), + vae_decode_only, + params.vae_tiling, + true, + params.n_threads, + params.wtype, + params.rng_type, + params.schedule, + params.clip_on_cpu, + params.control_net_cpu, + params.vae_on_cpu, + params.diffusion_flash_attn, + params.diffusion_conv_direct, + params.vae_conv_direct, + params.chroma_use_dit_mask, + params.chroma_use_t5_mask, + params.chroma_t5_mask_pad, + }; + + sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); return 1; } + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, + 3, + input_image_buffer}; + sd_image_t* control_image = NULL; - if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) { + if (params.control_net_path.size() > 0 && params.control_image_path.size() > 0) { int c = 0; 
control_image_buffer = stbi_load(params.control_image_path.c_str(), &params.width, &params.height, &c, 3); if (control_image_buffer == NULL) { @@ -995,104 +967,52 @@ int main(int argc, const char* argv[]) { mask_image_buffer}; sd_image_t* results; - if (params.mode == TXT2IMG) { - results = txt2img(sd_ctx, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - params.cfg_scale, - params.guidance, - params.eta, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.seed, - params.batch_count, - control_image, - params.control_strength, - params.style_ratio, - params.normalize_input, - params.input_id_images_path.c_str(), - kontext_imgs.data(), kontext_imgs.size(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end, - std::vector()); - } else { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, - 3, - input_image_buffer}; + int expected_num_results = 1; + if (params.mode == IMG_GEN) { + sd_img_gen_params_t img_gen_params = { + params.prompt.c_str(), + params.negative_prompt.c_str(), + params.clip_skip, + guidance_params, + input_image, + ref_images.data(), + (int)ref_images.size(), + mask_image, + params.width, + params.height, + params.sample_method, + params.sample_steps, + params.eta, + params.strength, + params.seed, + params.batch_count, + control_image, + params.control_strength, + params.style_ratio, + params.normalize_input, + params.input_id_images_path.c_str(), + }; - if (params.mode == IMG2VID) { - results = img2vid(sd_ctx, - input_image, - params.width, - params.height, - params.video_frames, - params.motion_bucket_id, - params.fps, - params.augmentation_level, - params.min_cfg, - params.cfg_scale, - params.sample_method, - params.sample_steps, - params.strength, - params.seed); - if (results == NULL) { - printf("generate failed\n"); - free_sd_ctx(sd_ctx); - return 1; - } - size_t last =
params.output_path.find_last_of("."); - std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path; - for (int i = 0; i < params.video_frames; i++) { - if (results[i].data == NULL) { - continue; - } - std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; - stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 0, get_image_params(params, params.seed + i).c_str()); - printf("save result image to '%s'\n", final_image_path.c_str()); - free(results[i].data); - results[i].data = NULL; - } - free(results); - free_sd_ctx(sd_ctx); - return 0; - } else { - results = img2img(sd_ctx, - input_image, - mask_image, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - params.cfg_scale, - params.guidance, - params.eta, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.strength, - params.seed, - params.batch_count, - control_image, - params.control_strength, - params.style_ratio, - params.normalize_input, - params.input_id_images_path.c_str(), - kontext_imgs.data(), kontext_imgs.size(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end, - std::vector()); - } + results = generate_image(sd_ctx, &img_gen_params); + expected_num_results = params.batch_count; + } else if (params.mode == VID_GEN) { + sd_vid_gen_params_t vid_gen_params = { + input_image, + params.width, + params.height, + guidance_params, + params.sample_method, + params.sample_steps, + params.strength, + params.seed, + params.video_frames, + params.motion_bucket_id, + params.fps, + params.augmentation_level, + }; + + results = generate_video(sd_ctx, &vid_gen_params); + expected_num_results = params.video_frames; } if (results == NULL) { @@ -1104,7 +1024,8 @@ int main(int argc, const char* argv[]) { int 
upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), - params.n_threads); + params.n_threads, + params.diffusion_conv_direct); if (upscaler_ctx == NULL) { printf("new_upscaler_ctx failed\n"); @@ -1149,14 +1070,14 @@ int main(int argc, const char* argv[]) { dummy_name += ext; ext = ".png"; } - for (int i = 0; i < params.batch_count; i++) { + for (int i = 0; i < expected_num_results; i++) { if (results[i].data == NULL) { continue; } std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext; if (is_jpg) { stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 90); + results[i].data, 90, get_image_params(params, params.seed + i).c_str()); printf("save result JPEG image to '%s'\n", final_image_path.c_str()); } else { stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, @@ -1172,4 +1093,4 @@ int main(int argc, const char* argv[]) { free(input_image_buffer); return 0; -} \ No newline at end of file +} diff --git a/otherarch/sdcpp/mmdit.hpp b/otherarch/sdcpp/mmdit.hpp index dee7b1c49..a93a35dfd 100644 --- a/otherarch/sdcpp/mmdit.hpp +++ b/otherarch/sdcpp/mmdit.hpp @@ -147,8 +147,8 @@ protected: int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, std::map& tensor_types, std::string prefix = "") { - enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? 
tensor_types[prefix + "weight"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { + enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -652,13 +652,13 @@ protected: int64_t hidden_size; std::string qk_norm; - void init_params(struct ggml_context* ctx, std::map& tensor_types, std::string prefix = "") { - enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "pos_embed") != tensor_types.end()) ? tensor_types[prefix + "pos_embed"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") { + enum ggml_type wtype = GGML_TYPE_F32; params["pos_embed"] = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1); } public: - MMDiT(std::map& tensor_types) { + MMDiT(const String2GGMLType& tensor_types = {}) { // input_size is always None // learn_sigma is always False // register_length is alwalys 0 @@ -869,11 +869,9 @@ public: struct MMDiTRunner : public GGMLRunner { MMDiT mmdit; - static std::map empty_tensor_types; - MMDiTRunner(ggml_backend_t backend, - std::map& tensor_types = empty_tensor_types, - const std::string prefix = "") + const String2GGMLType& tensor_types = {}, + const std::string prefix = "") : GGMLRunner(backend), mmdit(tensor_types) { mmdit.init(params_ctx, tensor_types, prefix); } diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp index 80af64f8d..d18fd914c 100644 --- a/otherarch/sdcpp/model.cpp +++ b/otherarch/sdcpp/model.cpp @@ -16,7 +16,6 @@ #include "ggml-backend.h" #include "ggml-cpu.h" #include "ggml.h" -#include "gguf.h" #include "stable-diffusion.h" @@ -28,6 +27,10 @@ #include "ggml-vulkan.h" #endif +#ifdef SD_USE_OPENCL +#include "ggml-opencl.h" +#endif + #define ST_HEADER_SIZE_LEN 8 static std::string format(const char* fmt, ...) 
{ @@ -111,6 +114,7 @@ const char* unused_tensors[] = { "model_ema.diffusion_model", "embedding_manager", "denoiser.sigmas", + "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training }; bool is_unused_tensor(std::string name) { @@ -192,7 +196,7 @@ std::unordered_map pmid_v2_name_map = { std::string convert_open_clip_to_hf_clip(const std::string& name) { std::string new_name = name; std::string prefix; - if (contains(new_name, ".enc.")) { + if (contains(new_name, ".enc.")) { // llama.cpp naming convention for T5 size_t pos = new_name.find(".enc."); if (pos != std::string::npos) { @@ -348,6 +352,10 @@ std::unordered_map> su {"to_v", "v"}, {"to_out_0", "proj_out"}, {"group_norm", "norm"}, + {"key", "k"}, + {"query", "q"}, + {"value", "v"}, + {"proj_attn", "proj_out"}, }, }, { @@ -372,6 +380,10 @@ std::unordered_map> su {"to_v", "v"}, {"to_out.0", "proj_out"}, {"group_norm", "norm"}, + {"key", "k"}, + {"query", "q"}, + {"value", "v"}, + {"proj_attn", "proj_out"}, }, }, { @@ -443,6 +455,10 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) { return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1]; } + if (match(m, std::regex(format("unet%cadd_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) { + return format("model%cdiffusion_model%clabel_emb%c0%c", seq, seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1]; + } + if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[1], m[3]); // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str()); @@ -480,6 +496,19 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) { return format("cond_stage_model%ctransformer%ctext_model", seq, seq) + m[0]; } + // clip-g + if (match(m, 
std::regex(format("te%c1%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) { + return format("cond_stage_model%c1%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq, seq) + m[0] + seq + m[1]; + } + + if (match(m, std::regex(format("te%c1%ctext_model(.*)", seq, seq)), key)) { + return format("cond_stage_model%c1%ctransformer%ctext_model", seq, seq, seq) + m[0]; + } + + if (match(m, std::regex(format("te%c1%ctext_projection", seq, seq)), key)) { + return format("cond_stage_model%c1%ctransformer%ctext_model%ctext_projection", seq, seq, seq, seq); + } + // vae if (match(m, std::regex(format("vae%c(.*)%cconv_norm_out(.*)", seq, seq)), key)) { return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str()); @@ -616,6 +645,8 @@ std::string convert_tensor_name(std::string name) { std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.'); if (new_key.empty()) { new_name = name; + } else if (new_key == "cond_stage_model.1.transformer.text_model.text_projection") { + new_name = new_key; } else { new_name = new_key + "." 
+ network_part; } @@ -631,7 +662,7 @@ std::string convert_tensor_name(std::string name) { return new_name; } -void add_preprocess_tensor_storage_types(std::map& tensor_storages_types, std::string name, enum ggml_type type) { +void add_preprocess_tensor_storage_types(String2GGMLType& tensor_storages_types, std::string name, enum ggml_type type) { std::string new_name = convert_tensor_name(name); if (new_name.find("cond_stage_model") != std::string::npos && ends_with(new_name, "attn.in_proj_weight")) { @@ -798,6 +829,7 @@ void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) { dst[i] = f8_e4m3_to_f16(src[i]); } } + void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) { // support inplace op for (int64_t i = n - 1; i >= 0; i--) { @@ -805,6 +837,20 @@ void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) { } } +void f64_to_f32_vec(double* src, float* dst, int64_t n) { + // support inplace op + for (int64_t i = 0; i < n; i++) { + dst[i] = (float)src[i]; + } +} + +void i64_to_i32_vec(int64_t* src, int32_t* dst, int64_t n) { + // support inplace op + for (int64_t i = 0; i < n; i++) { + dst[i] = (int32_t)src[i]; + } +} + void convert_tensor(void* src, ggml_type src_type, void* dst, @@ -1050,10 +1096,14 @@ ggml_type str_to_ggml_type(const std::string& dtype) { ttype = GGML_TYPE_F32; } else if (dtype == "F32") { ttype = GGML_TYPE_F32; + } else if (dtype == "F64") { + ttype = GGML_TYPE_F32; } else if (dtype == "F8_E4M3") { ttype = GGML_TYPE_F16; } else if (dtype == "F8_E5M2") { ttype = GGML_TYPE_F16; + } else if (dtype == "I64") { + ttype = GGML_TYPE_I32; } return ttype; } @@ -1071,6 +1121,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const std::ifstream file(fpath, std::ios::binary); if (!file.is_open()) { LOG_ERROR("failed to open '%s'", file_path.c_str()); + file_paths_.pop_back(); return false; } @@ -1082,6 +1133,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const // read 
header size if (file_size_ <= ST_HEADER_SIZE_LEN) { LOG_ERROR("invalid safetensor file '%s'", file_path.c_str()); + file_paths_.pop_back(); return false; } @@ -1095,6 +1147,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const size_t header_size_ = read_u64(header_size_buf); if (header_size_ >= file_size_) { LOG_ERROR("invalid safetensor file '%s'", file_path.c_str()); + file_paths_.pop_back(); return false; } @@ -1105,6 +1158,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const file.read(header_buf.data(), header_size_); if (!file) { LOG_ERROR("read safetensors header failed: '%s'", file_path.c_str()); + file_paths_.pop_back(); return false; } @@ -1176,6 +1230,14 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const tensor_storage.is_f8_e5m2 = true; // f8 -> f16 GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2); + } else if (dtype == "F64") { + tensor_storage.is_f64 = true; + // f64 -> f32 + GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size); + } else if (dtype == "I64") { + tensor_storage.is_i64 = true; + // i64 -> i32 + GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size); } else { GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size); } @@ -1192,18 +1254,45 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const /*================================================= DiffusersModelLoader ==================================================*/ bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const std::string& prefix) { - std::string unet_path = path_join(file_path, "unet/diffusion_pytorch_model.safetensors"); - std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); - std::string clip_path = path_join(file_path, "text_encoder/model.safetensors"); + std::string unet_path = path_join(file_path, "unet/diffusion_pytorch_model.safetensors"); + std::string vae_path = 
path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); + std::string clip_path = path_join(file_path, "text_encoder/model.safetensors"); + std::string clip_g_path = path_join(file_path, "text_encoder_2/model.safetensors"); if (!init_from_safetensors_file(unet_path, "unet.")) { return false; } + for (auto ts : tensor_storages) { + if (ts.name.find("add_embedding") != std::string::npos || ts.name.find("label_emb") != std::string::npos) { + // probably SDXL + LOG_DEBUG("Fixing name for SDXL output blocks.2.2"); + for (auto& tensor_storage : tensor_storages) { + int len = 34; + auto pos = tensor_storage.name.find("unet.up_blocks.0.upsamplers.0.conv"); + if (pos == std::string::npos) { + len = 44; + pos = tensor_storage.name.find("model.diffusion_model.output_blocks.2.1.conv"); + } + if (pos != std::string::npos) { + tensor_storage.name = "model.diffusion_model.output_blocks.2.2.conv" + tensor_storage.name.substr(len); + LOG_DEBUG("NEW NAME: %s", tensor_storage.name.c_str()); + add_preprocess_tensor_storage_types(tensor_storages_types, tensor_storage.name, tensor_storage.type); + } + } + break; + } + } + if (!init_from_safetensors_file(vae_path, "vae.")) { - return false; + LOG_WARN("Couldn't find working VAE in %s", file_path.c_str()); + // return false; } if (!init_from_safetensors_file(clip_path, "te.")) { - return false; + LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str()); + // return false; + } + if (!init_from_safetensors_file(clip_g_path, "te.1.")) { + LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str()); } return true; } @@ -1566,6 +1655,15 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s return true; } +bool ModelLoader::model_is_unet() { + for (auto& tensor_storage : tensor_storages) { + if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) { + return true; + } + } + return false; +} + bool ModelLoader::has_diffusion_model_tensors() 
{ for (auto& tensor_storage : tensor_storages) { @@ -1598,7 +1696,7 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) { return VERSION_SD3; } - if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) { + if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos || tensor_storage.name.find("unet.down_blocks.") != std::string::npos) { is_unet = true; if (has_multiple_encoders) { is_xl = true; @@ -1607,7 +1705,7 @@ SDVersion ModelLoader::get_sd_version() { } } } - if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) { + if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos || tensor_storage.name.find("te.1") != std::string::npos) { has_multiple_encoders = true; if (is_unet) { is_xl = true; @@ -1629,7 +1727,7 @@ SDVersion ModelLoader::get_sd_version() { token_embedding_weight = tensor_storage; // break; } - if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight") { + if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight" || tensor_storage.name == "unet.conv_in.weight") { input_block_weight = tensor_storage; input_block_checked = true; if (found_family) { @@ -1638,10 +1736,14 @@ SDVersion ModelLoader::get_sd_version() { } } bool is_inpaint = input_block_weight.ne[2] == 9; + bool is_ip2p = input_block_weight.ne[2] == 8; if (is_xl) { if (is_inpaint) { return VERSION_SDXL_INPAINT; } + if (is_ip2p) { + return VERSION_SDXL_PIX2PIX; + } return VERSION_SDXL; } @@ -1657,6 +1759,9 @@ SDVersion ModelLoader::get_sd_version() { if (is_inpaint) { return VERSION_SD1_INPAINT; } + if 
(is_ip2p) { + return VERSION_SD1_PIX2PIX; + } return VERSION_SD1; } else if (token_embedding_weight.ne[0] == 1024) { if (is_inpaint) { @@ -1714,7 +1819,7 @@ ggml_type ModelLoader::get_diffusion_model_wtype() { continue; } - if (tensor_storage.name.find("model.diffusion_model.") == std::string::npos) { + if (tensor_storage.name.find("model.diffusion_model.") == std::string::npos && tensor_storage.name.find("unet.") == std::string::npos) { continue; } @@ -1883,6 +1988,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend }; int tensor_count = 0; int64_t t1 = ggml_time_ms(); + bool partial = false; for (auto& tensor_storage : processed_tensor_storages) { if (tensor_storage.file_index != file_index) { ++tensor_count; @@ -1907,7 +2013,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend // for the CPU and Metal backend, we can copy directly into the tensor if (tensor_storage.type == dst_tensor->type) { GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); - read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read); + if (tensor_storage.is_f64 || tensor_storage.is_i64) { + read_buffer.resize(tensor_storage.nbytes_to_read()); + read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); + } else { + read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read); + } if (tensor_storage.is_bf16) { // inplace op @@ -1918,9 +2029,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } else if (tensor_storage.is_f8_e5m2) { // inplace op f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { + f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements()); } } else { - 
read_buffer.resize(tensor_storage.nbytes()); + read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { @@ -1932,13 +2047,19 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } else if (tensor_storage.is_f8_e5m2) { // inplace op f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { + // inplace op + f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { + // inplace op + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); } convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); } } else { - read_buffer.resize(tensor_storage.nbytes()); + read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { @@ -1950,6 +2071,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } else if (tensor_storage.is_f8_e5m2) { // inplace op f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { + // inplace op + f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { + // inplace op + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); } if (tensor_storage.type == dst_tensor->type) { @@ -1964,20 +2091,26 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend 
ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); } } - int64_t t2 = ggml_time_ms(); + size_t tensor_max = processed_tensor_storages.size(); + int64_t t2 = ggml_time_ms(); + // kcpp throttle progress printing ++tensor_count; - if(tensor_count<2 || tensor_count%5==0 || (tensor_count+10) > processed_tensor_storages.size()) + if(tensor_count<2 || tensor_count%5==0 || (tensor_count+10) > tensor_max) { - //throttle progress printing - pretty_progress(tensor_count, processed_tensor_storages.size(), (t2 - t1) / 1000.0f); + pretty_progress(tensor_count, tensor_max, (t2 - t1) / 1000.0f); } - t1 = t2; + t1 = t2; + partial = tensor_count != tensor_max; } if (zip != NULL) { zip_close(zip); } + if (partial) { + printf("\n"); + } + if (!success) { break; } @@ -2055,6 +2188,41 @@ bool ModelLoader::load_tensors(std::map& tenso return true; } +std::vector> parse_tensor_type_rules(const std::string& tensor_type_rules) { + std::vector> result; + for (const auto& item : splitString(tensor_type_rules, ',')) { + if (item.size() == 0) + continue; + std::string::size_type pos = item.find('='); + if (pos == std::string::npos) { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + continue; + } + std::string tensor_pattern = item.substr(0, pos); + std::string type_name = item.substr(pos + 1); + + ggml_type tensor_type = GGML_TYPE_COUNT; + + if (type_name == "f32") { + tensor_type = GGML_TYPE_F32; + } else { + for (size_t i = 0; i < SD_TYPE_COUNT; i++) { + auto trait = ggml_get_type_traits((ggml_type)i); + if (trait->to_float && trait->type_size && type_name == trait->type_name) { + tensor_type = (ggml_type)i; + } + } + } + + if (tensor_type != GGML_TYPE_COUNT) { + result.emplace_back(tensor_pattern, tensor_type); + } else { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + } + } + return result; +} + bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) { const std::string& 
name = tensor_storage.name; if (type != GGML_TYPE_COUNT) { @@ -2086,7 +2254,7 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage return false; } -bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) { +bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules_str) { auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); @@ -2096,12 +2264,23 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type gguf_context* gguf_ctx = gguf_init_empty(); + auto tensor_type_rules = parse_tensor_type_rules(tensor_type_rules_str); + auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { const std::string& name = tensor_storage.name; + ggml_type tensor_type = tensor_storage.type; + ggml_type dst_type = type; - ggml_type tensor_type = tensor_storage.type; - if (tensor_should_be_converted(tensor_storage, type)) { - tensor_type = type; + for (const auto& tensor_type_rule : tensor_type_rules) { + std::regex pattern(tensor_type_rule.first); + if (std::regex_search(name, pattern)) { + dst_type = tensor_type_rule.second; + break; + } + } + + if (tensor_should_be_converted(tensor_storage, dst_type)) { + tensor_type = dst_type; } ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); @@ -2160,7 +2339,7 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type) return mem_size; } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { +bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, const char* tensor_type_rules) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -2174,6 +2353,6 @@ bool 
convert(const char* input_path, const char* vae_path, const char* output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); + bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules); return success; -} \ No newline at end of file +} diff --git a/otherarch/sdcpp/model.h b/otherarch/sdcpp/model.h index 9da9eb416..1c1468fc1 100644 --- a/otherarch/sdcpp/model.h +++ b/otherarch/sdcpp/model.h @@ -12,19 +12,21 @@ #include "ggml-backend.h" #include "ggml.h" +#include "gguf.h" #include #include "zip.h" -#include "gguf.h" #define SD_MAX_DIMS 5 enum SDVersion { VERSION_SD1, VERSION_SD1_INPAINT, + VERSION_SD1_PIX2PIX, VERSION_SD2, VERSION_SD2_INPAINT, VERSION_SDXL, VERSION_SDXL_INPAINT, + VERSION_SDXL_PIX2PIX, VERSION_SVD, VERSION_SD3, VERSION_FLUX, @@ -47,7 +49,7 @@ static inline bool sd_version_is_sd3(SDVersion version) { } static inline bool sd_version_is_sd1(SDVersion version) { - if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) { + if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) { return true; } return false; @@ -61,7 +63,7 @@ static inline bool sd_version_is_sd2(SDVersion version) { } static inline bool sd_version_is_sdxl(SDVersion version) { - if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) { + if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) { return true; } return false; @@ -81,6 +83,14 @@ static inline bool sd_version_is_dit(SDVersion version) { return false; } +static inline bool sd_version_is_unet_edit(SDVersion version) { + return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX; +} + +static bool sd_version_is_inpaint_or_unet_edit(SDVersion version) { + return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version); +} + enum PMVersion { PM_VERSION_1, PM_VERSION_2, @@ -92,6 +102,8 @@ struct TensorStorage { bool 
is_bf16 = false; bool is_f8_e4m3 = false; bool is_f8_e5m2 = false; + bool is_f64 = false; + bool is_i64 = false; int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1}; int n_dims = 0; @@ -123,6 +135,8 @@ struct TensorStorage { int64_t nbytes_to_read() const { if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) { return nbytes() / 2; + } else if (is_f64 || is_i64) { + return nbytes() * 2; } else { return nbytes(); } @@ -173,6 +187,10 @@ struct TensorStorage { type_name = "f8_e4m3"; } else if (is_f8_e5m2) { type_name = "f8_e5m2"; + } else if (is_f64) { + type_name = "f64"; + } else if (is_i64) { + type_name = "i64"; } ss << name << " | " << type_name << " | "; ss << n_dims << " ["; @@ -189,6 +207,8 @@ struct TensorStorage { typedef std::function on_new_tensor_cb_t; +typedef std::map String2GGMLType; + class ModelLoader { protected: std::vector file_paths_; @@ -207,10 +227,11 @@ protected: bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = ""); public: - std::map tensor_storages_types; + String2GGMLType tensor_storages_types; bool init_from_file(const std::string& file_path, const std::string& prefix = ""); bool has_diffusion_model_tensors(); + bool model_is_unet(); SDVersion get_sd_version(); ggml_type get_sd_wtype(); ggml_type get_conditioner_wtype(); @@ -222,7 +243,7 @@ public: ggml_backend_t backend, std::set ignore_tensors = {}); - bool save_to_gguf_file(const std::string& file_path, ggml_type type); + bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; @@ -231,4 +252,4 @@ public: static std::string load_t5_tokenizer_json(); }; -#endif // __MODEL_H__ \ No newline at end of file +#endif // __MODEL_H__ diff --git a/otherarch/sdcpp/pmid.hpp b/otherarch/sdcpp/pmid.hpp index ea9f02eb6..e2a0f6282 100644 
--- a/otherarch/sdcpp/pmid.hpp +++ b/otherarch/sdcpp/pmid.hpp @@ -623,7 +623,12 @@ public: std::vector zeros_right; public: - PhotoMakerIDEncoder(ggml_backend_t backend, std::map& tensor_types, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f) + PhotoMakerIDEncoder(ggml_backend_t backend, + const String2GGMLType& tensor_types, + const std::string prefix, + SDVersion version = VERSION_SDXL, + PMVersion pm_v = PM_VERSION_1, + float sty = 20.f) : GGMLRunner(backend), version(version), pm_version(pm_v), diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 7557814ea..d967bbf0e 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -134,6 +134,27 @@ static bool sd_is_quiet = false; static std::string sdmodelfilename = ""; static bool photomaker_enabled = false; +static void set_sd_vae_tiling(sd_ctx_t* ctx, bool tiling) +{ + ctx->sd->vae_tiling = tiling; +} + +static int get_loaded_sd_version(sd_ctx_t* ctx) +{ + return ctx->sd->version; +} + +static bool loaded_model_is_chroma(sd_ctx_t* ctx) +{ + if (ctx != nullptr && ctx->sd != nullptr) { + auto maybe_flux = std::dynamic_pointer_cast(ctx->sd->diffusion_model); + if (maybe_flux != nullptr) { + return maybe_flux->flux.flux_params.is_chroma; + } + } + return false; +} + bool sdtype_load_model(const sd_load_model_inputs inputs) { sd_is_quiet = inputs.quiet; set_sd_quiet(sd_is_quiet); @@ -160,6 +181,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { { taesdpath = executable_path + "taesd.embd"; printf("With TAE SD VAE: %s\n",taesdpath.c_str()); + if (cfg_tiled_vae_threshold < 8192) { + printf(" disabling VAE tiling for TAESD\n"); + cfg_tiled_vae_threshold = 8192; + } } else if(vaefilename!="") { @@ -267,31 +292,35 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { sd_params->control_net_cpu); } - sd_ctx = new_sd_ctx(sd_params->model_path.c_str(), - 
sd_params->clip_l_path.c_str(), - sd_params->clip_g_path.c_str(), - sd_params->t5xxl_path.c_str(), - sd_params->diffusion_model_path.c_str(), - sd_params->vae_path.c_str(), - sd_params->taesd_path.c_str(), - sd_params->controlnet_path.c_str(), - sd_params->lora_model_dir.c_str(), - sd_params->embeddings_path.c_str(), - sd_params->stacked_id_embeddings_path.c_str(), - vae_decode_only, - sd_params->vae_tiling, - free_param, - sd_params->n_threads, - sd_params->wtype, - sd_params->rng_type, - sd_params->schedule, - sd_params->clip_on_cpu, - sd_params->control_net_cpu, - sd_params->vae_on_cpu, - sd_params->diffusion_flash_attn, - sd_params->chroma_use_dit_mask, - sd_params->chroma_use_t5_mask, - sd_params->chroma_t5_mask_pad); + sd_ctx_params_t params; + sd_ctx_params_init(¶ms); + params.model_path = sd_params->model_path.c_str(); + params.clip_l_path = sd_params->clip_l_path.c_str(); + params.clip_g_path = sd_params->clip_g_path.c_str(); + params.t5xxl_path = sd_params->t5xxl_path.c_str(); + params.diffusion_model_path = sd_params->diffusion_model_path.c_str(); + params.vae_path = sd_params->vae_path.c_str(); + params.taesd_path = sd_params->taesd_path.c_str(); + params.control_net_path = sd_params->controlnet_path.c_str(); + params.lora_model_dir = sd_params->lora_model_dir.c_str(); + params.embedding_dir = sd_params->embeddings_path.c_str(); + params.stacked_id_embed_dir = sd_params->stacked_id_embeddings_path.c_str(); + params.vae_decode_only = vae_decode_only; + params.vae_tiling = sd_params->vae_tiling; + params.free_params_immediately = free_param; + params.n_threads = sd_params->n_threads; + params.wtype = sd_params->wtype; + params.rng_type = sd_params->rng_type; + params.schedule = sd_params->schedule; + params.keep_clip_on_cpu = sd_params->clip_on_cpu; + params.keep_control_net_on_cpu = sd_params->control_net_cpu; + params.keep_vae_on_cpu = sd_params->vae_on_cpu; + params.diffusion_flash_attn = sd_params->diffusion_flash_attn; + params.chroma_use_dit_mask = 
sd_params->chroma_use_dit_mask; + params.chroma_use_t5_mask = sd_params->chroma_use_t5_mask; + params.chroma_t5_mask_pad = sd_params->chroma_t5_mask_pad; + + sd_ctx = new_sd_ctx(¶ms); if (sd_ctx == NULL) { printf("\nError: KCPP SD Failed to create context!\nIf using Flux/SD3.5, make sure you have ALL files required (e.g. VAE, T5, Clip...) or baked in!\n"); @@ -305,7 +334,6 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { if(lorafilename!="" && inputs.lora_multiplier>0) { printf("\nApply LoRA...\n"); - // sd_ctx->sd->set_pending_lora(lorafilename,inputs.lora_multiplier); sd_ctx->sd->apply_lora_from_file(lorafilename,inputs.lora_multiplier); } @@ -482,11 +510,29 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) auto loadedsdver = get_loaded_sd_version(sd_ctx); if (loadedsdver == SDVersion::VERSION_FLUX) { - if (!sd_loaded_chroma()) { - sd_params->cfg_scale = 1; //non chroma clamp cfg scale + if (loaded_model_is_chroma(sd_ctx)) { + if (sd_params->diffusion_flash_attn && sd_params->chroma_use_dit_mask) { + if (!sd_is_quiet && sddebugmode) { + printf("Chroma: flash attention is on, disabling DiT mask\n"); + } + sd_params->chroma_use_dit_mask = false; + } + } + else { + if (sd_params->cfg_scale != 1.0f) { + //non chroma clamp cfg scale + if (!sd_is_quiet && sddebugmode) { + printf("Flux: clamping CFG Scale to 1\n"); + } + sd_params->cfg_scale = 1.0f; + } } if (sampler == "euler a" || sampler == "k_euler_a" || sampler == "euler_a") { - sampler = "euler"; //euler a broken on flux + //euler a broken on flux + if (!sd_is_quiet && sddebugmode) { + printf("Flux: switching Euler A to Euler\n"); + } + sampler = "euler"; } } @@ -521,17 +567,6 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) bool dotile = (sd_params->width*sd_params->height > cfg_tiled_vae_threshold*cfg_tiled_vae_threshold); set_sd_vae_tiling(sd_ctx,dotile); //changes vae tiling, prevents memory related crash/oom - if (sd_params->clip_skip <= 0) { 
- // workaround for clip_skip being "stuck" at the previous requested value - // 2 is the default for all recent base models (SD2, SDXL, Flux, SD3) - if (sd_version_is_sd1((SDVersion)loadedsdver)) { - sd_params->clip_skip = 1; - } - else { - sd_params->clip_skip = 2; - } - } - //for img2img sd_image_t input_image = {0,0,0,nullptr}; std::vector extraimage_references; @@ -663,25 +698,25 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) } } - std::vector kontext_imgs; - if(extra_image_data.size()>0 && loadedsdver==SDVersion::VERSION_FLUX && !sd_loaded_chroma()) + std::vector reference_imgs; + if(extra_image_data.size()>0 && loadedsdver==SDVersion::VERSION_FLUX && !loaded_model_is_chroma(sd_ctx)) { for(int i=0;i photomaker_imgs; + std::vector photomaker_imgs; if(photomaker_enabled && extra_image_data.size()>0) { for(int i=0;iprompt.c_str(); + params.negative_prompt = sd_params->negative_prompt.c_str(); + params.clip_skip = sd_params->clip_skip; + params.guidance.txt_cfg = sd_params->cfg_scale; + params.guidance.img_cfg = sd_params->cfg_scale; + params.guidance.distilled_guidance = sd_params->guidance; + params.eta = sd_params->eta; + params.width = sd_params->width; + params.height = sd_params->height; + params.sample_method = sd_params->sample_method; + params.sample_steps = sd_params->sample_steps; + params.seed = sd_params->seed; + params.batch_count = sd_params->batch_count; + params.control_cond = control_image; + params.control_strength = sd_params->control_strength; + params.style_strength = sd_params->style_ratio; + params.normalize_input = sd_params->normalize_input; + params.input_id_images_path = sd_params->input_id_images_path.c_str(); + + params.guidance.slg.layers = sd_params->skip_layers.data(); + params.guidance.slg.layer_count = sd_params->skip_layers.size(); + params.guidance.slg.layer_start = sd_params->skip_layer_start; + params.guidance.slg.layer_end = sd_params->skip_layer_end; + params.guidance.slg.scale = 
sd_params->slg_scale; + + params.ref_images = reference_imgs.data(); + params.ref_images_count = reference_imgs.size(); + + kcpp_img_gen_params_t extra_params = {}; + extra_params.photomaker_references = photomaker_imgs.data(); + extra_params.photomaker_reference_count = photomaker_imgs.size(); + if (sd_params->mode == TXT2IMG) { if(!sd_is_quiet && sddebugmode==1) @@ -708,32 +778,8 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) sd_params->control_strength); } + results = generate_image(sd_ctx, ¶ms, &extra_params); - results = txt2img(sd_ctx, - sd_params->prompt.c_str(), - sd_params->negative_prompt.c_str(), - sd_params->clip_skip, - sd_params->cfg_scale, - sd_params->guidance, - sd_params->eta, - sd_params->width, - sd_params->height, - sd_params->sample_method, - sd_params->sample_steps, - sd_params->seed, - sd_params->batch_count, - control_image, - sd_params->control_strength, - sd_params->style_ratio, - sd_params->normalize_input, - sd_params->input_id_images_path.c_str(), - kontext_imgs.data(), kontext_imgs.size(), - sd_params->skip_layers.data(), - sd_params->skip_layers.size(), - sd_params->slg_scale, - sd_params->skip_layer_start, - sd_params->skip_layer_end, - photomaker_imgs); } else { if (sd_params->width <= 0 || sd_params->width % 64 != 0 || sd_params->height <= 0 || sd_params->height % 64 != 0) { @@ -839,34 +885,12 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) sd_params->strength); } - results = img2img(sd_ctx, - input_image, - mask_image, - sd_params->prompt.c_str(), - sd_params->negative_prompt.c_str(), - sd_params->clip_skip, - sd_params->cfg_scale, - sd_params->guidance, - sd_params->eta, - sd_params->width, - sd_params->height, - sd_params->sample_method, - sd_params->sample_steps, - sd_params->strength, - sd_params->seed, - sd_params->batch_count, - control_image, - sd_params->control_strength, - sd_params->style_ratio, - sd_params->normalize_input, - 
sd_params->input_id_images_path.c_str(), - kontext_imgs.data(), kontext_imgs.size(), - sd_params->skip_layers.data(), - sd_params->skip_layers.size(), - sd_params->slg_scale, - sd_params->skip_layer_start, - sd_params->skip_layer_end, - photomaker_imgs); + params.strength = sd_params->strength; + params.init_image = input_image; + params.mask_image = mask_image; + + results = generate_image(sd_ctx, ¶ms, &extra_params); + } if (results == NULL) { diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp index 322e888bd..ec9194d22 100644 --- a/otherarch/sdcpp/stable-diffusion.cpp +++ b/otherarch/sdcpp/stable-diffusion.cpp @@ -16,21 +16,26 @@ #include "tae.hpp" #include "vae.hpp" +// #define STB_IMAGE_IMPLEMENTATION +// #define STB_IMAGE_STATIC #include "stb_image.h" #include #include -static std::string pending_apply_lora_fname = ""; -static float pending_apply_lora_power = 1.0f; -static bool is_loaded_chroma = false; + +// #define STB_IMAGE_WRITE_IMPLEMENTATION +// #define STB_IMAGE_WRITE_STATIC +// #include "stb_image_write.h" const char* model_version_to_str[] = { "SD 1.x", "SD 1.x Inpaint", + "Instruct-Pix2Pix", "SD 2.x", "SD 2.x Inpaint", "SDXL", "SDXL Inpaint", + "SDXL Instruct-Pix2Pix", "SVD", "SD3.x", "Flux", @@ -48,8 +53,7 @@ const char* sampling_methods_str[] = { "iPNDM_v", "LCM", "DDIM \"trailing\"", - "TCD" -}; + "TCD"}; /*================================================== Helper Functions ================================================*/ @@ -104,6 +108,9 @@ public: bool vae_tiling = false; bool stacked_id = false; + bool is_using_v_parameterization = false; + bool is_using_edm_v_parameterization = false; + std::map tensors; std::string lora_model_dir; @@ -114,22 +121,6 @@ public: StableDiffusionGGML() = default; - StableDiffusionGGML(int n_threads, - bool vae_decode_only, - bool free_params_immediately, - std::string lora_model_dir, - rng_type_t rng_type) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - 
free_params_immediately(free_params_immediately), - lora_model_dir(lora_model_dir) { - if (rng_type == STD_DEFAULT_RNG) { - rng = std::make_shared(); - } else if (rng_type == CUDA_RNG) { - rng = std::make_shared(); - } - } - ~StableDiffusionGGML() { if (clip_backend != backend) { ggml_backend_free(clip_backend); @@ -143,36 +134,14 @@ public: ggml_backend_free(backend); } - bool load_from_file(const std::string& model_path, - const std::string& clip_l_path, - const std::string& clip_g_path, - const std::string& t5xxl_path, - const std::string& diffusion_model_path, - const std::string& vae_path, - const std::string control_net_path, - const std::string embeddings_path, - const std::string id_embeddings_path_original, - const std::string& taesd_path, - bool vae_tiling_, - ggml_type wtype, - schedule_t schedule, - bool clip_on_cpu, - bool control_net_cpu, - bool vae_on_cpu, - bool diffusion_flash_attn, - bool chroma_use_dit_mask, - bool chroma_use_t5_mask, - int chroma_t5_mask_pad) { - use_tiny_autoencoder = taesd_path.size() > 0; - std::string taesd_path_fixed = taesd_path; - is_loaded_chroma = false; - std::string id_embeddings_path = id_embeddings_path_original; + void init_backend() { #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); backend = ggml_backend_cuda_init(0); #endif #ifdef SD_USE_METAL LOG_DEBUG("Using Metal backend"); + ggml_log_set(ggml_log_callback_default, nullptr); backend = ggml_backend_metal_init(); #endif #ifdef SD_USE_VULKAN @@ -184,6 +153,14 @@ public: LOG_WARN("Failed to initialize Vulkan backend"); } #endif +#ifdef SD_USE_OPENCL + LOG_DEBUG("Using OpenCL backend"); + // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs + backend = ggml_backend_opencl_init(); + if (!backend) { + LOG_WARN("Failed to initialize OpenCL backend"); + } +#endif #ifdef SD_USE_SYCL LOG_DEBUG("Using SYCL backend"); backend = ggml_backend_sycl_init(0); @@ -193,80 +170,101 @@ public: LOG_DEBUG("Using CPU backend"); backend = 
ggml_backend_cpu_init(); } + } + + bool init(const sd_ctx_params_t* sd_ctx_params) { + n_threads = sd_ctx_params->n_threads; + vae_decode_only = sd_ctx_params->vae_decode_only; + free_params_immediately = sd_ctx_params->free_params_immediately; + lora_model_dir = SAFE_STR(sd_ctx_params->lora_model_dir); + taesd_path = SAFE_STR(sd_ctx_params->taesd_path); + use_tiny_autoencoder = taesd_path.size() > 0; + vae_tiling = sd_ctx_params->vae_tiling; + + if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) { + rng = std::make_shared(); + } else if (sd_ctx_params->rng_type == CUDA_RNG) { + rng = std::make_shared(); + } + + init_backend(); + + std::string taesd_path_fixed = taesd_path; ModelLoader model_loader; - vae_tiling = vae_tiling_; - - if (model_path.size() > 0) { - LOG_INFO("loading model from '%s'", model_path.c_str()); - if (!model_loader.init_from_file(model_path)) { - LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) { + LOG_INFO("loading model from '%s'", sd_ctx_params->model_path); + if (!model_loader.init_from_file(sd_ctx_params->model_path)) { + LOG_ERROR("init model loader from file failed: '%s'", sd_ctx_params->model_path); } } - if (clip_l_path.size() > 0) { - LOG_INFO("loading clip_l from '%s'", clip_l_path.c_str()); - if (!model_loader.init_from_file(clip_l_path, "text_encoders.clip_l.transformer.")) { - LOG_WARN("loading clip_l from '%s' failed", clip_l_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) > 0) { + LOG_INFO("loading diffusion model from '%s'", sd_ctx_params->diffusion_model_path); + if (!model_loader.init_from_file(sd_ctx_params->diffusion_model_path, "model.diffusion_model.")) { + LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->diffusion_model_path); } } - if (clip_g_path.size() > 0) { - LOG_INFO("loading clip_g from '%s'", clip_g_path.c_str()); - if (!model_loader.init_from_file(clip_g_path, 
"text_encoders.clip_g.transformer.")) { - LOG_WARN("loading clip_g from '%s' failed", clip_g_path.c_str()); + bool is_unet = model_loader.model_is_unet(); + + if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) { + LOG_INFO("loading clip_l from '%s'", sd_ctx_params->clip_l_path); + std::string prefix = is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer."; + if (!model_loader.init_from_file(sd_ctx_params->clip_l_path, prefix)) { + LOG_WARN("loading clip_l from '%s' failed", sd_ctx_params->clip_l_path); } } - if (t5xxl_path.size() > 0) { - LOG_INFO("loading t5xxl from '%s'", t5xxl_path.c_str()); - if (!model_loader.init_from_file(t5xxl_path, "text_encoders.t5xxl.transformer.")) { - LOG_WARN("loading t5xxl from '%s' failed", t5xxl_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->clip_g_path)) > 0) { + LOG_INFO("loading clip_g from '%s'", sd_ctx_params->clip_g_path); + std::string prefix = is_unet ? "cond_stage_model.1.transformer." : "text_encoders.clip_g.transformer."; + if (!model_loader.init_from_file(sd_ctx_params->clip_g_path, prefix)) { + LOG_WARN("loading clip_g from '%s' failed", sd_ctx_params->clip_g_path); } } - if (diffusion_model_path.size() > 0) { - LOG_INFO("loading diffusion model from '%s'", diffusion_model_path.c_str()); - if (!model_loader.init_from_file(diffusion_model_path, "model.diffusion_model.")) { - LOG_WARN("loading diffusion model from '%s' failed", diffusion_model_path.c_str()); + if (strlen(SAFE_STR(sd_ctx_params->t5xxl_path)) > 0) { + LOG_INFO("loading t5xxl from '%s'", sd_ctx_params->t5xxl_path); + if (!model_loader.init_from_file(sd_ctx_params->t5xxl_path, "text_encoders.t5xxl.transformer.")) { + LOG_WARN("loading t5xxl from '%s' failed", sd_ctx_params->t5xxl_path); } } - if (vae_path.size() > 0) { - LOG_INFO("loading vae from '%s'", vae_path.c_str()); - if (!model_loader.init_from_file(vae_path, "vae.")) { - LOG_WARN("loading vae from '%s' failed", vae_path.c_str()); + if 
(strlen(SAFE_STR(sd_ctx_params->vae_path)) > 0) { + LOG_INFO("loading vae from '%s'", sd_ctx_params->vae_path); + if (!model_loader.init_from_file(sd_ctx_params->vae_path, "vae.")) { + LOG_WARN("loading vae from '%s' failed", sd_ctx_params->vae_path); } } version = model_loader.get_sd_version(); - if (version == VERSION_COUNT && model_path.size() > 0 && diffusion_model_path.size() == 0 && t5xxl_path.size() > 0) { - bool endswithsafetensors = (model_path.rfind(".safetensors") == model_path.size() - 12); + // kcpp fallback to separate diffusion model passed as model + if (version == VERSION_COUNT && + strlen(SAFE_STR(sd_ctx_params->model_path)) > 0 && + strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) == 0 && + strlen(SAFE_STR(sd_ctx_params->t5xxl_path)) > 0 ) + { + bool endswithsafetensors = ends_with(sd_ctx_params->model_path, ".safetensors"); if(endswithsafetensors && !model_loader.has_diffusion_model_tensors()) { LOG_INFO("SD Diffusion Model tensors missing! Fallback trying alternative tensor names...\n"); - if (!model_loader.init_from_file(model_path, "model.diffusion_model.")) { - LOG_WARN("loading diffusion model from '%s' failed", model_path.c_str()); + if (!model_loader.init_from_file(sd_ctx_params->model_path, "model.diffusion_model.")) { + LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->model_path); } version = model_loader.get_sd_version(); } } if (version == VERSION_COUNT) { - LOG_ERROR("Error: get SD version from file failed: '%s'", model_path.c_str()); + LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path)); return false; } LOG_INFO("Version: %s ", model_version_to_str[version]); - if(id_embeddings_path!="" && version!=VERSION_SDXL) - { - printf("\n!!!!\nWARNING: PhotoMaker is only compatible with SDXL models. 
PhotoMaker will be disabled!\n!!!!\n"); - id_embeddings_path = ""; - } - if(use_tiny_autoencoder) { std::string to_search = "taesd.embd"; @@ -293,6 +291,7 @@ public: } } + ggml_type wtype = (ggml_type)sd_ctx_params->wtype; if (wtype == GGML_TYPE_COUNT) { model_wtype = model_loader.get_sd_wtype(); if (model_wtype == GGML_TYPE_COUNT) { @@ -325,16 +324,16 @@ public: model_loader.set_wtype_override(GGML_TYPE_F32, "vae."); } - LOG_INFO("Weight type: %s", model_wtype != SD_TYPE_COUNT ? ggml_type_name(model_wtype) : "??"); - LOG_INFO("Conditioner weight type: %s", conditioner_wtype != SD_TYPE_COUNT ? ggml_type_name(conditioner_wtype) : "??"); - LOG_INFO("Diffusion model weight type: %s", diffusion_model_wtype != SD_TYPE_COUNT ? ggml_type_name(diffusion_model_wtype) : "??"); - LOG_INFO("VAE weight type: %s", vae_wtype != SD_TYPE_COUNT ? ggml_type_name(vae_wtype) : "??"); + LOG_INFO("Weight type: %s", model_wtype != GGML_TYPE_COUNT ? ggml_type_name(model_wtype) : "??"); + LOG_INFO("Conditioner weight type: %s", conditioner_wtype != GGML_TYPE_COUNT ? ggml_type_name(conditioner_wtype) : "??"); + LOG_INFO("Diffusion model weight type: %s", diffusion_model_wtype != GGML_TYPE_COUNT ? ggml_type_name(diffusion_model_wtype) : "??"); + LOG_INFO("VAE weight type: %s", vae_wtype != GGML_TYPE_COUNT ? ggml_type_name(vae_wtype) : "??"); LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); if (sd_version_is_sdxl(version)) { scale_factor = 0.13025f; - if (vae_path.size() == 0 && taesd_path_fixed.size() == 0) { + if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && taesd_path_fixed.size() == 0) { LOG_WARN( "!!!It looks like you are using SDXL model. 
" "If you find that the generated images are completely black, " @@ -348,6 +347,8 @@ public: // TODO: shift_factor } + bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; + if (version == VERSION_SVD) { clip_vision = std::make_shared(backend, model_loader.tensor_storages_types); clip_vision->alloc_params_buffer(); @@ -375,11 +376,11 @@ public: LOG_INFO("CLIP: Using CPU backend"); clip_backend = ggml_backend_cpu_init(); } - if (diffusion_flash_attn) { + if (sd_ctx_params->diffusion_flash_attn) { LOG_INFO("Using flash attention in the diffusion model"); } if (sd_version_is_sd3(version)) { - if (diffusion_flash_attn) { + if (sd_ctx_params->diffusion_flash_attn) { LOG_WARN("flash attention in this diffusion model is currently unsupported!"); } cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); @@ -389,23 +390,44 @@ public: for (auto pair : model_loader.tensor_storages_types) { if (pair.first.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) { is_chroma = true; - is_loaded_chroma = true; break; } } if (is_chroma) { - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types, -1, chroma_use_t5_mask, chroma_t5_mask_pad); + cond_stage_model = std::make_shared(clip_backend, + model_loader.tensor_storages_types, + -1, + sd_ctx_params->chroma_use_t5_mask, + sd_ctx_params->chroma_t5_mask_pad); } else { cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); } - diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn, chroma_use_dit_mask); + diffusion_model = std::make_shared(backend, + model_loader.tensor_storages_types, + version, + sd_ctx_params->diffusion_flash_attn, + sd_ctx_params->chroma_use_dit_mask); } else { - if (id_embeddings_path.find("v2") != std::string::npos) { - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2); + if 
(strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { + cond_stage_model = std::make_shared(clip_backend, + model_loader.tensor_storages_types, + SAFE_STR(sd_ctx_params->embedding_dir), + version, + PM_VERSION_2); } else { - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types, embeddings_path, version); + cond_stage_model = std::make_shared(clip_backend, + model_loader.tensor_storages_types, + SAFE_STR(sd_ctx_params->embedding_dir), + version); + } + diffusion_model = std::make_shared(backend, + model_loader.tensor_storages_types, + version, + sd_ctx_params->diffusion_flash_attn); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the diffusion model"); + std::dynamic_pointer_cast(diffusion_model)->unet.enable_conv2d_direct(); } - diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn); } cond_stage_model->alloc_params_buffer(); @@ -415,49 +437,74 @@ public: diffusion_model->get_param_tensors(tensors); if (!use_tiny_autoencoder) { - if (vae_on_cpu && !ggml_backend_is_cpu(backend)) { + if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_INFO("VAE Autoencoder: Using CPU backend"); vae_backend = ggml_backend_cpu_init(); } else { vae_backend = backend; } - first_stage_model = std::make_shared(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version); + first_stage_model = std::make_shared(vae_backend, + model_loader.tensor_storages_types, + "first_stage_model", + vae_decode_only, + false, + version); + if (sd_ctx_params->vae_conv_direct) { + LOG_INFO("Using Conv2d direct in the vae model"); + first_stage_model->enable_conv2d_direct(); + } first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { - tae_first_stage = std::make_shared(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, 
version); + tae_first_stage = std::make_shared(backend, + model_loader.tensor_storages_types, + "decoder.layers", + vae_decode_only, + version); + if (sd_ctx_params->vae_conv_direct) { + LOG_INFO("Using Conv2d direct in the tae model"); + tae_first_stage->enable_conv2d_direct(); + } } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); - if (control_net_path.size() > 0) { + if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { ggml_backend_t controlnet_backend = NULL; - if (control_net_cpu && !ggml_backend_is_cpu(backend)) { + if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_DEBUG("ControlNet: Using CPU backend"); controlnet_backend = ggml_backend_cpu_init(); } else { controlnet_backend = backend; } control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the control net"); + control_net->enable_conv2d_direct(); + } } - if (id_embeddings_path.find("v2") != std::string::npos) { + if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { pmid_model = std::make_shared(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2); LOG_INFO("using PhotoMaker Version 2"); } else { pmid_model = std::make_shared(backend, model_loader.tensor_storages_types, "pmid", version); } - if (id_embeddings_path.size() > 0) { - pmid_lora = std::make_shared(backend, id_embeddings_path, ""); + if (strlen(SAFE_STR(sd_ctx_params->stacked_id_embed_dir)) > 0) { + if (version != VERSION_SDXL) { + printf("\n!!!!\nWARNING: PhotoMaker is only compatible with SDXL models. 
PhotoMaker will be disabled!\n!!!!\n"); + } else { + pmid_lora = std::make_shared(backend, sd_ctx_params->stacked_id_embed_dir, ""); if (!pmid_lora->load_from_file(true)) { - LOG_WARN("load photomaker lora tensors from %s failed", id_embeddings_path.c_str()); + LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->stacked_id_embed_dir); return false; } - LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", id_embeddings_path.c_str()); - if (!model_loader.init_from_file(id_embeddings_path, "pmid.")) { - LOG_WARN("loading stacked ID embedding from '%s' failed", id_embeddings_path.c_str()); + LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", sd_ctx_params->stacked_id_embed_dir); + if (!model_loader.init_from_file(sd_ctx_params->stacked_id_embed_dir, "pmid.")) { + LOG_WARN("loading stacked ID embedding from '%s' failed", sd_ctx_params->stacked_id_embed_dir); } else { stacked_id = true; } + } } if (stacked_id) { if (!pmid_model->alloc_params_buffer()) { @@ -526,7 +573,7 @@ public: } size_t control_net_params_mem_size = 0; if (control_net) { - if (!control_net->load_from_file(control_net_path)) { + if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) { return false; } control_net_params_mem_size = control_net->get_params_buffer_size(); @@ -582,15 +629,20 @@ public: } int64_t t1 = ggml_time_ms(); - LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); + LOG_INFO("loading model from '%s' completed, taking %.2fs", SAFE_STR(sd_ctx_params->model_path), (t1 - t0) * 1.0f / 1000); // check is_using_v_parameterization_for_sd2 - bool is_using_v_parameterization = false; + if (sd_version_is_sd2(version)) { if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) { is_using_v_parameterization = true; } } else if (sd_version_is_sdxl(version)) { + if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != 
model_loader.tensor_storages_types.end()) { + // CosXL models + // TODO: get sigma_min and sigma_max values from file + is_using_edm_v_parameterization = true; + } if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) { is_using_v_parameterization = true; } @@ -615,12 +667,15 @@ public: } else if (is_using_v_parameterization) { LOG_INFO("running in v-prediction mode"); denoiser = std::make_shared(); + } else if (is_using_edm_v_parameterization) { + LOG_INFO("running in v-prediction EDM mode"); + denoiser = std::make_shared(); } else { LOG_INFO("running in eps-prediction mode"); } - if (schedule != DEFAULT) { - switch (schedule) { + if (sd_ctx_params->schedule != DEFAULT) { + switch (sd_ctx_params->schedule) { case DISCRETE: LOG_INFO("running with discrete schedule"); denoiser->schedule = std::make_shared(); @@ -647,7 +702,7 @@ public: // Don't touch anything. break; default: - LOG_ERROR("Unknown schedule %i", schedule); + LOG_ERROR("Unknown schedule %i", sd_ctx_params->schedule); abort(); } } @@ -701,11 +756,6 @@ public: return result < -1; } - void set_pending_lora(const std::string& lora_path, float multiplier) { - pending_apply_lora_fname = lora_path; - pending_apply_lora_power = multiplier; - } - void apply_lora_from_file(const std::string& lora_path, float multiplier) { int64_t t0 = ggml_time_ms(); std::string st_file_path = lora_path; @@ -876,7 +926,7 @@ public: set_timestep_embedding(timesteps, y, out_dim); } int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing svd condition graph completed, taking %d ms", (int)(t1 - t0)); + LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0); return {c_crossattn, y, c_concat}; } @@ -885,22 +935,30 @@ public: ggml_tensor* noise, SDCondition cond, SDCondition uncond, + SDCondition img_cond, ggml_tensor* control_hint, float control_strength, - float min_cfg, - float cfg_scale, - float guidance, + sd_guidance_params_t guidance, float eta, 
sample_method_t method, const std::vector& sigmas, int start_merge_step, SDCondition id_cond, std::vector ref_latents = {}, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr) { + ggml_tensor* denoise_mask = nullptr) { + std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); + + float cfg_scale = guidance.txt_cfg; + float img_cfg_scale = guidance.img_cfg; + float slg_scale = guidance.slg.scale; + + float min_cfg = guidance.min_cfg; + + if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) { + LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance..."); + img_cfg_scale = cfg_scale; + } + LOG_DEBUG("Sample"); struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); @@ -922,13 +980,15 @@ public: struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); - bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; + bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL; + bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != NULL; bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; // denoise wrapper - struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* out_uncond = NULL; - struct ggml_tensor* out_skip = NULL; + struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* out_uncond = NULL; + struct ggml_tensor* out_skip = NULL; + struct ggml_tensor* out_img_cond = NULL; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); @@ -941,6 +1001,9 @@ public: LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); } } + if (has_img_cond) { + out_img_cond = ggml_dup_tensor(work_ctx, x); + } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); auto 
denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { @@ -958,7 +1021,7 @@ public: float t = denoiser->sigma_to_t(sigma); std::vector timesteps_vec(x->ne[3], t); // [N, ] auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - std::vector guidance_vec(x->ne[3], guidance); + std::vector guidance_vec(x->ne[3], guidance.distilled_guidance); auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); copy_ggml_tensor(noised_input, input); @@ -1025,8 +1088,25 @@ public: negative_data = (float*)out_uncond->data; } + float* img_cond_data = NULL; + if (has_img_cond) { + diffusion_model->compute(n_threads, + noised_input, + timesteps, + img_cond.c_crossattn, + img_cond.c_concat, + img_cond.c_vector, + guidance_tensor, + ref_latents, + -1, + controls, + control_strength, + &out_img_cond); + img_cond_data = (float*)out_img_cond->data; + } + int step_count = sigmas.size(); - bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count); + bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count); float* skip_layer_data = NULL; if (is_skiplayer_step) { LOG_DEBUG("Skipping layers at step %d\n", step); @@ -1060,8 +1140,17 @@ public: int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); } else { - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); + if (has_img_cond) { + // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) + latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]); + } else { + // img_cfg_scale == cfg_scale + latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); + } } + } else if 
(has_img_cond) { + // img_cfg_scale == 1 + latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]); } if (is_skiplayer_step) { latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; @@ -1075,10 +1164,10 @@ public: pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } - if (noise_mask != nullptr) { + if (denoise_mask != nullptr) { for (int64_t x = 0; x < denoised->ne[0]; x++) { for (int64_t y = 0; y < denoised->ne[1]; y++) { - float mask = ggml_tensor_get_f32(noise_mask, x, y); + float mask = ggml_tensor_get_f32(denoise_mask, x, y); for (int64_t k = 0; k < denoised->ne[2]; k++) { float init = ggml_tensor_get_f32(init_latent, x, y, k); float den = ggml_tensor_get_f32(denoised, x, y, k); @@ -1267,8 +1356,7 @@ public: ggml_tensor_scale_output(result); } } else { - //koboldcpp never use tiling with taesd - if (false && vae_tiling) { // TODO: support tiling vae encode + if (vae_tiling) { // split latent in 64x64 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { tae_first_stage->compute(n_threads, in, decode, &out); @@ -1299,96 +1387,301 @@ public: /*================================================= SD API ==================================================*/ +#define NONE_STR "NONE" + +const char* sd_type_name(enum sd_type_t type) { + return ggml_type_name((ggml_type)type); +} + +enum sd_type_t str_to_sd_type(const char* str) { + for (int i = 0; i < SD_TYPE_COUNT; i++) { + auto trait = ggml_get_type_traits((ggml_type)i); + if (!strcmp(str, trait->type_name)) { + return (enum sd_type_t)i; + } + } + return SD_TYPE_COUNT; +} + +const char* rng_type_to_str[] = { + "std_default", + "cuda", +}; + +const char* sd_rng_type_name(enum rng_type_t rng_type) { + if (rng_type < RNG_TYPE_COUNT) { + return rng_type_to_str[rng_type]; + } + return NONE_STR; +} + +enum rng_type_t 
str_to_rng_type(const char* str) { + for (int i = 0; i < RNG_TYPE_COUNT; i++) { + if (!strcmp(str, rng_type_to_str[i])) { + return (enum rng_type_t)i; + } + } + return RNG_TYPE_COUNT; +} + +const char* sample_method_to_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "ipndm", + "ipndm_v", + "lcm", + "ddim_trailing", + "tcd", +}; + +const char* sd_sample_method_name(enum sample_method_t sample_method) { + if (sample_method < SAMPLE_METHOD_COUNT) { + return sample_method_to_str[sample_method]; + } + return NONE_STR; +} + +enum sample_method_t str_to_sample_method(const char* str) { + for (int i = 0; i < SAMPLE_METHOD_COUNT; i++) { + if (!strcmp(str, sample_method_to_str[i])) { + return (enum sample_method_t)i; + } + } + return SAMPLE_METHOD_COUNT; +} + +const char* schedule_to_str[] = { + "default", + "discrete", + "karras", + "exponential", + "ays", + "gits", +}; + +const char* sd_schedule_name(enum schedule_t schedule) { + if (schedule < SCHEDULE_COUNT) { + return schedule_to_str[schedule]; + } + return NONE_STR; +} + +enum schedule_t str_to_schedule(const char* str) { + for (int i = 0; i < SCHEDULE_COUNT; i++) { + if (!strcmp(str, schedule_to_str[i])) { + return (enum schedule_t)i; + } + } + return SCHEDULE_COUNT; +} + +void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { + memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t)); + sd_ctx_params->vae_decode_only = true; + sd_ctx_params->vae_tiling = false; + sd_ctx_params->free_params_immediately = true; + sd_ctx_params->n_threads = sd_get_num_physical_cores(); + sd_ctx_params->wtype = SD_TYPE_COUNT; + sd_ctx_params->rng_type = CUDA_RNG; + sd_ctx_params->schedule = DEFAULT; + sd_ctx_params->keep_clip_on_cpu = false; + sd_ctx_params->keep_control_net_on_cpu = false; + sd_ctx_params->keep_vae_on_cpu = false; + sd_ctx_params->diffusion_flash_attn = false; + sd_ctx_params->chroma_use_dit_mask = true; + sd_ctx_params->chroma_use_t5_mask = false; + 
sd_ctx_params->chroma_t5_mask_pad = 1; +} + +char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { + char* buf = (char*)malloc(4096); + if (!buf) + return NULL; + buf[0] = '\0'; + + snprintf(buf + strlen(buf), 4096 - strlen(buf), + "model_path: %s\n" + "clip_l_path: %s\n" + "clip_g_path: %s\n" + "t5xxl_path: %s\n" + "diffusion_model_path: %s\n" + "vae_path: %s\n" + "taesd_path: %s\n" + "control_net_path: %s\n" + "lora_model_dir: %s\n" + "embedding_dir: %s\n" + "stacked_id_embed_dir: %s\n" + "vae_decode_only: %s\n" + "vae_tiling: %s\n" + "free_params_immediately: %s\n" + "n_threads: %d\n" + "wtype: %s\n" + "rng_type: %s\n" + "schedule: %s\n" + "keep_clip_on_cpu: %s\n" + "keep_control_net_on_cpu: %s\n" + "keep_vae_on_cpu: %s\n" + "diffusion_flash_attn: %s\n" + "chroma_use_dit_mask: %s\n" + "chroma_use_t5_mask: %s\n" + "chroma_t5_mask_pad: %d\n", + SAFE_STR(sd_ctx_params->model_path), + SAFE_STR(sd_ctx_params->clip_l_path), + SAFE_STR(sd_ctx_params->clip_g_path), + SAFE_STR(sd_ctx_params->t5xxl_path), + SAFE_STR(sd_ctx_params->diffusion_model_path), + SAFE_STR(sd_ctx_params->vae_path), + SAFE_STR(sd_ctx_params->taesd_path), + SAFE_STR(sd_ctx_params->control_net_path), + SAFE_STR(sd_ctx_params->lora_model_dir), + SAFE_STR(sd_ctx_params->embedding_dir), + SAFE_STR(sd_ctx_params->stacked_id_embed_dir), + BOOL_STR(sd_ctx_params->vae_decode_only), + BOOL_STR(sd_ctx_params->vae_tiling), + BOOL_STR(sd_ctx_params->free_params_immediately), + sd_ctx_params->n_threads, + sd_type_name(sd_ctx_params->wtype), + sd_rng_type_name(sd_ctx_params->rng_type), + sd_schedule_name(sd_ctx_params->schedule), + BOOL_STR(sd_ctx_params->keep_clip_on_cpu), + BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), + BOOL_STR(sd_ctx_params->keep_vae_on_cpu), + BOOL_STR(sd_ctx_params->diffusion_flash_attn), + BOOL_STR(sd_ctx_params->chroma_use_dit_mask), + BOOL_STR(sd_ctx_params->chroma_use_t5_mask), + sd_ctx_params->chroma_t5_mask_pad); + + return buf; +} + +void 
sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) { + memset((void*)sd_img_gen_params, 0, sizeof(sd_img_gen_params_t)); + sd_img_gen_params->clip_skip = -1; + sd_img_gen_params->guidance.txt_cfg = 7.0f; + sd_img_gen_params->guidance.min_cfg = 1.0f; + sd_img_gen_params->guidance.img_cfg = INFINITY; + sd_img_gen_params->guidance.distilled_guidance = 3.5f; + sd_img_gen_params->guidance.slg.layer_count = 0; + sd_img_gen_params->guidance.slg.layer_start = 0.01f; + sd_img_gen_params->guidance.slg.layer_end = 0.2f; + sd_img_gen_params->guidance.slg.scale = 0.f; + sd_img_gen_params->ref_images_count = 0; + sd_img_gen_params->width = 512; + sd_img_gen_params->height = 512; + sd_img_gen_params->sample_method = EULER_A; + sd_img_gen_params->sample_steps = 20; + sd_img_gen_params->eta = 0.f; + sd_img_gen_params->strength = 0.75f; + sd_img_gen_params->seed = -1; + sd_img_gen_params->batch_count = 1; + sd_img_gen_params->control_strength = 0.9f; + sd_img_gen_params->style_strength = 20.f; + sd_img_gen_params->normalize_input = false; +} + +char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) { + char* buf = (char*)malloc(4096); + if (!buf) + return NULL; + buf[0] = '\0'; + + snprintf(buf + strlen(buf), 4096 - strlen(buf), + "prompt: %s\n" + "negative_prompt: %s\n" + "clip_skip: %d\n" + "txt_cfg: %.2f\n" + "img_cfg: %.2f\n" + "min_cfg: %.2f\n" + "distilled_guidance: %.2f\n" + "slg.layer_count: %zu\n" + "slg.layer_start: %.2f\n" + "slg.layer_end: %.2f\n" + "slg.scale: %.2f\n" + "width: %d\n" + "height: %d\n" + "sample_method: %s\n" + "sample_steps: %d\n" + "eta: %.2f\n" + "strength: %.2f\n" + "seed: %" PRId64 + "\n" + "batch_count: %d\n" + "ref_images_count: %d\n" + "control_strength: %.2f\n" + "style_strength: %.2f\n" + "normalize_input: %s\n" + "input_id_images_path: %s\n", + SAFE_STR(sd_img_gen_params->prompt), + SAFE_STR(sd_img_gen_params->negative_prompt), + sd_img_gen_params->clip_skip, + sd_img_gen_params->guidance.txt_cfg, + 
sd_img_gen_params->guidance.img_cfg, + sd_img_gen_params->guidance.min_cfg, + sd_img_gen_params->guidance.distilled_guidance, + sd_img_gen_params->guidance.slg.layer_count, + sd_img_gen_params->guidance.slg.layer_start, + sd_img_gen_params->guidance.slg.layer_end, + sd_img_gen_params->guidance.slg.scale, + sd_img_gen_params->width, + sd_img_gen_params->height, + sd_sample_method_name(sd_img_gen_params->sample_method), + sd_img_gen_params->sample_steps, + sd_img_gen_params->eta, + sd_img_gen_params->strength, + sd_img_gen_params->seed, + sd_img_gen_params->batch_count, + sd_img_gen_params->ref_images_count, + sd_img_gen_params->control_strength, + sd_img_gen_params->style_strength, + BOOL_STR(sd_img_gen_params->normalize_input), + SAFE_STR(sd_img_gen_params->input_id_images_path)); + + return buf; +} + +void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) { + memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t)); + sd_vid_gen_params->guidance.txt_cfg = 7.0f; + sd_vid_gen_params->guidance.min_cfg = 1.0f; + sd_vid_gen_params->guidance.img_cfg = INFINITY; + sd_vid_gen_params->guidance.distilled_guidance = 3.5f; + sd_vid_gen_params->guidance.slg.layer_count = 0; + sd_vid_gen_params->guidance.slg.layer_start = 0.01f; + sd_vid_gen_params->guidance.slg.layer_end = 0.2f; + sd_vid_gen_params->guidance.slg.scale = 0.f; + sd_vid_gen_params->width = 512; + sd_vid_gen_params->height = 512; + sd_vid_gen_params->sample_method = EULER_A; + sd_vid_gen_params->sample_steps = 20; + sd_vid_gen_params->strength = 0.75f; + sd_vid_gen_params->seed = -1; + sd_vid_gen_params->video_frames = 6; + sd_vid_gen_params->motion_bucket_id = 127; + sd_vid_gen_params->fps = 6; + sd_vid_gen_params->augmentation_level = 0.f; +} + struct sd_ctx_t { StableDiffusionGGML* sd = NULL; }; -void set_sd_vae_tiling(sd_ctx_t* ctx, bool tiling) -{ - ctx->sd->vae_tiling = tiling; -} - -int get_loaded_sd_version(sd_ctx_t* ctx) -{ - return ctx->sd->version; -} - -//kcpp hack to check if 
chroma -bool sd_loaded_chroma() -{ - return is_loaded_chroma; -} - -sd_ctx_t* new_sd_ctx(const char* model_path_c_str, - const char* clip_l_path_c_str, - const char* clip_g_path_c_str, - const char* t5xxl_path_c_str, - const char* diffusion_model_path_c_str, - const char* vae_path_c_str, - const char* taesd_path_c_str, - const char* control_net_path_c_str, - const char* lora_model_dir_c_str, - const char* embed_dir_c_str, - const char* id_embed_dir_c_str, - bool vae_decode_only, - bool vae_tiling, - bool free_params_immediately, - int n_threads, - enum sd_type_t wtype, - enum rng_type_t rng_type, - enum schedule_t s, - bool keep_clip_on_cpu, - bool keep_control_net_cpu, - bool keep_vae_on_cpu, - bool diffusion_flash_attn, - bool chroma_use_dit_mask, - bool chroma_use_t5_mask, - int chroma_t5_mask_pad) { +sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) { sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; } - std::string model_path(model_path_c_str); - std::string clip_l_path(clip_l_path_c_str); - std::string clip_g_path(clip_g_path_c_str); - std::string t5xxl_path(t5xxl_path_c_str); - std::string diffusion_model_path(diffusion_model_path_c_str); - std::string vae_path(vae_path_c_str); - std::string taesd_path(taesd_path_c_str); - std::string control_net_path(control_net_path_c_str); - std::string embd_path(embed_dir_c_str); - std::string id_embd_path(id_embed_dir_c_str); - std::string lora_model_dir(lora_model_dir_c_str); - sd_ctx->sd = new StableDiffusionGGML(n_threads, - vae_decode_only, - free_params_immediately, - lora_model_dir, - rng_type); + sd_ctx->sd = new StableDiffusionGGML(); if (sd_ctx->sd == NULL) { return NULL; } - if (!sd_ctx->sd->load_from_file(model_path, - clip_l_path, - clip_g_path, - t5xxl_path_c_str, - diffusion_model_path, - vae_path, - control_net_path, - embd_path, - id_embd_path, - taesd_path, - vae_tiling, - (ggml_type)wtype, - s, - keep_clip_on_cpu, - keep_control_net_cpu, - 
keep_vae_on_cpu, - diffusion_flash_attn, - chroma_use_dit_mask, - chroma_use_t5_mask, - chroma_t5_mask_pad)) { + if (!sd_ctx->sd->init(sd_ctx_params)) { delete sd_ctx->sd; sd_ctx->sd = NULL; free(sd_ctx); @@ -1405,33 +1698,29 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) { free(sd_ctx); } -sd_image_t* generate_image(sd_ctx_t* sd_ctx, - struct ggml_context* work_ctx, - ggml_tensor* init_latent, - std::string prompt, - std::string negative_prompt, - int clip_skip, - float cfg_scale, - float guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - const std::vector& sigmas, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_ratio, - bool normalize_input, - std::string input_id_images_path, - std::vector ref_latents, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL, - const std::vector photomaker_references = std::vector()) { +sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, + struct ggml_context* work_ctx, + ggml_tensor* init_latent, + std::string prompt, + std::string negative_prompt, + int clip_skip, + sd_guidance_params_t guidance, + float eta, + int width, + int height, + enum sample_method_t sample_method, + const std::vector& sigmas, + int64_t seed, + int batch_count, + const sd_image_t* control_cond, + float control_strength, + float style_ratio, + bool normalize_input, + std::string input_id_images_path, + std::vector ref_latents, + ggml_tensor* concat_latent = NULL, + ggml_tensor* denoise_mask = NULL, + const kcpp_img_gen_params_t* kcpp_img_gen_params = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1458,14 +1747,14 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, prompt = result_pair.second; LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str()); - int64_t t0 = ggml_time_ms(); - // sd_ctx->sd->apply_loras(lora_f2m); //only use hardcoded lora for kcpp - if(pending_apply_lora_fname!="" && pending_apply_lora_power>0) - { - printf("\nApplying LoRA now...\n"); - sd_ctx->sd->apply_lora_from_file(pending_apply_lora_fname,pending_apply_lora_power); - pending_apply_lora_fname = ""; + //only use hardcoded lora for kcpp + if (!lora_f2m.empty()) { + lora_f2m.clear(); + printf("\nWarning: not applying LoRAs requested by prompt!\n"); } + + int64_t t0 = ggml_time_ms(); + sd_ctx->sd->apply_loras(lora_f2m); int64_t t1 = ggml_time_ms(); LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); @@ -1474,7 +1763,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, ggml_tensor* init_img = NULL; SDCondition id_cond; std::vector class_tokens_mask; - if (sd_ctx->sd->pmid_model && photomaker_references.size()>0) + if (sd_ctx->sd->pmid_lora && kcpp_img_gen_params && kcpp_img_gen_params->photomaker_reference_count>0) { sd_ctx->sd->stacked_id = true; //turn on photomaker if needed } @@ -1522,16 +1811,16 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, } // handle multiple photomaker image passed in by kcpp - if (sd_ctx->sd->pmid_model && photomaker_references.size()>0) + if (sd_ctx->sd->pmid_lora && kcpp_img_gen_params) { - for(int i=0;iphotomaker_reference_count;++i) { int c = 0; int width, height; - width = photomaker_references[i]->width; - height = photomaker_references[i]->height; - c = photomaker_references[i]->channel; - uint8_t* input_image_buffer = photomaker_references[i]->data; + width = kcpp_img_gen_params->photomaker_references[i].width; + height = kcpp_img_gen_params->photomaker_references[i].height; + c = 
kcpp_img_gen_params->photomaker_references[i].channel; + uint8_t* input_image_buffer = kcpp_img_gen_params->photomaker_references[i].data; sd_image_t* input_image = NULL; input_image = new sd_image_t{(uint32_t)width, (uint32_t)height, @@ -1616,9 +1905,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model->get_adm_in_channels()); SDCondition uncond; - if (cfg_scale != 1.0) { + if (guidance.txt_cfg != 1.0 || + (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { bool force_zero_embeddings = false; - if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0) { + if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) { force_zero_embeddings = true; } uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, @@ -1631,7 +1921,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, force_zero_embeddings); } t1 = ggml_time_ms(); - LOG_INFO("get_learned_condition completed, taking %d ms", t1 - t0); + LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->cond_stage_model->free_params_buffer(); @@ -1655,38 +1945,50 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, int W = width / 8; int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - ggml_tensor* noise_mask = nullptr; if (sd_version_is_inpaint(sd_ctx->sd->version)) { - if (masked_image == NULL) { - int64_t mask_channels = 1; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - mask_channels = 8 * 8; // flatten the whole mask - } - // no mask, set the whole image as masked - masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); - for (int64_t x = 0; x < masked_image->ne[0]; x++) { - for (int64_t y = 0; y < masked_image->ne[1]; y++) { - if (sd_ctx->sd->version == 
VERSION_FLUX_FILL) { - // TODO: this might be wrong - for (int64_t c = 0; c < init_latent->ne[2]; c++) { - ggml_tensor_set_f32(masked_image, 0, x, y, c); - } - for (int64_t c = init_latent->ne[2]; c < masked_image->ne[2]; c++) { - ggml_tensor_set_f32(masked_image, 1, x, y, c); - } - } else { - ggml_tensor_set_f32(masked_image, 1, x, y, 0); - for (int64_t c = 1; c < masked_image->ne[2]; c++) { - ggml_tensor_set_f32(masked_image, 0, x, y, c); - } + int64_t mask_channels = 1; + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + mask_channels = 8 * 8; // flatten the whole mask + } + auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); + // no mask, set the whole image as masked + for (int64_t x = 0; x < empty_latent->ne[0]; x++) { + for (int64_t y = 0; y < empty_latent->ne[1]; y++) { + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + // TODO: this might be wrong + for (int64_t c = 0; c < init_latent->ne[2]; c++) { + ggml_tensor_set_f32(empty_latent, 0, x, y, c); + } + for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) { + ggml_tensor_set_f32(empty_latent, 1, x, y, c); + } + } else { + ggml_tensor_set_f32(empty_latent, 1, x, y, 0); + for (int64_t c = 1; c < empty_latent->ne[2]; c++) { + ggml_tensor_set_f32(empty_latent, 0, x, y, c); } } } } - cond.c_concat = masked_image; - uncond.c_concat = masked_image; - } else { - noise_mask = masked_image; + if (concat_latent == NULL) { + concat_latent = empty_latent; + } + cond.c_concat = concat_latent; + uncond.c_concat = empty_latent; + denoise_mask = NULL; + } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) { + auto empty_latent = ggml_dup_tensor(work_ctx, init_latent); + ggml_set_f32(empty_latent, 0); + uncond.c_concat = empty_latent; + if (concat_latent == NULL) { + concat_latent = empty_latent; + } + cond.c_concat = ref_latents[0]; + } + SDCondition img_cond; + if (uncond.c_crossattn != NULL && + 
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) { + img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat); } for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); @@ -1706,15 +2008,17 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); } + // Disable min_cfg + guidance.min_cfg = guidance.txt_cfg; + struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, noise, cond, uncond, + img_cond, image_hint, control_strength, - cfg_scale, - cfg_scale, guidance, eta, sample_method, @@ -1722,11 +2026,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, start_merge_step, id_cond, ref_latents, - skip_layers, - slg_scale, - skip_layer_start, - skip_layer_end, - noise_mask); + denoise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1739,7 +2039,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model->free_params_buffer(); } int64_t t3 = ggml_time_ms(); - LOG_INFO("generating %d latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000); + LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000); // Decode to image LOG_INFO("decoding %zu latents", final_latents.size()); @@ -1752,7 +2052,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, decoded_images.push_back(img); } int64_t t2 = ggml_time_ms(); - LOG_INFO("latent %d decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000); + LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000); } int64_t t4 = ggml_time_ms(); @@ -1777,65 +2077,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, return result_images; } -sd_image_t* txt2img(sd_ctx_t* sd_ctx, - const char* prompt_c_str, - const char* negative_prompt_c_str, - int clip_skip, - float cfg_scale, - float guidance, - float eta, 
- int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_ratio, - bool normalize_input, - const char* input_id_images_path_c_str, - sd_image_t* kontext_imgs, - int kontext_img_count, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - const std::vector photomaker_references = std::vector()) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); - LOG_DEBUG("txt2img %dx%d", width, height); - if (sd_ctx == NULL) { - return NULL; - } - - struct ggml_init_params params; - params.mem_size = static_cast(20 * 1024 * 1024); // 20 MB increased by kcpp - if (sd_version_is_sd3(sd_ctx->sd->version)) { - params.mem_size *= 2; //readjust by kcpp as above changed - } - if (sd_version_is_flux(sd_ctx->sd->version)) { - params.mem_size *= 3; //readjust by kcpp as above changed - } - if (sd_ctx->sd->stacked_id) { - params.mem_size += static_cast(15 * 1024 * 1024); // 10 MB - } - params.mem_size += width * height * 3 * sizeof(float); - params.mem_size *= batch_count; - params.mem_buffer = NULL; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); - - struct ggml_context* work_ctx = ggml_init(params); - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); - return NULL; - } - - size_t t0 = ggml_time_ms(); - - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); - +ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx, + ggml_context* work_ctx, + int width, + int height) { int C = 4; if (sd_version_is_sd3(sd_ctx->sd->version)) { C = 16; @@ -1852,110 +2097,40 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, } else { ggml_set_f32(init_latent, 0.f); } - - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask"); - } - 
std::vector kontext_latents = std::vector(); - if (kontext_imgs) { - for (int i = 0; i < kontext_img_count; i++) { - ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kontext_imgs[i].width, kontext_imgs[i].height, 3, 1); - sd_image_to_tensor(kontext_imgs[i].data, img); - - ggml_tensor* latent = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img); - latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - latent = sd_ctx->sd->encode_first_stage(work_ctx, img); - } - kontext_latents.push_back(latent); - } - } - - sd_image_t* result_images = generate_image(sd_ctx, - work_ctx, - init_latent, - prompt_c_str, - negative_prompt_c_str, - clip_skip, - cfg_scale, - guidance, - eta, - width, - height, - sample_method, - sigmas, - seed, - batch_count, - control_cond, - control_strength, - style_ratio, - normalize_input, - input_id_images_path_c_str, - kontext_latents, - skip_layers_vec, - slg_scale, - skip_layer_start, - skip_layer_end, - nullptr, - photomaker_references); - - size_t t1 = ggml_time_ms(); - - LOG_INFO("txt2img completed in %.2fs", (t1 - t0) * 1.0f / 1000); - - return result_images; + return init_latent; } -sd_image_t* img2img(sd_ctx_t* sd_ctx, - sd_image_t init_image, - sd_image_t mask, - const char* prompt_c_str, - const char* negative_prompt_c_str, - int clip_skip, - float cfg_scale, - float guidance, - float eta, - int width, - int height, - sample_method_t sample_method, - int sample_steps, - float strength, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_ratio, - bool normalize_input, - const char* input_id_images_path_c_str, - sd_image_t* kontext_imgs, - int kontext_img_count, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - const std::vector photomaker_references = std::vector()) { - std::vector 
skip_layers_vec(skip_layers, skip_layers + skip_layers_count); - LOG_DEBUG("img2img %dx%d", width, height); - if (sd_ctx == NULL) { +sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, const kcpp_img_gen_params_t* kcpp_img_gen_params) { + int width = sd_img_gen_params->width; + int height = sd_img_gen_params->height; + if (sd_version_is_dit(sd_ctx->sd->version)) { + if (width % 16 || height % 16) { + LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)", model_version_to_str[sd_ctx->sd->version], width, height); + return NULL; + } + } else if (width % 64 || height % 64) { + LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)", model_version_to_str[sd_ctx->sd->version], width, height); + return NULL; + } + LOG_DEBUG("generate_image %dx%d", width, height); + if (sd_ctx == NULL || sd_img_gen_params == NULL) { return NULL; } struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB + params.mem_size = static_cast(20 * 1024 * 1024); // 20 MB increased by kcpp if (sd_version_is_sd3(sd_ctx->sd->version)) { - params.mem_size *= 2; + params.mem_size *= 2; //readjust by kcpp as above changed } if (sd_version_is_flux(sd_ctx->sd->version)) { - params.mem_size *= 3; + params.mem_size *= 3; //readjust by kcpp as above changed } if (sd_ctx->sd->stacked_id) { - params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB + params.mem_size += static_cast(15 * 1024 * 1024); // 15 MB increased by kcpp } params.mem_size += width * height * 3 * sizeof(float) * 3; - params.mem_size *= batch_count; + params.mem_size += width * height * 3 * sizeof(float) * 3 * sd_img_gen_params->ref_images_count; + params.mem_size *= sd_img_gen_params->batch_count; params.mem_buffer = NULL; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); @@ -1966,175 +2141,198 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, return NULL; } - 
size_t t0 = ggml_time_ms(); - + int64_t seed = sd_img_gen_params->seed; if (seed < 0) { srand((int)time(NULL)); seed = rand(); } sd_ctx->sd->rng->manual_seed(seed); - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); + size_t t0 = ggml_time_ms(); - sd_mask_to_tensor(mask.data, mask_img); + ggml_tensor* init_latent = NULL; + ggml_tensor* concat_latent = NULL; + ggml_tensor* denoise_mask = NULL; + std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sd_img_gen_params->sample_steps); - sd_image_to_tensor(init_image.data, init_img); + if (sd_img_gen_params->init_image.data) { + LOG_INFO("IMG2IMG"); - ggml_tensor* masked_image; + size_t t_enc = static_cast(sd_img_gen_params->sample_steps * sd_img_gen_params->strength); + if (t_enc == sd_img_gen_params->sample_steps) + t_enc--; + LOG_INFO("target t_enc is %zu steps", t_enc); + std::vector sigma_sched; + sigma_sched.assign(sigmas.begin() + sd_img_gen_params->sample_steps - t_enc - 1, sigmas.end()); + sigmas = sigma_sched; - if (sd_version_is_inpaint(sd_ctx->sd->version)) { - int64_t mask_channels = 1; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - mask_channels = 8 * 8; // flatten the whole mask - } - ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_apply_mask(init_img, mask_img, masked_img); - ggml_tensor* masked_image_0 = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - masked_image_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - } - masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], mask_channels + masked_image_0->ne[2], 1); - for (int ix = 0; ix < masked_image_0->ne[0]; ix++) { - for (int iy = 0; iy < 
masked_image_0->ne[1]; iy++) { - int mx = ix * 8; - int my = iy * 8; - if (sd_ctx->sd->version == VERSION_FLUX_FILL) { - for (int k = 0; k < masked_image_0->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); - ggml_tensor_set_f32(masked_image, v, ix, iy, k); - } - // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image - for (int x = 0; x < 8; x++) { - for (int y = 0; y < 8; y++) { - float m = ggml_tensor_get_f32(mask_img, mx + x, my + y); - // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?) - // python code was using "b (h 8) (w 8) -> b (8 8) h w" - ggml_tensor_set_f32(masked_image, m, ix, iy, masked_image_0->ne[2] + x * 8 + y); + ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); + + sd_mask_to_tensor(sd_img_gen_params->mask_image.data, mask_img); + sd_image_to_tensor(sd_img_gen_params->init_image.data, init_img); + + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + int64_t mask_channels = 1; + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + mask_channels = 8 * 8; // flatten the whole mask + } + ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + sd_apply_mask(init_img, mask_img, masked_img); + ggml_tensor* masked_latent = NULL; + if (!sd_ctx->sd->use_tiny_autoencoder) { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + masked_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } else { + masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + } + concat_latent = ggml_new_tensor_4d(work_ctx, + GGML_TYPE_F32, + masked_latent->ne[0], + masked_latent->ne[1], + mask_channels + masked_latent->ne[2], + 1); + for (int ix = 0; ix < masked_latent->ne[0]; ix++) { + for (int iy = 0; iy < masked_latent->ne[1]; iy++) { + int mx = ix * 8; 
+ int my = iy * 8; + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + for (int k = 0; k < masked_latent->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_latent, ix, iy, k); + ggml_tensor_set_f32(concat_latent, v, ix, iy, k); + } + // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image + for (int x = 0; x < 8; x++) { + for (int y = 0; y < 8; y++) { + float m = ggml_tensor_get_f32(mask_img, mx + x, my + y); + // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?) + // python code was using "b (h 8) (w 8) -> b (8 8) h w" + ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y); + } + } + } else { + float m = ggml_tensor_get_f32(mask_img, mx, my); + ggml_tensor_set_f32(concat_latent, m, ix, iy, 0); + for (int k = 0; k < masked_latent->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_latent, ix, iy, k); + ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels); } - } - } else { - float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(masked_image, m, ix, iy, 0); - for (int k = 0; k < masked_image_0->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); - ggml_tensor_set_f32(masked_image, v, ix, iy, k + mask_channels); } } } } - } else { - // LOG_WARN("Inpainting with a base model is not great"); - masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); - for (int ix = 0; ix < masked_image->ne[0]; ix++) { - for (int iy = 0; iy < masked_image->ne[1]; iy++) { - int mx = ix * 8; - int my = iy * 8; - float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(masked_image, m, ix, iy); + + { + // LOG_WARN("Inpainting with a base model is not great"); + denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); + for (int ix = 0; ix < denoise_mask->ne[0]; ix++) { + for (int iy = 0; iy < denoise_mask->ne[1]; iy++) { + int mx = ix * 8; + int my = iy * 
8; + float m = ggml_tensor_get_f32(mask_img, mx, my); + ggml_tensor_set_f32(denoise_mask, m, ix, iy); + } } } - } - ggml_tensor* init_latent = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - } - std::vector kontext_latents = std::vector(); - if (kontext_imgs) { - for (int i = 0; i < kontext_img_count; i++) { - ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_image_to_tensor(kontext_imgs[i].data, img); - - ggml_tensor* latent = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img); - latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - latent = sd_ctx->sd->encode_first_stage(work_ctx, img); - } - kontext_latents.push_back(latent); + if (!sd_ctx->sd->use_tiny_autoencoder) { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } else { + init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } + } else { + LOG_INFO("TXT2IMG"); + if (sd_version_is_inpaint(sd_ctx->sd->version)) { + LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask"); + } + init_latent = generate_init_latent(sd_ctx, work_ctx, width, height); } - // print_ggml_tensor(init_latent, true); - size_t t1 = ggml_time_ms(); - LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + if (sd_img_gen_params->ref_images_count > 0) { + LOG_INFO("EDIT mode"); + } - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); - size_t t_enc = static_cast(sample_steps * strength); - if (t_enc == sample_steps) - t_enc--; - LOG_INFO("target t_enc is %zu steps", t_enc); - std::vector 
sigma_sched; - sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); + std::vector ref_latents; + for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) { + ggml_tensor* img = ggml_new_tensor_4d(work_ctx, + GGML_TYPE_F32, + sd_img_gen_params->ref_images[i].width, + sd_img_gen_params->ref_images[i].height, + 3, + 1); + sd_image_to_tensor(sd_img_gen_params->ref_images[i].data, img); - sd_image_t* result_images = generate_image(sd_ctx, - work_ctx, - init_latent, - prompt_c_str, - negative_prompt_c_str, - clip_skip, - cfg_scale, - guidance, - eta, - width, - height, - sample_method, - sigma_sched, - seed, - batch_count, - control_cond, - control_strength, - style_ratio, - normalize_input, - input_id_images_path_c_str, - kontext_latents, - skip_layers_vec, - slg_scale, - skip_layer_start, - skip_layer_end, - masked_image, - photomaker_references); + ggml_tensor* latent = NULL; + if (sd_ctx->sd->use_tiny_autoencoder) { + latent = sd_ctx->sd->encode_first_stage(work_ctx, img); + } else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) { + latent = sd_ctx->sd->encode_first_stage(work_ctx, img); + latent = ggml_view_3d(work_ctx, + latent, + latent->ne[0], + latent->ne[1], + latent->ne[2] / 2, + latent->nb[1], + latent->nb[2], + 0); + } else { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img); + latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } + ref_latents.push_back(latent); + } + + if (sd_img_gen_params->init_image.data != NULL || sd_img_gen_params->ref_images_count > 0) { + size_t t1 = ggml_time_ms(); + LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + } + + sd_image_t* result_images = generate_image_internal(sd_ctx, + work_ctx, + init_latent, + SAFE_STR(sd_img_gen_params->prompt), + SAFE_STR(sd_img_gen_params->negative_prompt), + sd_img_gen_params->clip_skip, + sd_img_gen_params->guidance, + sd_img_gen_params->eta, + width, + height, + sd_img_gen_params->sample_method, 
+ sigmas, + seed, + sd_img_gen_params->batch_count, + sd_img_gen_params->control_cond, + sd_img_gen_params->control_strength, + sd_img_gen_params->style_strength, + sd_img_gen_params->normalize_input, + sd_img_gen_params->input_id_images_path, + ref_latents, + concat_latent, + denoise_mask, + kcpp_img_gen_params); size_t t2 = ggml_time_ms(); - LOG_INFO("img2img completed in %.2fs", (t2 - t0) * 1.0f / 1000); + LOG_INFO("generate_image completed in %.2fs", (t2 - t0) * 1.0f / 1000); return result_images; } -SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, - sd_image_t init_image, - int width, - int height, - int video_frames, - int motion_bucket_id, - int fps, - float augmentation_level, - float min_cfg, - float cfg_scale, - enum sample_method_t sample_method, - int sample_steps, - float strength, - int64_t seed) { - if (sd_ctx == NULL) { +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) { + if (sd_ctx == NULL || sd_vid_gen_params == NULL) { return NULL; } + int width = sd_vid_gen_params->width; + int height = sd_vid_gen_params->height; LOG_INFO("img2vid %dx%d", width, height); - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); + std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sd_vid_gen_params->sample_steps); struct ggml_init_params params; params.mem_size = static_cast(10 * 1024) * 1024; // 10 MB - params.mem_size += width * height * 3 * sizeof(float) * video_frames; + params.mem_size += width * height * 3 * sizeof(float) * sd_vid_gen_params->video_frames; params.mem_buffer = NULL; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); @@ -2146,6 +2344,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, return NULL; } + int64_t seed = sd_vid_gen_params->seed; if (seed < 0) { seed = (int)time(NULL); } @@ -2155,12 +2354,12 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, int64_t t0 = ggml_time_ms(); SDCondition cond = sd_ctx->sd->get_svd_condition(work_ctx, - init_image, + 
sd_vid_gen_params->init_image, width, height, - fps, - motion_bucket_id, - augmentation_level); + sd_vid_gen_params->fps, + sd_vid_gen_params->motion_bucket_id, + sd_vid_gen_params->augmentation_level); auto uc_crossattn = ggml_dup_tensor(work_ctx, cond.c_crossattn); ggml_set_f32(uc_crossattn, 0.f); @@ -2173,7 +2372,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, SDCondition uncond = SDCondition(uc_crossattn, uc_vector, uc_concat); int64_t t1 = ggml_time_ms(); - LOG_INFO("get_learned_condition completed, taking %d ms", t1 - t0); + LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->clip_vision->free_params_buffer(); } @@ -2182,25 +2381,24 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, int C = 4; int W = width / 8; int H = height / 8; - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames); + struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames); ggml_set_f32(x_t, 0.f); - struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames); + struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); - LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); + LOG_INFO("sampling using %s method", sampling_methods_str[sd_vid_gen_params->sample_method]); struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, noise, cond, uncond, {}, + {}, 0.f, - min_cfg, - cfg_scale, + sd_vid_gen_params->guidance, 0.f, - 0.f, - sample_method, + sd_vid_gen_params->sample_method, sigmas, -1, SDCondition(NULL, NULL, NULL)); @@ -2220,13 +2418,13 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, return NULL; } - sd_image_t* result_images = (sd_image_t*)calloc(video_frames, sizeof(sd_image_t)); + sd_image_t* result_images = 
(sd_image_t*)calloc(sd_vid_gen_params->video_frames, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); return NULL; } - for (size_t i = 0; i < video_frames; i++) { + for (size_t i = 0; i < sd_vid_gen_params->video_frames; i++) { auto img_i = ggml_view_3d(work_ctx, img, img->ne[0], img->ne[1], img->ne[2], img->nb[1], img->nb[2], img->nb[3] * i); result_images[i].width = width; @@ -2242,131 +2440,3 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, return result_images; } - -sd_image_t* edit(sd_ctx_t* sd_ctx, - sd_image_t* ref_images, - int ref_images_count, - const char* prompt_c_str, - const char* negative_prompt_c_str, - int clip_skip, - float cfg_scale, - float guidance, - float eta, - int width, - int height, - sample_method_t sample_method, - int sample_steps, - float strength, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_ratio, - bool normalize_input, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); - LOG_DEBUG("edit %dx%d", width, height); - if (sd_ctx == NULL) { - return NULL; - } - if (ref_images_count <= 0) { - LOG_ERROR("ref images count should > 0"); - return NULL; - } - - struct ggml_init_params params; - params.mem_size = static_cast(30 * 1024 * 1024); // 10 MB - params.mem_size += width * height * 3 * sizeof(float) * 3 * ref_images_count; - params.mem_size *= batch_count; - params.mem_buffer = NULL; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); - - struct ggml_context* work_ctx = ggml_init(params); - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); - return NULL; - } - - if (seed < 0) { - srand((int)time(NULL)); - seed = rand(); - } - sd_ctx->sd->rng->manual_seed(seed); - - int C = 4; - if (sd_version_is_sd3(sd_ctx->sd->version)) { - C = 16; - } else if 
(sd_version_is_flux(sd_ctx->sd->version)) { - C = 16; - } - int W = width / 8; - int H = height / 8; - ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); - if (sd_version_is_sd3(sd_ctx->sd->version)) { - ggml_set_f32(init_latent, 0.0609f); - } else if (sd_version_is_flux(sd_ctx->sd->version)) { - ggml_set_f32(init_latent, 0.1159f); - } else { - ggml_set_f32(init_latent, 0.f); - } - - size_t t0 = ggml_time_ms(); - - std::vector ref_latents; - for (int i = 0; i < ref_images_count; i++) { - ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, ref_images[i].width, ref_images[i].height, 3, 1); - sd_image_to_tensor(ref_images[i].data, img); - - ggml_tensor* latent = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img); - latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - latent = sd_ctx->sd->encode_first_stage(work_ctx, img); - } - ref_latents.push_back(latent); - } - - size_t t1 = ggml_time_ms(); - LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); - - sd_image_t* result_images = generate_image(sd_ctx, - work_ctx, - init_latent, - prompt_c_str, - negative_prompt_c_str, - clip_skip, - cfg_scale, - guidance, - eta, - width, - height, - sample_method, - sigmas, - seed, - batch_count, - control_cond, - control_strength, - style_ratio, - normalize_input, - "", - ref_latents, - skip_layers_vec, - slg_scale, - skip_layer_start, - skip_layer_end, - NULL); - - size_t t2 = ggml_time_ms(); - - LOG_INFO("edit completed in %.2fs", (t2 - t0) * 1.0f / 1000); - - return result_images; -} \ No newline at end of file diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h index 9d4fcda6e..57603441a 100644 --- a/otherarch/sdcpp/stable-diffusion.h +++ b/otherarch/sdcpp/stable-diffusion.h @@ -30,7 +30,8 @@ extern "C" { enum 
rng_type_t { STD_DEFAULT_RNG, - CUDA_RNG + CUDA_RNG, + RNG_TYPE_COUNT }; enum sample_method_t { @@ -46,7 +47,7 @@ enum sample_method_t { LCM, DDIM_TRAILING, TCD, - N_SAMPLE_METHODS + SAMPLE_METHOD_COUNT }; enum schedule_t { @@ -56,15 +57,15 @@ enum schedule_t { EXPONENTIAL, AYS, GITS, - N_SCHEDULES + SCHEDULE_COUNT }; // same as enum ggml_type enum sd_type_t { - SD_TYPE_F32 = 0, - SD_TYPE_F16 = 1, - SD_TYPE_Q4_0 = 2, - SD_TYPE_Q4_1 = 3, + SD_TYPE_F32 = 0, + SD_TYPE_F16 = 1, + SD_TYPE_Q4_0 = 2, + SD_TYPE_Q4_1 = 3, // SD_TYPE_Q4_2 = 4, support has been removed // SD_TYPE_Q4_3 = 5, support has been removed SD_TYPE_Q5_0 = 6, @@ -92,19 +93,17 @@ enum sd_type_t { SD_TYPE_F64 = 28, SD_TYPE_IQ1_M = 29, SD_TYPE_BF16 = 30, - SD_TYPE_Q4_0_4_4 = 31, - SD_TYPE_Q4_0_4_8 = 32, - SD_TYPE_Q4_0_8_8 = 33, - SD_TYPE_TQ1_0 = 34, - SD_TYPE_TQ2_0 = 35, - SD_TYPE_IQ4_NL_4_4 = 36, + // SD_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files + // SD_TYPE_Q4_0_4_8 = 32, + // SD_TYPE_Q4_0_8_8 = 33, + SD_TYPE_TQ1_0 = 34, + SD_TYPE_TQ2_0 = 35, + // SD_TYPE_IQ4_NL_4_4 = 36, // SD_TYPE_IQ4_NL_4_8 = 37, // SD_TYPE_IQ4_NL_8_8 = 38, SD_TYPE_COUNT = 40, }; -SD_API const char* sd_type_name(enum sd_type_t type); - enum sd_log_level_t { SD_LOG_DEBUG, SD_LOG_INFO, @@ -112,6 +111,105 @@ enum sd_log_level_t { SD_LOG_ERROR }; +typedef struct { + const char* model_path; + const char* clip_l_path; + const char* clip_g_path; + const char* t5xxl_path; + const char* diffusion_model_path; + const char* vae_path; + const char* taesd_path; + const char* control_net_path; + const char* lora_model_dir; + const char* embedding_dir; + const char* stacked_id_embed_dir; + bool vae_decode_only; + bool vae_tiling; + bool free_params_immediately; + int n_threads; + enum sd_type_t wtype; + enum rng_type_t rng_type; + enum schedule_t schedule; + bool keep_clip_on_cpu; + bool keep_control_net_on_cpu; + bool keep_vae_on_cpu; + bool diffusion_flash_attn; + bool diffusion_conv_direct; + bool vae_conv_direct; + bool 
chroma_use_dit_mask; + bool chroma_use_t5_mask; + int chroma_t5_mask_pad; +} sd_ctx_params_t; + +typedef struct { + uint32_t width; + uint32_t height; + uint32_t channel; + uint8_t* data; +} sd_image_t; + +typedef struct { + int* layers; + size_t layer_count; + float layer_start; + float layer_end; + float scale; +} sd_slg_params_t; + +typedef struct { + float txt_cfg; + float img_cfg; + float min_cfg; + float distilled_guidance; + sd_slg_params_t slg; +} sd_guidance_params_t; + +typedef struct { + const char* prompt; + const char* negative_prompt; + int clip_skip; + sd_guidance_params_t guidance; + sd_image_t init_image; + sd_image_t* ref_images; + int ref_images_count; + sd_image_t mask_image; + int width; + int height; + enum sample_method_t sample_method; + int sample_steps; + float eta; + float strength; + int64_t seed; + int batch_count; + const sd_image_t* control_cond; + float control_strength; + float style_strength; + bool normalize_input; + const char* input_id_images_path; +} sd_img_gen_params_t; + +typedef struct { + sd_image_t* photomaker_references; + int photomaker_reference_count; +} kcpp_img_gen_params_t; + +typedef struct { + sd_image_t init_image; + int width; + int height; + sd_guidance_params_t guidance; + enum sample_method_t sample_method; + int sample_steps; + float strength; + int64_t seed; + int video_frames; + int motion_bucket_id; + int fps; + float augmentation_level; +} sd_vid_gen_params_t; + +typedef struct sd_ctx_t sd_ctx_t; + typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); @@ -120,154 +218,42 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); SD_API int32_t sd_get_num_physical_cores(); SD_API const char* sd_get_system_info(); -typedef struct { - uint32_t width; - uint32_t height; - uint32_t channel; - uint8_t* data; -} sd_image_t; +SD_API const char* sd_type_name(enum sd_type_t type); +SD_API 
enum sd_type_t str_to_sd_type(const char* str); +SD_API const char* sd_rng_type_name(enum rng_type_t rng_type); +SD_API enum rng_type_t str_to_rng_type(const char* str); +SD_API const char* sd_sample_method_name(enum sample_method_t sample_method); +SD_API enum sample_method_t str_to_sample_method(const char* str); +SD_API const char* sd_schedule_name(enum schedule_t schedule); +SD_API enum schedule_t str_to_schedule(const char* str); -typedef struct sd_ctx_t sd_ctx_t; - -SD_API void set_sd_vae_tiling(sd_ctx_t* ctx, bool tiling); -SD_API int get_loaded_sd_version(sd_ctx_t* ctx); -SD_API bool sd_loaded_chroma(); - -SD_API sd_ctx_t* new_sd_ctx(const char* model_path, - const char* clip_l_path, - const char* clip_g_path, - const char* t5xxl_path, - const char* diffusion_model_path, - const char* vae_path, - const char* taesd_path, - const char* control_net_path_c_str, - const char* lora_model_dir, - const char* embed_dir_c_str, - const char* stacked_id_embed_dir_c_str, - bool vae_decode_only, - bool vae_tiling, - bool free_params_immediately, - int n_threads, - enum sd_type_t wtype, - enum rng_type_t rng_type, - enum schedule_t s, - bool keep_clip_on_cpu, - bool keep_control_net_cpu, - bool keep_vae_on_cpu, - bool diffusion_flash_attn, - bool chroma_use_dit_mask, - bool chroma_use_t5_mask, - int chroma_t5_mask_pad); +SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); +SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); +SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, - const char* prompt, - const char* negative_prompt, - int clip_skip, - float cfg_scale, - float guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_strength, - bool normalize_input, - const char* 
input_id_images_path, - sd_image_t* kontext_imgs, - int kontext_img_count, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end, - const std::vector photomaker_references); +SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params); +SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params); +SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, const kcpp_img_gen_params_t* kcpp_img_gen_params); -SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, - sd_image_t init_image, - sd_image_t mask_image, - const char* prompt, - const char* negative_prompt, - int clip_skip, - float cfg_scale, - float guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - float strength, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_strength, - bool normalize_input, - const char* input_id_images_path, - sd_image_t* kontext_imgs, - int kontext_img_count, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end, - const std::vector photomaker_references); - -SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, - sd_image_t init_image, - int width, - int height, - int video_frames, - int motion_bucket_id, - int fps, - float augmentation_level, - float min_cfg, - float cfg_scale, - enum sample_method_t sample_method, - int sample_steps, - float strength, - int64_t seed); - -SD_API sd_image_t* edit(sd_ctx_t* sd_ctx, - sd_image_t* ref_images, - int ref_images_count, - const char* prompt, - const char* negative_prompt, - int clip_skip, - float cfg_scale, - float guidance, - float eta, - int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - float strength, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float 
style_strength, - bool normalize_input, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end); +SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params); +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params); // broken typedef struct upscaler_ctx_t upscaler_ctx_t; SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, - int n_threads); + int n_threads, + bool direct); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type); +SD_API bool convert(const char* input_path, + const char* vae_path, + const char* output_path, + enum sd_type_t output_type, + const char* tensor_type_rules); SD_API uint8_t* preprocess_canny(uint8_t* img, int width, @@ -282,4 +268,4 @@ SD_API uint8_t* preprocess_canny(uint8_t* img, } #endif -#endif // __STABLE_DIFFUSION_H__ \ No newline at end of file +#endif // __STABLE_DIFFUSION_H__ diff --git a/otherarch/sdcpp/t5.hpp b/otherarch/sdcpp/t5.hpp index 1861ad478..253b3fbcd 100644 --- a/otherarch/sdcpp/t5.hpp +++ b/otherarch/sdcpp/t5.hpp @@ -457,8 +457,8 @@ protected: int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? 
tensor_types[prefix + "weight"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -735,7 +735,7 @@ struct T5Runner : public GGMLRunner { std::vector relative_position_bucket_vec; T5Runner(ggml_backend_t backend, - std::map& tensor_types, + const String2GGMLType& tensor_types, const std::string prefix, int64_t num_layers = 24, int64_t model_dim = 4096, @@ -876,16 +876,14 @@ struct T5Embedder { T5UniGramTokenizer tokenizer; T5Runner model; - static std::map empty_tensor_types; - T5Embedder(ggml_backend_t backend, - std::map& tensor_types = empty_tensor_types, - const std::string prefix = "", - int64_t num_layers = 24, - int64_t model_dim = 4096, - int64_t ff_dim = 10240, - int64_t num_heads = 64, - int64_t vocab_size = 32128) + const String2GGMLType& tensor_types = {}, + const std::string prefix = "", + int64_t num_layers = 24, + int64_t model_dim = 4096, + int64_t ff_dim = 10240, + int64_t num_heads = 64, + int64_t vocab_size = 32128) : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) { } diff --git a/otherarch/sdcpp/tae.hpp b/otherarch/sdcpp/tae.hpp index 4c822eaf9..4959bbd08 100644 --- a/otherarch/sdcpp/tae.hpp +++ b/otherarch/sdcpp/tae.hpp @@ -149,7 +149,7 @@ public: if (i == 1) { h = ggml_relu_inplace(ctx, h); } else { - h = ggml_upscale(ctx, h, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); + h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST); } continue; } @@ -196,7 +196,7 @@ struct TinyAutoEncoder : public GGMLRunner { bool decode_only = false; TinyAutoEncoder(ggml_backend_t backend, - std::map& tensor_types, + const String2GGMLType& tensor_types, const std::string prefix, bool decoder_only = true, SDVersion version = VERSION_SD1) @@ -206,6 +206,17 @@ struct TinyAutoEncoder : public GGMLRunner { taesd.init(params_ctx, 
tensor_types, prefix); } + void enable_conv2d_direct() { + std::vector blocks; + taesd.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string get_desc() { return "taesd"; } diff --git a/otherarch/sdcpp/unet.hpp b/otherarch/sdcpp/unet.hpp index d52d71285..696bc6dfa 100644 --- a/otherarch/sdcpp/unet.hpp +++ b/otherarch/sdcpp/unet.hpp @@ -166,7 +166,6 @@ public: // ldm.modules.diffusionmodules.openaimodel.UNetModel class UnetModelBlock : public GGMLBlock { protected: - static std::map empty_tensor_types; SDVersion version = VERSION_SD1; // network hparams int in_channels = 4; @@ -184,7 +183,7 @@ public: int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL/SVD - UnetModelBlock(SDVersion version = VERSION_SD1, std::map& tensor_types = empty_tensor_types, bool flash_attn = false) + UnetModelBlock(SDVersion version = VERSION_SD1, const String2GGMLType& tensor_types = {}, bool flash_attn = false) : version(version) { if (sd_version_is_sd2(version)) { context_dim = 1024; @@ -207,6 +206,8 @@ public: } if (sd_version_is_inpaint(version)) { in_channels = 9; + } else if (sd_version_is_unet_edit(version)) { + in_channels = 8; } // dims is always 2 @@ -537,7 +538,7 @@ struct UNetModelRunner : public GGMLRunner { UnetModelBlock unet; UNetModelRunner(ggml_backend_t backend, - std::map& tensor_types, + const String2GGMLType& tensor_types, const std::string prefix, SDVersion version = VERSION_SD1, bool flash_attn = false) @@ -545,6 +546,18 @@ struct UNetModelRunner : public GGMLRunner { unet.init(params_ctx, tensor_types, prefix); } + void enable_conv2d_direct() { + std::vector blocks; + unet.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + LOG_DEBUG("block %s", block->get_desc().c_str()); + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string 
get_desc() { return "unet"; } @@ -657,4 +670,4 @@ struct UNetModelRunner : public GGMLRunner { } }; -#endif // __UNET_HPP__ \ No newline at end of file +#endif // __UNET_HPP__ diff --git a/otherarch/sdcpp/upscaler.cpp b/otherarch/sdcpp/upscaler.cpp index 8907e69c3..599f263f9 100644 --- a/otherarch/sdcpp/upscaler.cpp +++ b/otherarch/sdcpp/upscaler.cpp @@ -9,9 +9,12 @@ struct UpscalerGGML { std::shared_ptr esrgan_upscaler; std::string esrgan_path; int n_threads; + bool direct = false; - UpscalerGGML(int n_threads) - : n_threads(n_threads) { + UpscalerGGML(int n_threads, + bool direct = false) + : n_threads(n_threads), + direct(direct) { } bool load_from_file(const std::string& esrgan_path) { @@ -21,12 +24,17 @@ struct UpscalerGGML { #endif #ifdef SD_USE_METAL LOG_DEBUG("Using Metal backend"); + ggml_log_set(ggml_log_callback_default, nullptr); backend = ggml_backend_metal_init(); #endif #ifdef SD_USE_VULKAN LOG_DEBUG("Using Vulkan backend"); backend = ggml_backend_vk_init(0); #endif +#ifdef SD_USE_OPENCL + LOG_DEBUG("Using OpenCL backend"); + backend = ggml_backend_opencl_init(); +#endif #ifdef SD_USE_SYCL LOG_DEBUG("Using SYCL backend"); backend = ggml_backend_sycl_init(0); @@ -42,6 +50,9 @@ struct UpscalerGGML { } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); esrgan_upscaler = std::make_shared(backend, model_loader.tensor_storages_types); + if (direct) { + esrgan_upscaler->enable_conv2d_direct(); + } if (!esrgan_upscaler->load_from_file(esrgan_path)) { return false; } @@ -99,14 +110,15 @@ struct upscaler_ctx_t { }; upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, - int n_threads) { + int n_threads, + bool direct = false) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == NULL) { return NULL; } std::string esrgan_path(esrgan_path_c_str); - upscaler_ctx->upscaler = new UpscalerGGML(n_threads); + upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct); if 
(upscaler_ctx->upscaler == NULL) { return NULL; } diff --git a/otherarch/sdcpp/util.cpp b/otherarch/sdcpp/util.cpp index d5cd8e229..829099487 100644 --- a/otherarch/sdcpp/util.cpp +++ b/otherarch/sdcpp/util.cpp @@ -102,19 +102,32 @@ std::vector get_files_from_dir(const std::string& dir) { sprintf(directoryPath, "%s\\%s\\*", currentDirectory, dir.c_str()); // Find the first file in the directory - hFind = FindFirstFile(directoryPath, &findFileData); - + hFind = FindFirstFile(directoryPath, &findFileData); + bool isAbsolutePath = false; // Check if the directory was found if (hFind == INVALID_HANDLE_VALUE) { - printf("Unable to find directory.\n"); - return files; + printf("Unable to find directory. Try with original path \n"); + + char directoryPathAbsolute[MAX_PATH]; + sprintf(directoryPathAbsolute, "%s*", dir.c_str()); + + hFind = FindFirstFile(directoryPathAbsolute, &findFileData); + isAbsolutePath = true; + if (hFind == INVALID_HANDLE_VALUE) { + printf("Absolute path was also wrong.\n"); + return files; + } } // Loop through all files in the directory do { // Check if the found file is a regular file (not a directory) if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) { - files.push_back(std::string(currentDirectory) + "\\" + dir + "\\" + std::string(findFileData.cFileName)); + if (isAbsolutePath) { + files.push_back(dir + "\\" + std::string(findFileData.cFileName)); + } else { + files.push_back(std::string(currentDirectory) + "\\" + dir + "\\" + std::string(findFileData.cFileName)); + } } } while (FindNextFile(hFind, &findFileData) != 0); @@ -447,10 +460,6 @@ const char* sd_get_system_info() { return buffer; } -const char* sd_type_name(enum sd_type_t type) { - return ggml_type_name((ggml_type)type); -} - sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) { sd_image_f32_t converted_image; converted_image.width = image.width; diff --git a/otherarch/sdcpp/util.h b/otherarch/sdcpp/util.h index 607ea1bb1..7f0cb337d 100644 --- 
a/otherarch/sdcpp/util.h +++ b/otherarch/sdcpp/util.h @@ -7,6 +7,9 @@ #include "stable-diffusion.h" +#define SAFE_STR(s) ((s) ? (s) : "") +#define BOOL_STR(b) ((b) ? "true" : "false") + bool ends_with(const std::string& str, const std::string& ending); bool starts_with(const std::string& str, const std::string& start); bool contains(const std::string& str, const std::string& substr); diff --git a/otherarch/sdcpp/vae.hpp b/otherarch/sdcpp/vae.hpp index 4add881f6..bdf160bb8 100644 --- a/otherarch/sdcpp/vae.hpp +++ b/otherarch/sdcpp/vae.hpp @@ -163,8 +163,8 @@ public: class VideoResnetBlock : public ResnetBlock { protected: - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type wtype = (tensor_types.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32; + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_types, GGML_TYPE_F32); params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); } @@ -525,7 +525,7 @@ struct AutoEncoderKL : public GGMLRunner { AutoencodingEngine ae; AutoEncoderKL(ggml_backend_t backend, - std::map& tensor_types, + const String2GGMLType& tensor_types, const std::string prefix, bool decode_only = false, bool use_video_decoder = false, @@ -534,6 +534,17 @@ struct AutoEncoderKL : public GGMLRunner { ae.init(params_ctx, tensor_types, prefix); } + void enable_conv2d_direct() { + std::vector blocks; + ae.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string get_desc() { return "vae"; }