From 5de7ed3d5667c377b70d5b327ab2405b386611c2 Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@users.noreply.github.com>
Date: Tue, 12 Aug 2025 12:25:02 -0300
Subject: [PATCH] WIP: update stable-diffusion.cpp to 5900ef6605c6 (new API)
 (#1669)

* Update stable-diffusion.cpp to 5900ef6605c6 (new API)

* Clean up pending LoRA code and simplify LoRA changes to upstream

* Move VAE tiling disabling for TAESD to sdtype_adapter.cpp

* Move auxiliary ctx functions to sdtype_adapter.cpp

* Use ref_images parameter for Kontext images

* Drop clip skip workaround (fixed upstream)

* Workaround for flash attention with img2img

leejet/stable-diffusion.cpp#756

* Workaround for Chroma with flash attention, debug prints

* Disable forcing CLIP weights to F32 for reduced memory usage
---
 otherarch/sdcpp/clip.hpp             |   44 +-
 otherarch/sdcpp/common.hpp           |   14 +-
 otherarch/sdcpp/conditioner.hpp      |   73 +-
 otherarch/sdcpp/control.hpp          |   15 +-
 otherarch/sdcpp/denoiser.hpp         |  133 ++-
 otherarch/sdcpp/diffusion_model.hpp  |   26 +-
 otherarch/sdcpp/esrgan.hpp           |   17 +-
 otherarch/sdcpp/flux.hpp             |   55 +-
 otherarch/sdcpp/ggml_extend.hpp      |  147 ++-
 otherarch/sdcpp/gits_noise.inl       |    2 +-
 otherarch/sdcpp/lora.hpp             |    8 +-
 otherarch/sdcpp/main.cpp             | 1159 +++++++++-----------
 otherarch/sdcpp/mmdit.hpp            |   16 +-
 otherarch/sdcpp/model.cpp            |  233 +++-
 otherarch/sdcpp/model.h              |   33 +-
 otherarch/sdcpp/pmid.hpp             |    7 +-
 otherarch/sdcpp/sdtype_adapter.cpp   |  222 ++--
 otherarch/sdcpp/stable-diffusion.cpp | 1508 ++++++++++++++------------
 otherarch/sdcpp/stable-diffusion.h   |  288 +++--
 otherarch/sdcpp/t5.hpp               |   22 +-
 otherarch/sdcpp/tae.hpp              |   15 +-
 otherarch/sdcpp/unet.hpp             |   21 +-
 otherarch/sdcpp/upscaler.cpp         |   20 +-
 otherarch/sdcpp/util.cpp             |   27 +-
 otherarch/sdcpp/util.h               |    3 +
 otherarch/sdcpp/vae.hpp              |   17 +-
 26 files changed, 2255 insertions(+), 1870 deletions(-)

diff --git a/otherarch/sdcpp/clip.hpp b/otherarch/sdcpp/clip.hpp
index cdc27e727..0696220f9 100644
--- a/otherarch/sdcpp/clip.hpp
+++ b/otherarch/sdcpp/clip.hpp
@@ -545,9 +545,15 @@ protected:
     int64_t vocab_size;
     int64_t num_positions;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type token_wtype    = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
-        enum ggml_type position_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type token_wtype    = GGML_TYPE_F32;
+        #if 1
+        // kcpp reduce memory usage (reverts https://github.com/leejet/stable-diffusion.cpp/pull/601)
+        auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+        if (tensor_type != tensor_types.end())
+            token_wtype = tensor_type->second;
+        #endif
+        enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
         params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
@@ -594,10 +600,10 @@ protected:
     int64_t image_size;
     int64_t num_patches;
     int64_t num_positions;
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type patch_wtype    = GGML_TYPE_F16;  // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
-        enum ggml_type class_wtype    = GGML_TYPE_F32;  // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
-        enum ggml_type position_wtype = GGML_TYPE_F32;  // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type patch_wtype    = GGML_TYPE_F16;
+        enum ggml_type class_wtype    = GGML_TYPE_F32;
+        enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
         params["class_embedding"]           = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
@@ -657,9 +663,9 @@ enum CLIPVersion {
 
 class CLIPTextModel : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         if (version == OPEN_CLIP_VIT_BIGG_14) {
-            enum ggml_type wtype      = GGML_TYPE_F32;  // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
+            enum ggml_type wtype      = GGML_TYPE_F32;
             params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
         }
     }
@@ -678,8 +684,8 @@ public:
     bool with_final_ln        = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  int clip_skip_value = -1,
-                  bool with_final_ln  = true)
+                  bool with_final_ln  = true,
+                  int clip_skip_value = -1)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size       = 1024;
@@ -701,7 +707,7 @@ public:
 
     void set_clip_skip(int skip) {
         if (skip <= 0) {
-            return;
+            skip = -1;
         }
         clip_skip = skip;
     }
@@ -805,8 +811,8 @@ protected:
     int64_t out_features;
     bool transpose_weight;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
         if (transpose_weight) {
             params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
         } else {
@@ -868,12 +874,12 @@ struct CLIPTextModelRunner : public GGMLRunner {
     CLIPTextModel model;
 
     CLIPTextModelRunner(ggml_backend_t backend,
-                        std::map<std::string, enum ggml_type>& tensor_types,
+                        const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        int clip_skip_value = 1,
-                        bool with_final_ln  = true)
-        : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
+                        bool with_final_ln  = true,
+                        int clip_skip_value = -1)
+        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
@@ -949,4 +955,4 @@ struct CLIPTextModelRunner : public GGMLRunner {
     }
 };
 
-#endif  // __CLIP_HPP__
\ No newline at end of file
+#endif  // __CLIP_HPP__
diff --git a/otherarch/sdcpp/common.hpp b/otherarch/sdcpp/common.hpp
index 32250d763..3a1307767 100644
--- a/otherarch/sdcpp/common.hpp
+++ b/otherarch/sdcpp/common.hpp
@@ -56,8 +56,8 @@ public:
         // x: [N, channels, h, w]
         auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
 
-        x = ggml_upscale(ctx, x, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
-        x = conv->forward(ctx, x);    // [N, out_channels, h*2, w*2]
+        x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
+        x = conv->forward(ctx, x);                             // [N, out_channels, h*2, w*2]
         return x;
     }
 };
@@ -182,9 +182,9 @@ protected:
     int64_t dim_in;
     int64_t dim_out;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
-        enum ggml_type wtype      = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32;
-        enum ggml_type bias_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
+        enum ggml_type wtype      = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);
+        enum ggml_type bias_wtype = GGML_TYPE_F32;
         params["proj.weight"]     = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
         params["proj.bias"]       = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
     }
@@ -440,9 +440,9 @@ public:
 
 class AlphaBlender : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
         // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.ypes.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
+        enum ggml_type wtype = GGML_TYPE_F32;
         params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
     }
 
diff --git a/otherarch/sdcpp/conditioner.hpp b/otherarch/sdcpp/conditioner.hpp
index 4005fadf7..6a51dce81 100644
--- a/otherarch/sdcpp/conditioner.hpp
+++ b/otherarch/sdcpp/conditioner.hpp
@@ -57,29 +57,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::vector<std::string> readed_embeddings;
 
     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
-                                      std::map<std::string, enum ggml_type>& tensor_types,
+                                      const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv      = PM_VERSION_1,
                                       int clip_skip     = -1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        if (sd_version_is_sd1(version)) {
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+        } else if (sd_version_is_sd2(version)) {
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+        } else if (sd_version_is_sdxl(version)) {
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+        }
+        set_clip_skip(clip_skip);
+    }
+
+    void set_clip_skip(int clip_skip) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
                 clip_skip = 2;
             }
         }
-        if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
-        } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
-        } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        }
-    }
-
-    void set_clip_skip(int clip_skip) {
         text_model->set_clip_skip(clip_skip);
         if (sd_version_is_sdxl(version)) {
             text_model2->set_clip_skip(clip_skip);
@@ -458,8 +459,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                 if (sd_version_is_sdxl(version)) {
                     text_model2->compute(n_threads,
                                          input_ids2,
-                                         0,
-                                         NULL,
+                                         num_custom_embeddings,
+                                         token_embed_custom.data(),
                                          max_token_idx,
                                          false,
                                          &chunk_hidden_states2, work_ctx);
@@ -469,8 +470,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                     if (chunk_idx == 0) {
                         text_model2->compute(n_threads,
                                              input_ids2,
-                                             0,
-                                             NULL,
+                                             num_custom_embeddings,
+                                             token_embed_custom.data(),
                                              max_token_idx,
                                              true,
                                              &pooled,
@@ -617,7 +618,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
     CLIPVisionModelProjection vision_model;
 
-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
         : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
         vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
     }
@@ -662,18 +663,19 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
 
     SD3CLIPEmbedder(ggml_backend_t backend,
-                    std::map<std::string, enum ggml_type>& tensor_types,
-                    int clip_skip = -1)
+                    const String2GGMLType& tensor_types = {},
+                    int clip_skip                       = -1)
         : clip_g_tokenizer(0) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        set_clip_skip(clip_skip);
     }
 
     void set_clip_skip(int clip_skip) {
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         clip_l->set_clip_skip(clip_skip);
         clip_g->set_clip_skip(clip_skip);
     }
@@ -1008,16 +1010,17 @@ struct FluxCLIPEmbedder : public Conditioner {
     size_t chunk_len = 256;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
-                     std::map<std::string, enum ggml_type>& tensor_types,
-                     int clip_skip = -1) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
+                     const String2GGMLType& tensor_types = {},
+                     int clip_skip                       = -1) {
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        set_clip_skip(clip_skip);
     }
 
     void set_clip_skip(int clip_skip) {
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         clip_l->set_clip_skip(clip_skip);
     }
 
@@ -1228,10 +1231,10 @@ struct PixArtCLIPEmbedder : public Conditioner {
     int mask_pad     = 1;
 
     PixArtCLIPEmbedder(ggml_backend_t backend,
-                       std::map<std::string, enum ggml_type>& tensor_types,
-                       int clip_skip = -1,
-                       bool use_mask = false,
-                       int mask_pad  = 1)
+                       const String2GGMLType& tensor_types = {},
+                       int clip_skip                       = -1,
+                       bool use_mask                       = false,
+                       int mask_pad                        = 1)
         : use_mask(use_mask), mask_pad(mask_pad) {
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }
@@ -1422,4 +1425,4 @@ struct PixArtCLIPEmbedder : public Conditioner {
     }
 };
 
-#endif
\ No newline at end of file
+#endif
diff --git a/otherarch/sdcpp/control.hpp b/otherarch/sdcpp/control.hpp
index 23b75feff..63fe70455 100644
--- a/otherarch/sdcpp/control.hpp
+++ b/otherarch/sdcpp/control.hpp
@@ -317,12 +317,23 @@ struct ControlNet : public GGMLRunner {
     bool guided_hint_cached         = false;
 
     ControlNet(ggml_backend_t backend,
-               std::map<std::string, enum ggml_type>& tensor_types,
-               SDVersion version = VERSION_SD1)
+               const String2GGMLType& tensor_types = {},
+               SDVersion version                   = VERSION_SD1)
         : GGMLRunner(backend), control_net(version) {
         control_net.init(params_ctx, tensor_types, "");
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        control_net.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     ~ControlNet() {
         free_control_ctx();
     }
diff --git a/otherarch/sdcpp/denoiser.hpp b/otherarch/sdcpp/denoiser.hpp
index 307423f40..d4bcec590 100644
--- a/otherarch/sdcpp/denoiser.hpp
+++ b/otherarch/sdcpp/denoiser.hpp
@@ -168,24 +168,21 @@ struct AYSSchedule : SigmaSchedule {
         std::vector<float> inputs;
         std::vector<float> results(n + 1);
 
-        switch (version) {
-            case VERSION_SD2: /* fallthrough */
-                LOG_WARN("AYS not designed for SD2.X models");
-            case VERSION_SD1:
-                LOG_INFO("AYS using SD1.5 noise levels");
-                inputs = noise_levels[0];
-                break;
-            case VERSION_SDXL:
-                LOG_INFO("AYS using SDXL noise levels");
-                inputs = noise_levels[1];
-                break;
-            case VERSION_SVD:
-                LOG_INFO("AYS using SVD noise levels");
-                inputs = noise_levels[2];
-                break;
-            default:
-                LOG_ERROR("Version not compatable with AYS scheduler");
-                return results;
+        if (sd_version_is_sd2((SDVersion)version)) {
+            LOG_WARN("AYS not designed for SD2.X models");
+        } /* fallthrough */
+        else if (sd_version_is_sd1((SDVersion)version)) {
+            LOG_INFO("AYS using SD1.5 noise levels");
+            inputs = noise_levels[0];
+        } else if (sd_version_is_sdxl((SDVersion)version)) {
+            LOG_INFO("AYS using SDXL noise levels");
+            inputs = noise_levels[1];
+        } else if (version == VERSION_SVD) {
+            LOG_INFO("AYS using SVD noise levels");
+            inputs = noise_levels[2];
+        } else {
+            LOG_ERROR("Version not compatible with AYS scheduler");
+            return results;
         }
 
         /* Stretches those pre-calculated reference levels out to the desired
@@ -346,6 +343,32 @@ struct CompVisVDenoiser : public CompVisDenoiser {
     }
 };
 
+struct EDMVDenoiser : public CompVisVDenoiser {
+    float min_sigma = 0.002;
+    float max_sigma = 120.0;
+
+    EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0)
+        : min_sigma(min_sigma), max_sigma(max_sigma) {
+        schedule = std::make_shared<ExponentialSchedule>();
+    }
+
+    float t_to_sigma(float t) {
+        return std::exp(t * 4 / (float)TIMESTEPS);
+    }
+
+    float sigma_to_t(float s) {
+        return 0.25 * std::log(s);
+    }
+
+    float sigma_min() {
+        return min_sigma;
+    }
+
+    float sigma_max() {
+        return max_sigma;
+    }
+};
+
 float time_snr_shift(float alpha, float t) {
     if (alpha == 1.0f) {
         return t;
@@ -1019,7 +1042,7 @@ static void sample_k_diffusion(sample_method_t method,
             // also needed to invert the behavior of CompVisDenoiser
             // (k-diffusion's LMSDiscreteScheduler)
             float beta_start = 0.00085f;
-            float beta_end = 0.0120f;
+            float beta_end   = 0.0120f;
             std::vector<double> alphas_cumprod;
             std::vector<double> compvis_sigmas;
 
@@ -1030,8 +1053,9 @@ static void sample_k_diffusion(sample_method_t method,
                     (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
                     (1.0f -
                      std::pow(sqrtf(beta_start) +
-                              (sqrtf(beta_end) - sqrtf(beta_start)) *
-                              ((float)i / (TIMESTEPS - 1)), 2));
+                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
+                                      ((float)i / (TIMESTEPS - 1)),
+                              2));
                 compvis_sigmas[i] =
                     std::sqrt((1 - alphas_cumprod[i]) /
                               alphas_cumprod[i]);
@@ -1061,7 +1085,8 @@ static void sample_k_diffusion(sample_method_t method,
                 // - pred_prev_sample -> "x_t-1"
                 int timestep =
                     roundf(TIMESTEPS -
-                           i * ((float)TIMESTEPS / steps)) - 1;
+                           i * ((float)TIMESTEPS / steps)) -
+                    1;
                 // 1. get previous step value (=t-1)
                 int prev_timestep = timestep - TIMESTEPS / steps;
                 // The sigma here is chosen to cause the
@@ -1086,10 +1111,9 @@ static void sample_k_diffusion(sample_method_t method,
                     float* vec_x = (float*)x->data;
                     for (int j = 0; j < ggml_nelements(x); j++) {
                         vec_x[j] *= std::sqrt(sigma * sigma + 1) /
-                            sigma;
+                                    sigma;
                     }
-                }
-                else {
+                } else {
                     // For the subsequent steps after the first one,
                     // at this point x = latents or x = sample, and
                     // needs to be prescaled with x <- sample / c_in
@@ -1127,9 +1151,8 @@ static void sample_k_diffusion(sample_method_t method,
                 float alpha_prod_t = alphas_cumprod[timestep];
                 // Note final_alpha_cumprod = alphas_cumprod[0] due to
                 // trailing timestep spacing
-                float alpha_prod_t_prev = prev_timestep >= 0 ?
-                    alphas_cumprod[prev_timestep] : alphas_cumprod[0];
-                float beta_prod_t = 1 - alpha_prod_t;
+                float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
+                float beta_prod_t       = 1 - alpha_prod_t;
                 // 3. compute predicted original sample from predicted
                 // noise also called "predicted x_0" of formula (12)
                 // from https://arxiv.org/pdf/2010.02502.pdf
@@ -1145,7 +1168,7 @@ static void sample_k_diffusion(sample_method_t method,
                         vec_pred_original_sample[j] =
                             (vec_x[j] / std::sqrt(sigma * sigma + 1) -
                              std::sqrt(beta_prod_t) *
-                             vec_model_output[j]) *
+                                 vec_model_output[j]) *
                             (1 / std::sqrt(alpha_prod_t));
                     }
                 }
@@ -1159,8 +1182,8 @@ static void sample_k_diffusion(sample_method_t method,
                 // sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) *
                 // sqrt(1 - alpha_t/alpha_t-1)
                 float beta_prod_t_prev = 1 - alpha_prod_t_prev;
-                float variance = (beta_prod_t_prev / beta_prod_t) *
-                    (1 - alpha_prod_t / alpha_prod_t_prev);
+                float variance         = (beta_prod_t_prev / beta_prod_t) *
+                                 (1 - alpha_prod_t / alpha_prod_t_prev);
                 float std_dev_t = eta * std::sqrt(variance);
                 // 6. compute "direction pointing to x_t" of formula
                 // (12) from https://arxiv.org/pdf/2010.02502.pdf
@@ -1179,8 +1202,8 @@ static void sample_k_diffusion(sample_method_t method,
                                       std::pow(std_dev_t, 2)) *
                             vec_model_output[j];
                         vec_x[j] = std::sqrt(alpha_prod_t_prev) *
-                            vec_pred_original_sample[j] +
-                            pred_sample_direction;
+                                       vec_pred_original_sample[j] +
+                                   pred_sample_direction;
                     }
                 }
                 if (eta > 0) {
@@ -1208,7 +1231,7 @@ static void sample_k_diffusion(sample_method_t method,
             // by Semi-Linear Consistency Function with Trajectory
             // Mapping", arXiv:2402.19159 [cs.CV]
             float beta_start = 0.00085f;
-            float beta_end = 0.0120f;
+            float beta_end   = 0.0120f;
             std::vector<double> alphas_cumprod;
             std::vector<double> compvis_sigmas;
 
@@ -1219,8 +1242,9 @@ static void sample_k_diffusion(sample_method_t method,
                     (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
                     (1.0f -
                      std::pow(sqrtf(beta_start) +
-                              (sqrtf(beta_end) - sqrtf(beta_start)) *
-                              ((float)i / (TIMESTEPS - 1)), 2));
+                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
+                                      ((float)i / (TIMESTEPS - 1)),
+                              2));
                 compvis_sigmas[i] =
                     std::sqrt((1 - alphas_cumprod[i]) /
                               alphas_cumprod[i]);
@@ -1235,13 +1259,10 @@ static void sample_k_diffusion(sample_method_t method,
             for (int i = 0; i < steps; i++) {
                 // Analytic form for TCD timesteps
                 int timestep = TIMESTEPS - 1 -
-                    (TIMESTEPS / original_steps) *
-                    (int)floor(i * ((float)original_steps / steps));
+                               (TIMESTEPS / original_steps) *
+                                   (int)floor(i * ((float)original_steps / steps));
                 // 1. get previous step value
-                int prev_timestep = i >= steps - 1 ? 0 :
-                    TIMESTEPS - 1 - (TIMESTEPS / original_steps) *
-                    (int)floor((i + 1) *
-                               ((float)original_steps / steps));
+                int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps));
                 // Here timestep_s is tau_n' in Algorithm 4. The _s
                 // notation appears to be that from C. Lu,
                 // "DPM-Solver: A Fast ODE Solver for Diffusion
@@ -1258,10 +1279,9 @@ static void sample_k_diffusion(sample_method_t method,
                     float* vec_x = (float*)x->data;
                     for (int j = 0; j < ggml_nelements(x); j++) {
                         vec_x[j] *= std::sqrt(sigma * sigma + 1) /
-                            sigma;
+                                    sigma;
                     }
-                }
-                else {
+                } else {
                     float* vec_x = (float*)x->data;
                     for (int j = 0; j < ggml_nelements(x); j++) {
                         vec_x[j] *= std::sqrt(sigma * sigma + 1);
@@ -1294,15 +1314,14 @@ static void sample_k_diffusion(sample_method_t method,
                 // DPM-Solver. In fact, we have alpha_{t_n} =
                 // \sqrt{\hat{alpha_n}}, [...]"
                 float alpha_prod_t = alphas_cumprod[timestep];
-                float beta_prod_t = 1 - alpha_prod_t;
+                float beta_prod_t  = 1 - alpha_prod_t;
                 // Note final_alpha_cumprod = alphas_cumprod[0] since
                 // TCD is always "trailing"
-                float alpha_prod_t_prev = prev_timestep >= 0 ?
-                    alphas_cumprod[prev_timestep] : alphas_cumprod[0];
+                float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
                 // The subscript _s are the only portion in this
                 // section (2) unique to TCD
                 float alpha_prod_s = alphas_cumprod[timestep_s];
-                float beta_prod_s = 1 - alpha_prod_s;
+                float beta_prod_s  = 1 - alpha_prod_s;
                 // 3. Compute the predicted noised sample x_s based on
                 // the model parameterization
                 //
@@ -1317,7 +1336,7 @@ static void sample_k_diffusion(sample_method_t method,
                         vec_pred_original_sample[j] =
                             (vec_x[j] / std::sqrt(sigma * sigma + 1) -
                              std::sqrt(beta_prod_t) *
-                             vec_model_output[j]) *
+                                 vec_model_output[j]) *
                             (1 / std::sqrt(alpha_prod_t));
                     }
                 }
@@ -1339,9 +1358,9 @@ static void sample_k_diffusion(sample_method_t method,
                         // pred_epsilon = model_output
                         vec_x[j] =
                             std::sqrt(alpha_prod_s) *
-                            vec_pred_original_sample[j] +
+                                vec_pred_original_sample[j] +
                             std::sqrt(beta_prod_s) *
-                            vec_model_output[j];
+                                vec_model_output[j];
                     }
                 }
                 // 4. Sample and inject noise z ~ N(0, I) for
@@ -1357,7 +1376,7 @@ static void sample_k_diffusion(sample_method_t method,
                     // In this case, x is still pred_noised_sample,
                     // continue in-place
                     ggml_tensor_set_f32_randn(noise, rng);
-                    float* vec_x = (float*)x->data;
+                    float* vec_x     = (float*)x->data;
                     float* vec_noise = (float*)noise->data;
                     for (int j = 0; j < ggml_nelements(x); j++) {
                         // Corresponding to (35) in Zheng et
@@ -1366,10 +1385,10 @@ static void sample_k_diffusion(sample_method_t method,
                         vec_x[j] =
                             std::sqrt(alpha_prod_t_prev /
                                       alpha_prod_s) *
-                            vec_x[j] +
+                                vec_x[j] +
                             std::sqrt(1 - alpha_prod_t_prev /
-                                      alpha_prod_s) *
-                            vec_noise[j];
+                                              alpha_prod_s) *
+                                vec_noise[j];
                     }
                 }
             }
@@ -1381,4 +1400,4 @@ static void sample_k_diffusion(sample_method_t method,
     }
 }
 
-#endif  // __DENOISER_HPP__
\ No newline at end of file
+#endif  // __DENOISER_HPP__
diff --git a/otherarch/sdcpp/diffusion_model.hpp b/otherarch/sdcpp/diffusion_model.hpp
index 65680b8d9..787a4fa79 100644
--- a/otherarch/sdcpp/diffusion_model.hpp
+++ b/otherarch/sdcpp/diffusion_model.hpp
@@ -13,7 +13,7 @@ struct DiffusionModel {
                          struct ggml_tensor* c_concat,
                          struct ggml_tensor* y,
                          struct ggml_tensor* guidance,
-                         std::vector<ggml_tensor*> ref_latents = {},
+                         std::vector<ggml_tensor*> ref_latents     = {},
                          int num_video_frames                      = -1,
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength                    = 0.f,
@@ -32,9 +32,9 @@ struct UNetModel : public DiffusionModel {
     UNetModelRunner unet;
 
     UNetModel(ggml_backend_t backend,
-              std::map<std::string, enum ggml_type>& tensor_types,
-              SDVersion version = VERSION_SD1,
-              bool flash_attn   = false)
+              const String2GGMLType& tensor_types = {},
+              SDVersion version                   = VERSION_SD1,
+              bool flash_attn                     = false)
         : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
     }
 
@@ -69,7 +69,7 @@ struct UNetModel : public DiffusionModel {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 std::vector<ggml_tensor*> ref_latents = {},
+                 std::vector<ggml_tensor*> ref_latents     = {},
                  int num_video_frames                      = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
@@ -85,7 +85,7 @@ struct MMDiTModel : public DiffusionModel {
     MMDiTRunner mmdit;
 
     MMDiTModel(ggml_backend_t backend,
-               std::map<std::string, enum ggml_type>& tensor_types)
+               const String2GGMLType& tensor_types = {})
         : mmdit(backend, tensor_types, "model.diffusion_model") {
     }
 
@@ -120,7 +120,7 @@ struct MMDiTModel : public DiffusionModel {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 std::vector<ggml_tensor*> ref_latents = {},
+                 std::vector<ggml_tensor*> ref_latents     = {},
                  int num_video_frames                      = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
@@ -135,10 +135,10 @@ struct FluxModel : public DiffusionModel {
     Flux::FluxRunner flux;
 
     FluxModel(ggml_backend_t backend,
-              std::map<std::string, enum ggml_type>& tensor_types,
-              SDVersion version = VERSION_FLUX,
-              bool flash_attn   = false,
-              bool use_mask     = false)
+              const String2GGMLType& tensor_types = {},
+              SDVersion version                   = VERSION_FLUX,
+              bool flash_attn                     = false,
+              bool use_mask                       = false)
         : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
     }
 
@@ -173,7 +173,7 @@ struct FluxModel : public DiffusionModel {
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
                  struct ggml_tensor* guidance,
-                 std::vector<ggml_tensor*> ref_latents = {},
+                 std::vector<ggml_tensor*> ref_latents     = {},
                  int num_video_frames                      = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
@@ -184,4 +184,4 @@ struct FluxModel : public DiffusionModel {
     }
 };
 
-#endif
\ No newline at end of file
+#endif
diff --git a/otherarch/sdcpp/esrgan.hpp b/otherarch/sdcpp/esrgan.hpp
index 5fbc6c509..3e41a8871 100644
--- a/otherarch/sdcpp/esrgan.hpp
+++ b/otherarch/sdcpp/esrgan.hpp
@@ -130,8 +130,8 @@ public:
         body_feat = conv_body->forward(ctx, body_feat);
         feat      = ggml_add(ctx, feat, body_feat);
         // upsample
-        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST)));
-        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST)));
+        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
         auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
         return out;
     }
@@ -142,11 +142,22 @@ struct ESRGAN : public GGMLRunner {
     int scale     = 4;
     int tile_size = 128;  // avoid cuda OOM for 4gb VRAM
 
-    ESRGAN(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+    ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
         : GGMLRunner(backend) {
         rrdb_net.init(params_ctx, tensor_types, "");
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        rrdb_net.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "esrgan";
     }
diff --git a/otherarch/sdcpp/flux.hpp b/otherarch/sdcpp/flux.hpp
index a16125102..40838f2fa 100644
--- a/otherarch/sdcpp/flux.hpp
+++ b/otherarch/sdcpp/flux.hpp
@@ -35,8 +35,8 @@ namespace Flux {
         int64_t hidden_size;
         float eps;
 
-        void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-            ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "scale") != tensor_types.end()) ? tensor_types[prefix + "scale"] : GGML_TYPE_F32;
+        void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+            ggml_type wtype = GGML_TYPE_F32;
             params["scale"] = ggml_new_tensor_1d(ctx, wtype, hidden_size);
         }
 
@@ -512,7 +512,8 @@ namespace Flux {
         LastLayer(int64_t hidden_size,
                   int64_t patch_size,
                   int64_t out_channels,
-                  bool prune_mod = false) : prune_mod(prune_mod) {
+                  bool prune_mod = false)
+            : prune_mod(prune_mod) {
             blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
             blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
             if (!prune_mod) {
@@ -723,7 +724,7 @@ namespace Flux {
             auto txt_ids = gen_txt_ids(bs, context_len);
             auto img_ids = gen_img_ids(h, w, patch_size, bs);
 
-            auto ids = concat_ids(txt_ids, img_ids, bs);
+            auto ids               = concat_ids(txt_ids, img_ids, bs);
             uint64_t curr_h_offset = 0;
             uint64_t curr_w_offset = 0;
             for (ggml_tensor* ref : ref_latents) {
@@ -736,7 +737,7 @@ namespace Flux {
                 }
 
                 auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset);
-                ids = concat_ids(ids, ref_ids, bs);
+                ids          = concat_ids(ids, ref_ids, bs);
 
                 curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
                 curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
@@ -744,7 +745,6 @@ namespace Flux {
             return ids;
         }
 
-
         // Generate positional embeddings
         std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents, int theta, const std::vector<int>& axes_dim) {
             std::vector<std::vector<float>> ids       = gen_ids(h, w, patch_size, bs, context_len, ref_latents);
@@ -872,8 +872,8 @@ namespace Flux {
                                          struct ggml_tensor* y,
                                          struct ggml_tensor* guidance,
                                          struct ggml_tensor* pe,
-                                         struct ggml_tensor* mod_index_arange   = NULL,
-                                         std::vector<int> skip_layers = {}) {
+                                         struct ggml_tensor* mod_index_arange = NULL,
+                                         std::vector<int> skip_layers         = {}) {
             auto img_in      = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
             auto txt_in      = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
             auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
@@ -962,7 +962,6 @@ namespace Flux {
 
         struct ggml_tensor* process_img(struct ggml_context* ctx,
                                         struct ggml_tensor* x) {
-
             int64_t W          = x->ne[0];
             int64_t H          = x->ne[1];
             int64_t patch_size = 2;
@@ -983,9 +982,9 @@ namespace Flux {
                                     struct ggml_tensor* y,
                                     struct ggml_tensor* guidance,
                                     struct ggml_tensor* pe,
-                                    struct ggml_tensor* mod_index_arange   = NULL,
+                                    struct ggml_tensor* mod_index_arange  = NULL,
                                     std::vector<ggml_tensor*> ref_latents = {},
-                                    std::vector<int> skip_layers = {}) {
+                                    std::vector<int> skip_layers          = {}) {
             // Forward pass of DiT.
             // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
             // timestep: (N,) tensor of diffusion timesteps
@@ -1005,7 +1004,7 @@ namespace Flux {
             int pad_h          = (patch_size - H % patch_size) % patch_size;
             int pad_w          = (patch_size - W % patch_size) % patch_size;
 
-            auto img = process_img(ctx, x);
+            auto img            = process_img(ctx, x);
             uint64_t img_tokens = img->ne[1];
 
             if (c_concat != NULL) {
@@ -1013,7 +1012,7 @@ namespace Flux {
                 ggml_tensor* mask   = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
 
                 masked = process_img(ctx, masked);
-                mask = process_img(ctx, mask);
+                mask   = process_img(ctx, mask);
 
                 img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
             }
@@ -1027,9 +1026,9 @@ namespace Flux {
 
             auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers);  // [N, num_tokens, C * patch_size * patch_size]
             if (out->ne[1] > img_tokens) {
-                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
+                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));  // [num_tokens, N, C * patch_size * patch_size]
                 out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
-                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
+                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));  // [N, h*w, C * patch_size * patch_size]
             }
 
             // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
@@ -1040,8 +1039,6 @@ namespace Flux {
     };
 
     struct FluxRunner : public GGMLRunner {
-        static std::map<std::string, enum ggml_type> empty_tensor_types;
-
     public:
         FluxParams flux_params;
         Flux flux;
@@ -1051,11 +1048,11 @@ namespace Flux {
         bool use_mask = false;
 
         FluxRunner(ggml_backend_t backend,
-                   std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
-                   const std::string prefix                            = "",
-                   SDVersion version                                   = VERSION_FLUX,
-                   bool flash_attn                                     = false,
-                   bool use_mask                                       = false)
+                   const String2GGMLType& tensor_types = {},
+                   const std::string prefix            = "",
+                   SDVersion version                   = VERSION_FLUX,
+                   bool flash_attn                     = false,
+                   bool use_mask                       = false)
             : GGMLRunner(backend), use_mask(use_mask) {
             flux_params.flash_attn          = flash_attn;
             flux_params.guidance_embed      = false;
@@ -1120,7 +1117,7 @@ namespace Flux {
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
                                         std::vector<ggml_tensor*> ref_latents = {},
-                                        std::vector<int> skip_layers = {}) {
+                                        std::vector<int> skip_layers          = {}) {
             GGML_ASSERT(x->ne[3] == 1);
             struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
 
@@ -1139,8 +1136,8 @@ namespace Flux {
                 }
 
                 // ggml_arange is not working on some backends, precompute it
-                mod_index_arange_vec  = arange(0, 344);
-                mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
+                mod_index_arange_vec = arange(0, 344);
+                mod_index_arange     = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
                 set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
             }
             y = to_backend(y);
@@ -1187,9 +1184,9 @@ namespace Flux {
                      struct ggml_tensor* y,
                      struct ggml_tensor* guidance,
                      std::vector<ggml_tensor*> ref_latents = {},
-                     struct ggml_tensor** output     = NULL,
-                     struct ggml_context* output_ctx = NULL,
-                     std::vector<int> skip_layers    = std::vector<int>()) {
+                     struct ggml_tensor** output           = NULL,
+                     struct ggml_context* output_ctx       = NULL,
+                     std::vector<int> skip_layers          = std::vector<int>()) {
             // x: [N, in_channels, h, w]
             // timesteps: [N, ]
             // context: [N, max_position, hidden_size]
@@ -1277,4 +1274,4 @@ namespace Flux {
 
 }  // namespace Flux
 
-#endif  // __FLUX_HPP__
\ No newline at end of file
+#endif  // __FLUX_HPP__
diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp
index 838c271c2..9a0b1f8d3 100644
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@@ -40,6 +40,10 @@
 #include "ggml-vulkan.h"
 #endif
 
+#ifdef SD_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
 #ifdef SD_USE_SYCL
 #include "ggml-sycl.h"
 #endif
@@ -53,6 +57,8 @@
 #define __STATIC_INLINE__ static inline
 #endif
 
+static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");
+
 // n-mode trensor-matrix product
 // example: 2-mode product
 // A: [ne03, k, ne01, ne00]
@@ -109,13 +115,13 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct
 // [ne03,ne02,ne01,ne00] x [ne13,ne12,ne11,ne10] => [ne03*ne13,ne02*ne12,ne01*ne11,ne00*ne10]
 __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b) {
     return ggml_mul(ctx,
-                    ggml_upscale_ext(ctx,
+                    ggml_interpolate(ctx,
                                      a,
                                      a->ne[0] * b->ne[0],
                                      a->ne[1] * b->ne[1],
                                      a->ne[2] * b->ne[2],
                                      a->ne[3] * b->ne[3],
-                                     ggml_scale_mode::GGML_SCALE_MODE_NEAREST),
+                                     GGML_SCALE_MODE_NEAREST),
                     b);
 }
 
@@ -811,6 +817,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
     return x;
 }
 
+__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
+                                                             struct ggml_tensor* x,
+                                                             struct ggml_tensor* w,
+                                                             struct ggml_tensor* b,
+                                                             int s0 = 1,
+                                                             int s1 = 1,
+                                                             int p0 = 0,
+                                                             int p1 = 0,
+                                                             int d0 = 1,
+                                                             int d1 = 1) {
+    x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
+    if (b != NULL) {
+        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
+        // b = ggml_repeat(ctx, b, x);
+        x = ggml_add(ctx, x, b);
+    }
+    return x;
+}
+
 // w: [OC，IC, KD, 1 * 1]
 // x: [N, IC, IH, IW]
 // b: [OC,]
@@ -945,18 +970,33 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
 
     float scale = (1.0f / sqrt((float)d_head));
 
+    int kv_pad = 0;
     // if (flash_attn) {
     //     LOG_DEBUG("attention_ext L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
     // }
-    //  is there anything oddly shaped?? ping Green-Sky if you can trip this assert
+    //   is there anything oddly shaped?? ping Green-Sky if you can trip this assert
     GGML_ASSERT(((L_k % 256 == 0) && L_q == L_k) || !(L_k % 256 == 0));
 
     bool can_use_flash_attn = true;
+    can_use_flash_attn      = can_use_flash_attn && (d_head == 64 ||
+                                                d_head == 80 ||
+                                                d_head == 96 ||
+                                                d_head == 112 ||
+                                                d_head == 128 ||
+                                                d_head == 256);
+// kcpp disable kv_pad (leejet/stable-diffusion.cpp#756)
+#if 1
     can_use_flash_attn      = can_use_flash_attn && L_k % 256 == 0;
-    can_use_flash_attn      = can_use_flash_attn && d_head % 64 == 0;  // double check
-
-    // cuda max d_head seems to be 256, cpu does seem to work with 512
-    can_use_flash_attn = can_use_flash_attn && d_head <= 256;  // double check
+#else
+    if (can_use_flash_attn && L_k % 256 != 0) {
+        // TODO(Green-Sky): might be worth just padding by default
+        if (L_k == 77 || L_k == 4208 || L_k == 3952) {
+            kv_pad = GGML_PAD(L_k, 256) - L_k;
+        } else {
+            can_use_flash_attn = false;
+        }
+    }
+#endif
 
     if (mask != nullptr) {
         // TODO(Green-Sky): figure out if we can bend t5 to work too
@@ -969,11 +1009,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
     ggml_tensor* kqv = nullptr;
     // GGML_ASSERT((flash_attn && can_use_flash_attn) || !flash_attn);
     if (can_use_flash_attn && flash_attn) {
-        // LOG_DEBUG("using flash attention");
+        // LOG_DEBUG(" uses flash attention");
+        if (kv_pad != 0) {
+            // LOG_DEBUG(" padding k and v dim1 by %d", kv_pad);
+            k = ggml_pad(ctx, k, 0, kv_pad, 0, 0);
+        }
         k = ggml_cast(ctx, k, GGML_TYPE_F16);
 
         v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3));  // [N, n_head, L_k, d_head]
         v = ggml_reshape_3d(ctx, v, d_head, L_k, n_head * N);  // [N * n_head, L_k, d_head]
+        if (kv_pad != 0) {
+            v = ggml_pad(ctx, v, 0, kv_pad, 0, 0);
+        }
         v = ggml_cast(ctx, v, GGML_TYPE_F16);
 
         if (mask != nullptr) {
@@ -1181,6 +1228,8 @@ __STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) {
 #define MAX_PARAMS_TENSOR_NUM 32768
 #define MAX_GRAPH_SIZE 32768
 
+typedef std::map<std::string, enum ggml_type> String2GGMLType;
+
 struct GGMLRunner {
 protected:
     typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;
@@ -1365,13 +1414,7 @@ public:
             ggml_backend_cpu_set_n_threads(backend, n_threads);
         }
 
-// #ifdef SD_USE_METAL
-//         if (ggml_backend_is_metal(backend)) {
-//             ggml_backend_metal_set_n_cb(backend, n_threads);
-//         }
-// #endif
         ggml_backend_graph_compute(backend, gf);
-
 #ifdef GGML_PERF
         ggml_graph_print(gf);
 #endif
@@ -1398,17 +1441,25 @@ protected:
     GGMLBlockMap blocks;
     ParameterMap params;
 
-    void init_blocks(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+    ggml_type get_type(const std::string& name, const String2GGMLType& tensor_types, ggml_type default_type) {
+        auto iter = tensor_types.find(name);
+        if (iter != tensor_types.end()) {
+            return iter->second;
+        }
+        return default_type;
+    }
+
+    void init_blocks(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         for (auto& pair : blocks) {
             auto& block = pair.second;
             block->init(ctx, tensor_types, prefix + pair.first);
         }
     }
 
-    virtual void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {}
+    virtual void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {}
 
 public:
-    void init(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
+    void init(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
         if (prefix.size() > 0) {
             prefix = prefix + ".";
         }
@@ -1455,6 +1506,19 @@ public:
             tensors[prefix + pair.first] = pair.second;
         }
     }
+
+    virtual std::string get_desc() {
+        return "GGMLBlock";
+    }
+
+    void get_all_blocks(std::vector<GGMLBlock*>& result) {
+        result.push_back(this);
+        for (auto& block_iter : blocks) {
+            if (block_iter.second) {
+                block_iter.second->get_all_blocks(result);
+            }
+        }
+    }
 };
 
 class UnaryBlock : public GGMLBlock {
@@ -1469,8 +1533,8 @@ protected:
     bool bias;
     bool force_f32;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = (tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
         if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
             wtype = GGML_TYPE_F32;
         }
@@ -1505,8 +1569,8 @@ class Embedding : public UnaryBlock {
 protected:
     int64_t embedding_dim;
     int64_t num_embeddings;
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = (tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
         params["weight"]     = ggml_new_tensor_2d(ctx, wtype, embedding_dim, num_embeddings);
     }
 
@@ -1544,12 +1608,13 @@ protected:
     std::pair<int, int> padding;
     std::pair<int, int> dilation;
     bool bias;
+    bool direct = false;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F16;  //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F16;
         params["weight"]     = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels);
         if (bias) {
-            enum ggml_type wtype = GGML_TYPE_F32;  // (tensor_types.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32;
+            enum ggml_type wtype = GGML_TYPE_F32;
             params["bias"]       = ggml_new_tensor_1d(ctx, wtype, out_channels);
         }
     }
@@ -1570,13 +1635,25 @@ public:
           dilation(dilation),
           bias(bias) {}
 
+    void enable_direct() {
+        direct = true;
+    }
+
+    std::string get_desc() {
+        return "Conv2d";
+    }
+
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
         struct ggml_tensor* b = NULL;
         if (bias) {
             b = params["bias"];
         }
-        return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        if (direct) {
+            return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        } else {
+            return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        }
     }
 };
 
@@ -1590,11 +1667,11 @@ protected:
     int64_t dilation;
     bool bias;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F16;                                                              //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F16;
         params["weight"]     = ggml_new_tensor_4d(ctx, wtype, 1, kernel_size, in_channels, out_channels);  // 5d => 4d
         if (bias) {
-            enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32;
+            enum ggml_type wtype = GGML_TYPE_F32;
             params["bias"]       = ggml_new_tensor_1d(ctx, wtype, out_channels);
         }
     }
@@ -1634,12 +1711,12 @@ protected:
     bool elementwise_affine;
     bool bias;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         if (elementwise_affine) {
-            enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.ypes.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+            enum ggml_type wtype = GGML_TYPE_F32;
             params["weight"]     = ggml_new_tensor_1d(ctx, wtype, normalized_shape);
             if (bias) {
-                enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.ypes.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32;
+                enum ggml_type wtype = GGML_TYPE_F32;
                 params["bias"]       = ggml_new_tensor_1d(ctx, wtype, normalized_shape);
             }
         }
@@ -1676,10 +1753,10 @@ protected:
     float eps;
     bool affine;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         if (affine) {
-            enum ggml_type wtype      = GGML_TYPE_F32;  //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
-            enum ggml_type bias_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32;
+            enum ggml_type wtype      = GGML_TYPE_F32;
+            enum ggml_type bias_wtype = GGML_TYPE_F32;
             params["weight"]          = ggml_new_tensor_1d(ctx, wtype, num_channels);
             params["bias"]            = ggml_new_tensor_1d(ctx, bias_wtype, num_channels);
         }
@@ -1760,4 +1837,4 @@ public:
     }
 };
 
-#endif  // __GGML_EXTEND__HPP__
\ No newline at end of file
+#endif  // __GGML_EXTEND__HPP__
diff --git a/otherarch/sdcpp/gits_noise.inl b/otherarch/sdcpp/gits_noise.inl
index 5c32d8bd5..7a10ff76f 100644
--- a/otherarch/sdcpp/gits_noise.inl
+++ b/otherarch/sdcpp/gits_noise.inl
@@ -346,4 +346,4 @@ const std::vector<const std::vector<std::vector<float>>*> GITS_NOISE = {
     &GITS_NOISE_1_50
 };
 
-#endif // GITS_NOISE_INL
\ No newline at end of file
+#endif // GITS_NOISE_INL
diff --git a/otherarch/sdcpp/lora.hpp b/otherarch/sdcpp/lora.hpp
index f7f46ea48..35f5aacd1 100644
--- a/otherarch/sdcpp/lora.hpp
+++ b/otherarch/sdcpp/lora.hpp
@@ -3,7 +3,7 @@
 
 #include "ggml_extend.hpp"
 
-#define LORA_GRAPH_SIZE 20480
+#define LORA_GRAPH_BASE_SIZE 10240
 
 struct LoraModel : public GGMLRunner {
     enum lora_t {
@@ -238,7 +238,8 @@ struct LoraModel : public GGMLRunner {
     }
 
     struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);
+        size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10;
+        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false);
 
         zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
         set_backend_tensor_data(zero_index, zero_index_vec.data());
@@ -291,7 +292,6 @@ struct LoraModel : public GGMLRunner {
                     std::string hada_2_down_name = "";
                     std::string hada_2_up_name   = "";
 
-
                     hada_1_down_name = fk + ".hada_w1_b";
                     hada_1_up_name   = fk + ".hada_w1_a";
                     hada_1_mid_name  = fk + ".hada_t1";
@@ -843,4 +843,4 @@ struct LoraModel : public GGMLRunner {
     }
 };
 
-#endif  // __LORA_HPP__
\ No newline at end of file
+#endif  // __LORA_HPP__
diff --git a/otherarch/sdcpp/main.cpp b/otherarch/sdcpp/main.cpp
index 55e9591d3..ec04dfde3 100644
--- a/otherarch/sdcpp/main.cpp
+++ b/otherarch/sdcpp/main.cpp
@@ -1,73 +1,49 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
+#include <functional>
 #include <iostream>
+#include <map>
 #include <random>
+#include <regex>
 #include <string>
 #include <vector>
 
 // #include "preprocessing.hpp"
-#include "flux.hpp"
 #include "stable-diffusion.h"
 
 #define STB_IMAGE_IMPLEMENTATION
+#define STB_IMAGE_STATIC
 #include "stb_image.h"
 
 #define STB_IMAGE_WRITE_IMPLEMENTATION
+#define STB_IMAGE_WRITE_STATIC
 #include "stb_image_write.h"
 
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
+#define STB_IMAGE_RESIZE_STATIC
 #include "stb_image_resize.h"
 
-const char* rng_type_to_str[] = {
-    "std_default",
-    "cuda",
-};
-
-// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
-const char* sample_method_str[] = {
-    "euler_a",
-    "euler",
-    "heun",
-    "dpm2",
-    "dpm++2s_a",
-    "dpm++2m",
-    "dpm++2mv2",
-    "ipndm",
-    "ipndm_v",
-    "lcm",
-    "ddim_trailing",
-    "tcd",
-};
-
-// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
-const char* schedule_str[] = {
-    "default",
-    "discrete",
-    "karras",
-    "exponential",
-    "ays",
-    "gits",
-};
+#define SAFE_STR(s) ((s) ? (s) : "")
+#define BOOL_STR(b) ((b) ? "true" : "false")
 
 const char* modes_str[] = {
-    "txt2img",
-    "img2img",
-    "img2vid",
+    "img_gen",
+    "vid_gen",
     "convert",
 };
+#define SD_ALL_MODES_STR "img_gen, vid_gen, convert"
 
 enum SDMode {
-    TXT2IMG,
-    IMG2IMG,
-    IMG2VID,
+    IMG_GEN,
+    VID_GEN,
     CONVERT,
     MODE_COUNT
 };
 
 struct SDParams {
     int n_threads = -1;
-    SDMode mode   = TXT2IMG;
+    SDMode mode   = IMG_GEN;
     std::string model_path;
     std::string clip_l_path;
     std::string clip_g_path;
@@ -76,30 +52,31 @@ struct SDParams {
     std::string vae_path;
     std::string taesd_path;
     std::string esrgan_path;
-    std::string controlnet_path;
-    std::string embeddings_path;
-    std::string stacked_id_embeddings_path;
+    std::string control_net_path;
+    std::string embedding_dir;
+    std::string stacked_id_embed_dir;
     std::string input_id_images_path;
     sd_type_t wtype = SD_TYPE_COUNT;
+    std::string tensor_type_rules;
     std::string lora_model_dir;
     std::string output_path = "output.png";
     std::string input_path;
     std::string mask_path;
     std::string control_image_path;
-
-    std::vector<std::string> kontext_image_paths;
+    std::vector<std::string> ref_image_paths;
 
     std::string prompt;
     std::string negative_prompt;
-    float min_cfg     = 1.0f;
-    float cfg_scale   = 7.0f;
-    float guidance    = 3.5f;
-    float eta         = 0.f;
-    float style_ratio = 20.f;
-    int clip_skip     = -1;  // <= 0 represents unspecified
-    int width         = 512;
-    int height        = 512;
-    int batch_count   = 1;
+    float min_cfg       = 1.0f;
+    float cfg_scale     = 7.0f;
+    float img_cfg_scale = INFINITY;
+    float guidance      = 3.5f;
+    float eta           = 0.f;
+    float style_ratio   = 20.f;
+    int clip_skip       = -1;  // <= 0 represents unspecified
+    int width           = 512;
+    int height          = 512;
+    int batch_count     = 1;
 
     int video_frames         = 6;
     int motion_bucket_id     = 127;
@@ -120,6 +97,8 @@ struct SDParams {
     bool clip_on_cpu              = false;
     bool vae_on_cpu               = false;
     bool diffusion_flash_attn     = false;
+    bool diffusion_conv_direct    = false;
+    bool vae_conv_direct          = false;
     bool canny_preprocess         = false;
     bool color                    = false;
     int upscale_repeats           = 1;
@@ -129,9 +108,9 @@ struct SDParams {
     float skip_layer_start       = 0.01f;
     float skip_layer_end         = 0.2f;
 
-    bool chroma_use_dit_mask     = true;
-    bool chroma_use_t5_mask      = false;
-    int  chroma_t5_mask_pad      = 1;
+    bool chroma_use_dit_mask = true;
+    bool chroma_use_t5_mask  = false;
+    int chroma_t5_mask_pad   = 1;
 };
 
 void print_params(SDParams params) {
@@ -147,9 +126,9 @@ void print_params(SDParams params) {
     printf("    vae_path:          %s\n", params.vae_path.c_str());
     printf("    taesd_path:        %s\n", params.taesd_path.c_str());
     printf("    esrgan_path:       %s\n", params.esrgan_path.c_str());
-    printf("    controlnet_path:   %s\n", params.controlnet_path.c_str());
-    printf("    embeddings_path:   %s\n", params.embeddings_path.c_str());
-    printf("    stacked_id_embeddings_path:   %s\n", params.stacked_id_embeddings_path.c_str());
+    printf("    control_net_path:   %s\n", params.control_net_path.c_str());
+    printf("    embedding_dir:   %s\n", params.embedding_dir.c_str());
+    printf("    stacked_id_embed_dir:   %s\n", params.stacked_id_embed_dir.c_str());
     printf("    input_id_images_path:   %s\n", params.input_id_images_path.c_str());
     printf("    style ratio:       %.2f\n", params.style_ratio);
     printf("    normalize input image :  %s\n", params.normalize_input ? "true" : "false");
@@ -157,26 +136,33 @@ void print_params(SDParams params) {
     printf("    init_img:          %s\n", params.input_path.c_str());
     printf("    mask_img:          %s\n", params.mask_path.c_str());
     printf("    control_image:     %s\n", params.control_image_path.c_str());
+    printf("    ref_images_paths:\n");
+    for (auto& path : params.ref_image_paths) {
+        printf("        %s\n", path.c_str());
+    };
     printf("    clip on cpu:       %s\n", params.clip_on_cpu ? "true" : "false");
     printf("    controlnet cpu:    %s\n", params.control_net_cpu ? "true" : "false");
     printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf("    diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
+    printf("    diffusion Conv2d direct:%s\n", params.diffusion_conv_direct ? "true" : "false");
+    printf("    vae Conv2d direct:%s\n", params.vae_conv_direct ? "true" : "false");
     printf("    strength(control): %.2f\n", params.control_strength);
     printf("    prompt:            %s\n", params.prompt.c_str());
     printf("    negative_prompt:   %s\n", params.negative_prompt.c_str());
     printf("    min_cfg:           %.2f\n", params.min_cfg);
     printf("    cfg_scale:         %.2f\n", params.cfg_scale);
+    printf("    img_cfg_scale:     %.2f\n", params.img_cfg_scale);
     printf("    slg_scale:         %.2f\n", params.slg_scale);
     printf("    guidance:          %.2f\n", params.guidance);
     printf("    eta:               %.2f\n", params.eta);
     printf("    clip_skip:         %d\n", params.clip_skip);
     printf("    width:             %d\n", params.width);
     printf("    height:            %d\n", params.height);
-    printf("    sample_method:     %s\n", sample_method_str[params.sample_method]);
-    printf("    schedule:          %s\n", schedule_str[params.schedule]);
+    printf("    sample_method:     %s\n", sd_sample_method_name(params.sample_method));
+    printf("    schedule:          %s\n", sd_schedule_name(params.schedule));
     printf("    sample_steps:      %d\n", params.sample_steps);
     printf("    strength(img2img): %.2f\n", params.strength);
-    printf("    rng:               %s\n", rng_type_to_str[params.rng_type]);
+    printf("    rng:               %s\n", sd_rng_type_name(params.rng_type));
     printf("    seed:              %ld\n", params.seed);
     printf("    batch_count:       %d\n", params.batch_count);
     printf("    vae_tiling:        %s\n", params.vae_tiling ? "true" : "false");
@@ -191,14 +177,14 @@ void print_usage(int argc, const char* argv[]) {
     printf("\n");
     printf("arguments:\n");
     printf("  -h, --help                         show this help message and exit\n");
-    printf("  -M, --mode [MODEL]                 run mode (txt2img or img2img or convert, default: txt2img)\n");
+    printf("  -M, --mode [MODE]                  run mode, one of: [img_gen, convert], default: img_gen\n");
     printf("  -t, --threads N                    number of threads to use during computation (default: -1)\n");
     printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
     printf("  -m, --model [MODEL]                path to full model\n");
     printf("  --diffusion-model                  path to the standalone diffusion model\n");
     printf("  --clip_l                           path to the clip-l text encoder\n");
     printf("  --clip_g                           path to the clip-g text encoder\n");
-    printf("  --t5xxl                            path to the the t5xxl text encoder\n");
+    printf("  --t5xxl                            path to the t5xxl text encoder\n");
     printf("  --vae [VAE]                        path to vae\n");
     printf("  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
     printf("  --control-net [CONTROL_PATH]       path to control net model\n");
@@ -210,15 +196,18 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)\n");
     printf("  --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n");
     printf("                                     If not specified, the default is the type of the weight file\n");
+    printf("  --tensor-type-rules [EXPRESSION]   weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n");
     printf("  --lora-model-dir [DIR]             lora model directory\n");
     printf("  -i, --init-img [IMAGE]             path to the input image, required by img2img\n");
     printf("  --mask [MASK]                      path to the mask image, required by img2img with mask\n");
     printf("  --control-image [IMAGE]            path to image condition, control net\n");
+    printf("  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times) \n");
     printf("  -o, --output OUTPUT                path to write result image to (default: ./output.png)\n");
     printf("  -p, --prompt [PROMPT]              the prompt to render\n");
     printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
     printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
-    printf("  --guidance SCALE                   guidance scale for img2img (default: 3.5)\n");
+    printf("  --img-cfg-scale SCALE              image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
+    printf("  --guidance SCALE                   distilled guidance scale for models with guidance input (default: 3.5)\n");
     printf("  --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
     printf("                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
     printf("  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)\n");
@@ -227,7 +216,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --skip-layer-end END               SLG disabling point: (default: 0.2)\n");
     printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
     printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
-    printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%%)\n");
+    printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20)\n");
     printf("  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)\n");
     printf("                                     1.0 corresponds to full destruction of information in init image\n");
     printf("  -H, --height H                     image height, in pixel space (default: 512)\n");
@@ -247,420 +236,359 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
     printf("                                     Might lower quality, since it implies converting k and v to f16.\n");
     printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --diffusion-conv-direct            use Conv2d direct in the diffusion model");
+    printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)");
+    printf("                                     This might crash if it is not supported by the backend.\n");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
-    printf("  --color                            Colors the logging tags according to level\n");
+    printf("  --color                            colors the logging tags according to level\n");
     printf("  --chroma-disable-dit-mask          disable dit mask for chroma\n");
     printf("  --chroma-enable-t5-mask            enable t5 mask for chroma\n");
     printf("  --chroma-t5-mask-pad  PAD_SIZE     t5 mask pad size of chroma\n");
     printf("  -v, --verbose                      print extra info\n");
-    printf("  -ki, --kontext_img [PATH]        Reference image for Flux Kontext models (can be used multiple times) \n");
 }
 
-void parse_args(int argc, const char** argv, SDParams& params) {
+struct StringOption {
+    std::string short_name;
+    std::string long_name;
+    std::string desc;
+    std::string* target;
+};
+
+struct IntOption {
+    std::string short_name;
+    std::string long_name;
+    std::string desc;
+    int* target;
+};
+
+struct FloatOption {
+    std::string short_name;
+    std::string long_name;
+    std::string desc;
+    float* target;
+};
+
+struct BoolOption {
+    std::string short_name;
+    std::string long_name;
+    std::string desc;
+    bool keep_true;
+    bool* target;
+};
+
+struct ManualOption {
+    std::string short_name;
+    std::string long_name;
+    std::string desc;
+    std::function<int(int argc, const char** argv, int index)> cb;
+};
+
+struct ArgOptions {
+    std::vector<StringOption> string_options;
+    std::vector<IntOption> int_options;
+    std::vector<FloatOption> float_options;
+    std::vector<BoolOption> bool_options;
+    std::vector<ManualOption> manual_options;
+};
+
+bool parse_options(int argc, const char** argv, ArgOptions& options) {
     bool invalid_arg = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
 
-        if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-M" || arg == "--mode") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            const char* mode_selected = argv[i];
-            int mode_found            = -1;
-            for (int d = 0; d < MODE_COUNT; d++) {
-                if (!strcmp(mode_selected, modes_str[d])) {
-                    mode_found = d;
-                }
-            }
-            if (mode_found == -1) {
-                fprintf(stderr,
-                        "error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n",
-                        mode_selected);
-                exit(1);
-            }
-            params.mode = (SDMode)mode_found;
-        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.model_path = argv[i];
-        } else if (arg == "--clip_l") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.clip_l_path = argv[i];
-        } else if (arg == "--clip_g") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.clip_g_path = argv[i];
-        } else if (arg == "--t5xxl") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.t5xxl_path = argv[i];
-        } else if (arg == "--diffusion-model") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.diffusion_model_path = argv[i];
-        } else if (arg == "--vae") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.vae_path = argv[i];
-        } else if (arg == "--taesd") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.taesd_path = argv[i];
-        } else if (arg == "--control-net") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.controlnet_path = argv[i];
-        } else if (arg == "--upscale-model") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.esrgan_path = argv[i];
-        } else if (arg == "--embd-dir") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.embeddings_path = argv[i];
-        } else if (arg == "--stacked-id-embd-dir") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.stacked_id_embeddings_path = argv[i];
-        } else if (arg == "--input-id-images-dir") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.input_id_images_path = argv[i];
-        } else if (arg == "--type") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            std::string type        = argv[i];
-            bool found              = false;
-            std::string valid_types = "";
-            for (size_t i = 0; i < SD_TYPE_COUNT; i++) {
-                auto trait = ggml_get_type_traits((ggml_type)i);
-                std::string name(trait->type_name);
-                if (name == "f32" || trait->to_float && trait->type_size) {
-                    if (i)
-                        valid_types += ", ";
-                    valid_types += name;
-                    if (type == name) {
-                        if (ggml_quantize_requires_imatrix((ggml_type)i)) {
-                            printf("\033[35;1m[WARNING]\033[0m: type %s requires imatrix to work properly. A dummy imatrix will be used, expect poor quality.\n", trait->type_name);
-                        }
-                        params.wtype = (enum sd_type_t)i;
-                        found        = true;
-                        break;
-                    }
-                }
-            }
-            if (!found) {
-                fprintf(stderr, "error: invalid weight format %s, must be one of [%s]\n",
-                        type.c_str(),
-                        valid_types.c_str());
-                exit(1);
-            }
-        } else if (arg == "--lora-model-dir") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.lora_model_dir = argv[i];
-        } else if (arg == "-i" || arg == "--init-img") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.input_path = argv[i];
-        } else if (arg == "--mask") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.mask_path = argv[i];
-        } else if (arg == "--control-image") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.control_image_path = argv[i];
-        } else if (arg == "-o" || arg == "--output") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.output_path = argv[i];
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.prompt = argv[i];
-        } else if (arg == "--upscale-repeats") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.upscale_repeats = std::stoi(argv[i]);
-            if (params.upscale_repeats < 1) {
-                fprintf(stderr, "error: upscale multiplier must be at least 1\n");
-                exit(1);
-            }
-        } else if (arg == "-n" || arg == "--negative-prompt") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.negative_prompt = argv[i];
-        } else if (arg == "--cfg-scale") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.cfg_scale = std::stof(argv[i]);
-        } else if (arg == "--guidance") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.guidance = std::stof(argv[i]);
-        } else if (arg == "--eta") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.eta = std::stof(argv[i]);
-        } else if (arg == "--strength") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.strength = std::stof(argv[i]);
-        } else if (arg == "--style-ratio") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.style_ratio = std::stof(argv[i]);
-        } else if (arg == "--control-strength") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.control_strength = std::stof(argv[i]);
-        } else if (arg == "-H" || arg == "--height") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.height = std::stoi(argv[i]);
-        } else if (arg == "-W" || arg == "--width") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.width = std::stoi(argv[i]);
-        } else if (arg == "--steps") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.sample_steps = std::stoi(argv[i]);
-        } else if (arg == "--clip-skip") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.clip_skip = std::stoi(argv[i]);
-        } else if (arg == "--vae-tiling") {
-            params.vae_tiling = true;
-        } else if (arg == "--control-net-cpu") {
-            params.control_net_cpu = true;
-        } else if (arg == "--normalize-input") {
-            params.normalize_input = true;
-        } else if (arg == "--clip-on-cpu") {
-            params.clip_on_cpu = true;  // will slow down get_learned_condiotion but necessary for low MEM GPUs
-        } else if (arg == "--vae-on-cpu") {
-            params.vae_on_cpu = true;  // will slow down latent decoding but necessary for low MEM GPUs
-        } else if (arg == "--diffusion-fa") {
-            params.diffusion_flash_attn = true;  // can reduce MEM significantly
-        } else if (arg == "--canny") {
-            params.canny_preprocess = true;
-        } else if (arg == "-b" || arg == "--batch-count") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.batch_count = std::stoi(argv[i]);
-        } else if (arg == "--rng") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            std::string rng_type_str = argv[i];
-            if (rng_type_str == "std_default") {
-                params.rng_type = STD_DEFAULT_RNG;
-            } else if (rng_type_str == "cuda") {
-                params.rng_type = CUDA_RNG;
-            } else {
-                invalid_arg = true;
-                break;
-            }
-        } else if (arg == "--schedule") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            const char* schedule_selected = argv[i];
-            int schedule_found            = -1;
-            for (int d = 0; d < N_SCHEDULES; d++) {
-                if (!strcmp(schedule_selected, schedule_str[d])) {
-                    schedule_found = d;
-                }
-            }
-            if (schedule_found == -1) {
-                invalid_arg = true;
-                break;
-            }
-            params.schedule = (schedule_t)schedule_found;
-        } else if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.seed = std::stoll(argv[i]);
-        } else if (arg == "--sampling-method") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            const char* sample_method_selected = argv[i];
-            int sample_method_found            = -1;
-            for (int m = 0; m < N_SAMPLE_METHODS; m++) {
-                if (!strcmp(sample_method_selected, sample_method_str[m])) {
-                    sample_method_found = m;
-                }
-            }
-            if (sample_method_found == -1) {
-                invalid_arg = true;
-                break;
-            }
-            params.sample_method = (sample_method_t)sample_method_found;
-        } else if (arg == "-h" || arg == "--help") {
-            print_usage(argc, argv);
-            exit(0);
-        } else if (arg == "-v" || arg == "--verbose") {
-            params.verbose = true;
-        } else if (arg == "--color") {
-            params.color = true;
-        } else if (arg == "--slg-scale") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.slg_scale = std::stof(argv[i]);
-        } else if (arg == "--skip-layers") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            if (argv[i][0] != '[') {
-                invalid_arg = true;
-                break;
-            }
-            std::string layers_str = argv[i];
-            while (layers_str.back() != ']') {
+        for (auto& option : options.string_options) {
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
                 if (++i >= argc) {
                     invalid_arg = true;
                     break;
                 }
-                layers_str += " " + std::string(argv[i]);
+                *option.target = std::string(argv[i]);
             }
-            layers_str = layers_str.substr(1, layers_str.size() - 2);
+        }
+        if (invalid_arg) {
+            break;
+        }
 
-            std::regex regex("[, ]+");
-            std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1);
-            std::sregex_token_iterator end;
-            std::vector<std::string> tokens(iter, end);
-            std::vector<int> layers;
-            for (const auto& token : tokens) {
-                try {
-                    layers.push_back(std::stoi(token));
-                } catch (const std::invalid_argument& e) {
+        for (auto& option : options.int_options) {
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
+                if (++i >= argc) {
                     invalid_arg = true;
                     break;
                 }
+                *option.target = std::stoi(argv[i]);
             }
-            params.skip_layers = layers;
+        }
+        if (invalid_arg) {
+            break;
+        }
 
-            if (invalid_arg) {
-                break;
+        for (auto& option : options.float_options) {
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
+                if (++i >= argc) {
+                    invalid_arg = true;
+                    break;
+                }
+                *option.target = std::stof(argv[i]);
             }
-        } else if (arg == "--skip-layer-start") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
+        }
+        if (invalid_arg) {
+            break;
+        }
+
+        for (auto& option : options.bool_options) {
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
+                if (option.keep_true) {
+                    *option.target = true;
+                } else {
+                    *option.target = false;
+                }
             }
-            params.skip_layer_start = std::stof(argv[i]);
-        } else if (arg == "--skip-layer-end") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
+        }
+        if (invalid_arg) {
+            break;
+        }
+
+        for (auto& option : options.manual_options) {
+            if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) {
+                int ret = option.cb(argc, argv, i);
+                if (ret < 0) {
+                    invalid_arg = true;
+                    break;
+                }
+                i += ret;
             }
-            params.skip_layer_end = std::stof(argv[i]);
-        } else if (arg == "-ki" || arg == "--kontext-img") {
-            if (++i >= argc) {
-                invalid_arg = true;
-                break;
-            }
-            params.kontext_image_paths.push_back(argv[i]);
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            print_usage(argc, argv);
-            exit(1);
+        }
+        if (invalid_arg) {
+            break;
         }
     }
     if (invalid_arg) {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        return false;
+    }
+    return true;
+}
+
+void parse_args(int argc, const char** argv, SDParams& params) {
+    ArgOptions options;
+    options.string_options = {
+        {"-m", "--model", "", &params.model_path},
+        {"", "--clip_l", "", &params.clip_l_path},
+        {"", "--clip_g", "", &params.clip_g_path},
+        {"", "--t5xxl", "", &params.t5xxl_path},
+        {"", "--diffusion-model", "", &params.diffusion_model_path},
+        {"", "--vae", "", &params.vae_path},
+        {"", "--taesd", "", &params.taesd_path},
+        {"", "--control-net", "", &params.control_net_path},
+        {"", "--embd-dir", "", &params.embedding_dir},
+        {"", "--stacked-id-embd-dir", "", &params.stacked_id_embed_dir},
+        {"", "--lora-model-dir", "", &params.lora_model_dir},
+        {"-i", "--init-img", "", &params.input_path},
+        {"", "--tensor-type-rules", "", &params.tensor_type_rules},
+        {"", "--input-id-images-dir", "", &params.input_id_images_path},
+        {"", "--mask", "", &params.mask_path},
+        {"", "--control-image", "", &params.control_image_path},
+        {"-o", "--output", "", &params.output_path},
+        {"-p", "--prompt", "", &params.prompt},
+        {"-n", "--negative-prompt", "", &params.negative_prompt},
+
+        {"", "--upscale-model", "", &params.esrgan_path},
+    };
+
+    options.int_options = {
+        {"-t", "--threads", "", &params.n_threads},
+        {"", "--upscale-repeats", "", &params.upscale_repeats},
+        {"-H", "--height", "", &params.height},
+        {"-W", "--width", "", &params.width},
+        {"", "--steps", "", &params.sample_steps},
+        {"", "--clip-skip", "", &params.clip_skip},
+        {"-b", "--batch-count", "", &params.batch_count},
+        {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad},
+    };
+
+    options.float_options = {
+        {"", "--cfg-scale", "", &params.cfg_scale},
+        {"", "--img-cfg-scale", "", &params.img_cfg_scale},
+        {"", "--guidance", "", &params.guidance},
+        {"", "--eta", "", &params.eta},
+        {"", "--strength", "", &params.strength},
+        {"", "--style-ratio", "", &params.style_ratio},
+        {"", "--control-strength", "", &params.control_strength},
+        {"", "--slg-scale", "", &params.slg_scale},
+        {"", "--skip-layer-start", "", &params.skip_layer_start},
+        {"", "--skip-layer-end", "", &params.skip_layer_end},
+
+    };
+
+    options.bool_options = {
+        {"", "--vae-tiling", "", true, &params.vae_tiling},
+        {"", "--control-net-cpu", "", true, &params.control_net_cpu},
+        {"", "--normalize-input", "", true, &params.normalize_input},
+        {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
+        {"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
+        {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
+        {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
+        {"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
+        {"", "--canny", "", true, &params.canny_preprocess},
+        {"-v", "--verbos", "", true, &params.verbose},
+        {"", "--color", "", true, &params.color},
+        {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
+        {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
+    };
+
+    auto on_mode_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* mode = argv[index];
+        if (mode != NULL) {
+            int mode_found = -1;
+            for (int i = 0; i < MODE_COUNT; i++) {
+                if (!strcmp(mode, modes_str[i])) {
+                    mode_found = i;
+                }
+            }
+            if (mode_found == -1) {
+                fprintf(stderr,
+                        "error: invalid mode %s, must be one of [%s]\n",
+                        mode, SD_ALL_MODES_STR);
+                exit(1);
+            }
+            params.mode = (SDMode)mode_found;
+        }
+        return 1;
+    };
+
+    auto on_type_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* arg = argv[index];
+        params.wtype    = str_to_sd_type(arg);
+        if (params.wtype == SD_TYPE_COUNT) {
+            fprintf(stderr, "error: invalid weight format %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
+    auto on_rng_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* arg = argv[index];
+        params.rng_type = str_to_rng_type(arg);
+        if (params.rng_type == RNG_TYPE_COUNT) {
+            fprintf(stderr, "error: invalid rng type %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
+    auto on_schedule_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* arg = argv[index];
+        params.schedule = str_to_schedule(arg);
+        if (params.schedule == SCHEDULE_COUNT) {
+            fprintf(stderr, "error: invalid schedule %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
+    auto on_sample_method_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* arg      = argv[index];
+        params.sample_method = str_to_sample_method(arg);
+        if (params.sample_method == SAMPLE_METHOD_COUNT) {
+            fprintf(stderr, "error: invalid sample method %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
+    auto on_seed_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        params.seed = std::stoll(argv[index]);
+        return 1;
+    };
+
+    auto on_help_arg = [&](int argc, const char** argv, int index) {
+        print_usage(argc, argv);
+        exit(0);
+        return 0;
+    };
+
+    auto on_skip_layers_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        std::string layers_str = argv[index];
+        if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') {
+            return -1;
+        }
+
+        layers_str = layers_str.substr(1, layers_str.size() - 2);
+
+        std::regex regex("[, ]+");
+        std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1);
+        std::sregex_token_iterator end;
+        std::vector<std::string> tokens(iter, end);
+        std::vector<int> layers;
+        for (const auto& token : tokens) {
+            try {
+                layers.push_back(std::stoi(token));
+            } catch (const std::invalid_argument& e) {
+                return -1;
+            }
+        }
+        params.skip_layers = layers;
+        return 1;
+    };
+
+    auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        params.ref_image_paths.push_back(argv[index]);
+        return 1;
+    };
+
+    options.manual_options = {
+        {"-M", "--mode", "", on_mode_arg},
+        {"", "--type", "", on_type_arg},
+        {"", "--rng", "", on_rng_arg},
+        {"-s", "--seed", "", on_seed_arg},
+        {"", "--sampling-method", "", on_sample_method_arg},
+        {"", "--schedule", "", on_schedule_arg},
+        {"", "--skip-layers", "", on_skip_layers_arg},
+        {"-r", "--ref-image", "", on_ref_image_arg},
+        {"-h", "--help", "", on_help_arg},
+    };
+
+    if (!parse_options(argc, argv, options)) {
         print_usage(argc, argv);
         exit(1);
     }
+
     if (params.n_threads <= 0) {
-        params.n_threads = sd_get_num_physical_cores();
+        params.n_threads = get_num_physical_cores();
     }
 
-    if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) {
+    if (params.mode != CONVERT && params.mode != VID_GEN && params.prompt.length() == 0) {
         fprintf(stderr, "error: the following arguments are required: prompt\n");
         print_usage(argc, argv);
         exit(1);
@@ -672,25 +600,19 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         exit(1);
     }
 
-    if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
-        fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
-        print_usage(argc, argv);
-        exit(1);
-    }
-
     if (params.output_path.length() == 0) {
         fprintf(stderr, "error: the following arguments are required: output_path\n");
         print_usage(argc, argv);
         exit(1);
     }
 
-    if (params.width <= 0 || params.width % 64 != 0) {
-        fprintf(stderr, "error: the width must be a multiple of 64\n");
+    if (params.width <= 0) {
+        fprintf(stderr, "error: the width must be greater than 0\n");
         exit(1);
     }
 
-    if (params.height <= 0 || params.height % 64 != 0) {
-        fprintf(stderr, "error: the height must be a multiple of 64\n");
+    if (params.height <= 0) {
+        fprintf(stderr, "error: the height must be greater than 0\n");
         exit(1);
     }
 
@@ -704,6 +626,15 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         exit(1);
     }
 
+    if (params.mode != CONVERT && params.tensor_type_rules.size() > 0) {
+        fprintf(stderr, "warning: --tensor-type-rules is currently supported only for conversion\n");
+    }
+
+    if (params.upscale_repeats < 1) {
+        fprintf(stderr, "error: upscale multiplier must be at least 1\n");
+        exit(1);
+    }
+
     if (params.seed < 0) {
         srand((int)time(NULL));
         params.seed = rand();
@@ -714,6 +645,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
             params.output_path = "output.gguf";
         }
     }
+
+    if (!isfinite(params.img_cfg_scale)) {
+        params.img_cfg_scale = params.cfg_scale;
+    }
 }
 
 static std::string sd_basename(const std::string& path) {
@@ -750,12 +685,26 @@ std::string get_image_params(SDParams params, int64_t seed) {
     parameter_string += "Seed: " + std::to_string(seed) + ", ";
     parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
     parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
-    parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
-    parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
-    if (params.schedule == KARRAS) {
-        parameter_string += " karras";
+    parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", ";
+    parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_method));
+    if (params.schedule != DEFAULT) {
+        parameter_string += " " + std::string(sd_schedule_name(params.schedule));
     }
     parameter_string += ", ";
+    for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path}) {
+        if (!te.empty()) {
+            parameter_string += "TE: " + sd_basename(te) + ", ";
+        }
+    }
+    if (!params.diffusion_model_path.empty()) {
+        parameter_string += "Unet: " + sd_basename(params.diffusion_model_path) + ", ";
+    }
+    if (!params.vae_path.empty()) {
+        parameter_string += "VAE: " + sd_basename(params.vae_path) + ", ";
+    }
+    if (params.clip_skip != -1) {
+        parameter_string += "Clip skip: " + std::to_string(params.clip_skip) + ", ";
+    }
     parameter_string += "Version: stable-diffusion.cpp";
     return parameter_string;
 }
@@ -808,6 +757,18 @@ int main(int argc, const char* argv[]) {
 
     parse_args(argc, argv, params);
 
+    sd_guidance_params_t guidance_params = {params.cfg_scale,
+                                            params.img_cfg_scale,
+                                            params.min_cfg,
+                                            params.guidance,
+                                            {
+                                                params.skip_layers.data(),
+                                                params.skip_layers.size(),
+                                                params.skip_layer_start,
+                                                params.skip_layer_end,
+                                                params.slg_scale,
+                                            }};
+
     sd_set_log_callback(sd_log_cb, (void*)&params);
 
     if (params.verbose) {
@@ -816,7 +777,7 @@ int main(int argc, const char* argv[]) {
     }
 
     if (params.mode == CONVERT) {
-        bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype);
+        bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_type_rules.c_str());
         if (!success) {
             fprintf(stderr,
                     "convert '%s'/'%s' to '%s' failed\n",
@@ -833,49 +794,18 @@ int main(int argc, const char* argv[]) {
         }
     }
 
-    if (params.mode == IMG2VID) {
+    if (params.mode == VID_GEN) {
         fprintf(stderr, "SVD support is broken, do not use it!!!\n");
         return 1;
     }
-    bool vae_decode_only = true;
-
-    std::vector<sd_image_t> kontext_imgs;
-    for (auto& path : params.kontext_image_paths) {
-        vae_decode_only       = false;
-        int c                 = 0;
-        int width             = 0;
-        int height            = 0;
-        uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3);
-        if (image_buffer == NULL) {
-            fprintf(stderr, "load image from '%s' failed\n", path.c_str());
-            return 1;
-        }
-        if (c < 3) {
-            fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
-            free(image_buffer);
-            return 1;
-        }
-        if (width <= 0) {
-            fprintf(stderr, "error: the width of image must be greater than 0\n");
-            free(image_buffer);
-            return 1;
-        }
-        if (height <= 0) {
-            fprintf(stderr, "error: the height of image must be greater than 0\n");
-            free(image_buffer);
-            return 1;
-        }
-        kontext_imgs.push_back({(uint32_t)width,
-                                (uint32_t)height,
-                                3,
-                                image_buffer});
-    }
 
+    bool vae_decode_only          = true;
     uint8_t* input_image_buffer   = NULL;
     uint8_t* control_image_buffer = NULL;
     uint8_t* mask_image_buffer    = NULL;
+    std::vector<sd_image_t> ref_images;
 
-    if (params.mode == IMG2IMG || params.mode == IMG2VID) {
+    if (params.input_path.size() > 0) {
         vae_decode_only = false;
 
         int c              = 0;
@@ -925,41 +855,83 @@ int main(int argc, const char* argv[]) {
             free(input_image_buffer);
             input_image_buffer = resized_image_buffer;
         }
+    } else if (params.ref_image_paths.size() > 0) {
+        vae_decode_only = false;
+        for (auto& path : params.ref_image_paths) {
+            int c                 = 0;
+            int width             = 0;
+            int height            = 0;
+            uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3);
+            if (image_buffer == NULL) {
+                fprintf(stderr, "load image from '%s' failed\n", path.c_str());
+                return 1;
+            }
+            if (c < 3) {
+                fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
+                free(image_buffer);
+                return 1;
+            }
+            if (width <= 0) {
+                fprintf(stderr, "error: the width of image must be greater than 0\n");
+                free(image_buffer);
+                return 1;
+            }
+            if (height <= 0) {
+                fprintf(stderr, "error: the height of image must be greater than 0\n");
+                free(image_buffer);
+                return 1;
+            }
+            ref_images.push_back({(uint32_t)width,
+                                  (uint32_t)height,
+                                  3,
+                                  image_buffer});
+        }
     }
 
-    sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
-                                  params.clip_l_path.c_str(),
-                                  params.clip_g_path.c_str(),
-                                  params.t5xxl_path.c_str(),
-                                  params.diffusion_model_path.c_str(),
-                                  params.vae_path.c_str(),
-                                  params.taesd_path.c_str(),
-                                  params.controlnet_path.c_str(),
-                                  params.lora_model_dir.c_str(),
-                                  params.embeddings_path.c_str(),
-                                  params.stacked_id_embeddings_path.c_str(),
-                                  vae_decode_only,
-                                  params.vae_tiling,
-                                  true,
-                                  params.n_threads,
-                                  params.wtype,
-                                  params.rng_type,
-                                  params.schedule,
-                                  params.clip_on_cpu,
-                                  params.control_net_cpu,
-                                  params.vae_on_cpu,
-                                  params.diffusion_flash_attn,
-                                  params.chroma_use_dit_mask,
-                                  params.chroma_use_t5_mask,
-                                  params.chroma_t5_mask_pad);
+    sd_ctx_params_t sd_ctx_params = {
+        params.model_path.c_str(),
+        params.clip_l_path.c_str(),
+        params.clip_g_path.c_str(),
+        params.t5xxl_path.c_str(),
+        params.diffusion_model_path.c_str(),
+        params.vae_path.c_str(),
+        params.taesd_path.c_str(),
+        params.control_net_path.c_str(),
+        params.lora_model_dir.c_str(),
+        params.embedding_dir.c_str(),
+        params.stacked_id_embed_dir.c_str(),
+        vae_decode_only,
+        params.vae_tiling,
+        true,
+        params.n_threads,
+        params.wtype,
+        params.rng_type,
+        params.schedule,
+        params.clip_on_cpu,
+        params.control_net_cpu,
+        params.vae_on_cpu,
+        params.diffusion_flash_attn,
+        params.diffusion_conv_direct,
+        params.vae_conv_direct,
+        params.chroma_use_dit_mask,
+        params.chroma_use_t5_mask,
+        params.chroma_t5_mask_pad,
+    };
+
+    sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
 
     if (sd_ctx == NULL) {
         printf("new_sd_ctx_t failed\n");
         return 1;
     }
 
+    sd_image_t input_image = {(uint32_t)params.width,
+                              (uint32_t)params.height,
+                              3,
+                              input_image_buffer};
+
     sd_image_t* control_image = NULL;
-    if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) {
+    if (params.control_net_path.size() > 0 && params.control_image_path.size() > 0) {
         int c                = 0;
         control_image_buffer = stbi_load(params.control_image_path.c_str(), &params.width, &params.height, &c, 3);
         if (control_image_buffer == NULL) {
@@ -995,104 +967,52 @@ int main(int argc, const char* argv[]) {
                              mask_image_buffer};
 
     sd_image_t* results;
-    if (params.mode == TXT2IMG) {
-        results = txt2img(sd_ctx,
-                          params.prompt.c_str(),
-                          params.negative_prompt.c_str(),
-                          params.clip_skip,
-                          params.cfg_scale,
-                          params.guidance,
-                          params.eta,
-                          params.width,
-                          params.height,
-                          params.sample_method,
-                          params.sample_steps,
-                          params.seed,
-                          params.batch_count,
-                          control_image,
-                          params.control_strength,
-                          params.style_ratio,
-                          params.normalize_input,
-                          params.input_id_images_path.c_str(),
-                          kontext_imgs.data(), kontext_imgs.size(),
-                          params.skip_layers.data(),
-                          params.skip_layers.size(),
-                          params.slg_scale,
-                          params.skip_layer_start,
-                          params.skip_layer_end,
-                          std::vector<sd_image_t*>());
-    } else {
-        sd_image_t input_image = {(uint32_t)params.width,
-                                  (uint32_t)params.height,
-                                  3,
-                                  input_image_buffer};
+    int expected_num_results = 1;
+    if (params.mode == IMG_GEN) {
+        sd_img_gen_params_t img_gen_params = {
+            params.prompt.c_str(),
+            params.negative_prompt.c_str(),
+            params.clip_skip,
+            guidance_params,
+            input_image,
+            ref_images.data(),
+            (int)ref_images.size(),
+            mask_image,
+            params.width,
+            params.height,
+            params.sample_method,
+            params.sample_steps,
+            params.eta,
+            params.strength,
+            params.seed,
+            params.batch_count,
+            control_image,
+            params.control_strength,
+            params.style_ratio,
+            params.normalize_input,
+            params.input_id_images_path.c_str(),
+        };
 
-        if (params.mode == IMG2VID) {
-            results = img2vid(sd_ctx,
-                              input_image,
-                              params.width,
-                              params.height,
-                              params.video_frames,
-                              params.motion_bucket_id,
-                              params.fps,
-                              params.augmentation_level,
-                              params.min_cfg,
-                              params.cfg_scale,
-                              params.sample_method,
-                              params.sample_steps,
-                              params.strength,
-                              params.seed);
-            if (results == NULL) {
-                printf("generate failed\n");
-                free_sd_ctx(sd_ctx);
-                return 1;
-            }
-            size_t last            = params.output_path.find_last_of(".");
-            std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
-            for (int i = 0; i < params.video_frames; i++) {
-                if (results[i].data == NULL) {
-                    continue;
-                }
-                std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
-                stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
-                               results[i].data, 0, get_image_params(params, params.seed + i).c_str());
-                printf("save result image to '%s'\n", final_image_path.c_str());
-                free(results[i].data);
-                results[i].data = NULL;
-            }
-            free(results);
-            free_sd_ctx(sd_ctx);
-            return 0;
-        } else {
-            results = img2img(sd_ctx,
-                              input_image,
-                              mask_image,
-                              params.prompt.c_str(),
-                              params.negative_prompt.c_str(),
-                              params.clip_skip,
-                              params.cfg_scale,
-                              params.guidance,
-                              params.eta,
-                              params.width,
-                              params.height,
-                              params.sample_method,
-                              params.sample_steps,
-                              params.strength,
-                              params.seed,
-                              params.batch_count,
-                              control_image,
-                              params.control_strength,
-                              params.style_ratio,
-                              params.normalize_input,
-                              params.input_id_images_path.c_str(),
-                              kontext_imgs.data(), kontext_imgs.size(),
-                              params.skip_layers.data(),
-                              params.skip_layers.size(),
-                              params.slg_scale,
-                              params.skip_layer_start,
-                              params.skip_layer_end,
-                              std::vector<sd_image_t*>());
-        }
+        results              = generate_image(sd_ctx, &img_gen_params);
+        expected_num_results = params.batch_count;
+    } else if (params.mode == VID_GEN) {
+        sd_vid_gen_params_t vid_gen_params = {
+            input_image,
+            params.width,
+            params.height,
+            guidance_params,
+            params.sample_method,
+            params.sample_steps,
+            params.strength,
+            params.seed,
+            params.video_frames,
+            params.motion_bucket_id,
+            params.fps,
+            params.augmentation_level,
+        };
+
+        results              = generate_video(sd_ctx, &vid_gen_params);
+        expected_num_results = params.video_frames;
     }
 
     if (results == NULL) {
@@ -1104,7 +1024,8 @@ int main(int argc, const char* argv[]) {
     int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
     if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
         upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
-                                                        params.n_threads);
+                                                        params.n_threads,
+                                                        params.diffusion_conv_direct);
 
         if (upscaler_ctx == NULL) {
             printf("new_upscaler_ctx failed\n");
@@ -1149,14 +1070,14 @@ int main(int argc, const char* argv[]) {
         dummy_name += ext;
         ext = ".png";
     }
-    for (int i = 0; i < params.batch_count; i++) {
+    for (int i = 0; i < expected_num_results; i++) {
         if (results[i].data == NULL) {
             continue;
         }
         std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
         if (is_jpg) {
             stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
-                           results[i].data, 90);
+                           results[i].data, 90, get_image_params(params, params.seed + i).c_str());
             printf("save result JPEG image to '%s'\n", final_image_path.c_str());
         } else {
             stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
@@ -1172,4 +1093,4 @@ int main(int argc, const char* argv[]) {
     free(input_image_buffer);
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/otherarch/sdcpp/mmdit.hpp b/otherarch/sdcpp/mmdit.hpp
index dee7b1c49..a93a35dfd 100644
--- a/otherarch/sdcpp/mmdit.hpp
+++ b/otherarch/sdcpp/mmdit.hpp
@@ -147,8 +147,8 @@ protected:
     int64_t hidden_size;
     float eps;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F32;
         params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
     }
 
@@ -652,13 +652,13 @@ protected:
     int64_t hidden_size;
     std::string qk_norm;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "pos_embed") != tensor_types.end()) ? tensor_types[prefix + "pos_embed"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F32;
         params["pos_embed"]  = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
     }
 
 public:
-    MMDiT(std::map<std::string, enum ggml_type>& tensor_types) {
+    MMDiT(const String2GGMLType& tensor_types = {}) {
         // input_size is always None
         // learn_sigma is always False
         // register_length is alwalys 0
@@ -869,11 +869,9 @@ public:
 struct MMDiTRunner : public GGMLRunner {
     MMDiT mmdit;
 
-    static std::map<std::string, enum ggml_type> empty_tensor_types;
-
     MMDiTRunner(ggml_backend_t backend,
-                std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
-                const std::string prefix                            = "")
+                const String2GGMLType& tensor_types = {},
+                const std::string prefix            = "")
         : GGMLRunner(backend), mmdit(tensor_types) {
         mmdit.init(params_ctx, tensor_types, prefix);
     }
diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp
index 80af64f8d..d18fd914c 100644
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
@@ -16,7 +16,6 @@
 #include "ggml-backend.h"
 #include "ggml-cpu.h"
 #include "ggml.h"
-#include "gguf.h"
 
 #include "stable-diffusion.h"
 
@@ -28,6 +27,10 @@
 #include "ggml-vulkan.h"
 #endif
 
+#ifdef SD_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
 #define ST_HEADER_SIZE_LEN 8
 
 static std::string format(const char* fmt, ...) {
@@ -111,6 +114,7 @@ const char* unused_tensors[] = {
     "model_ema.diffusion_model",
     "embedding_manager",
     "denoiser.sigmas",
+    "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
 };
 
 bool is_unused_tensor(std::string name) {
@@ -192,7 +196,7 @@ std::unordered_map<std::string, std::string> pmid_v2_name_map = {
 std::string convert_open_clip_to_hf_clip(const std::string& name) {
     std::string new_name = name;
     std::string prefix;
-     if (contains(new_name, ".enc.")) {
+    if (contains(new_name, ".enc.")) {
         // llama.cpp naming convention for T5
         size_t pos = new_name.find(".enc.");
         if (pos != std::string::npos) {
@@ -348,6 +352,10 @@ std::unordered_map<std::string, std::unordered_map<std::string, std::string>> su
             {"to_v", "v"},
             {"to_out_0", "proj_out"},
             {"group_norm", "norm"},
+            {"key", "k"},
+            {"query", "q"},
+            {"value", "v"},
+            {"proj_attn", "proj_out"},
         },
     },
     {
@@ -372,6 +380,10 @@ std::unordered_map<std::string, std::unordered_map<std::string, std::string>> su
             {"to_v", "v"},
             {"to_out.0", "proj_out"},
             {"group_norm", "norm"},
+            {"key", "k"},
+            {"query", "q"},
+            {"value", "v"},
+            {"proj_attn", "proj_out"},
         },
     },
     {
@@ -443,6 +455,10 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
         return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
     }
 
+    if (match(m, std::regex(format("unet%cadd_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) {
+        return format("model%cdiffusion_model%clabel_emb%c0%c", seq, seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
+    }
+
     if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
         std::string suffix = get_converted_suffix(m[1], m[3]);
         // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str());
@@ -480,6 +496,19 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
         return format("cond_stage_model%ctransformer%ctext_model", seq, seq) + m[0];
     }
 
+    // clip-g
+    if (match(m, std::regex(format("te%c1%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
+        return format("cond_stage_model%c1%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq, seq) + m[0] + seq + m[1];
+    }
+
+    if (match(m, std::regex(format("te%c1%ctext_model(.*)", seq, seq)), key)) {
+        return format("cond_stage_model%c1%ctransformer%ctext_model", seq, seq, seq) + m[0];
+    }
+
+    if (match(m, std::regex(format("te%c1%ctext_projection", seq, seq)), key)) {
+        return format("cond_stage_model%c1%ctransformer%ctext_model%ctext_projection", seq, seq, seq, seq);
+    }
+
     // vae
     if (match(m, std::regex(format("vae%c(.*)%cconv_norm_out(.*)", seq, seq)), key)) {
         return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str());
@@ -616,6 +645,8 @@ std::string convert_tensor_name(std::string name) {
             std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.');
             if (new_key.empty()) {
                 new_name = name;
+            } else if (new_key == "cond_stage_model.1.transformer.text_model.text_projection") {
+                new_name = new_key;
             } else {
                 new_name = new_key + "." + network_part;
             }
@@ -631,7 +662,7 @@ std::string convert_tensor_name(std::string name) {
     return new_name;
 }
 
-void add_preprocess_tensor_storage_types(std::map<std::string, enum ggml_type>& tensor_storages_types, std::string name, enum ggml_type type) {
+void add_preprocess_tensor_storage_types(String2GGMLType& tensor_storages_types, std::string name, enum ggml_type type) {
     std::string new_name = convert_tensor_name(name);
 
     if (new_name.find("cond_stage_model") != std::string::npos && ends_with(new_name, "attn.in_proj_weight")) {
@@ -798,6 +829,7 @@ void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
         dst[i] = f8_e4m3_to_f16(src[i]);
     }
 }
+
 void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
     // support inplace op
     for (int64_t i = n - 1; i >= 0; i--) {
@@ -805,6 +837,20 @@ void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
     }
 }
 
+void f64_to_f32_vec(double* src, float* dst, int64_t n) {
+    // support inplace op
+    for (int64_t i = 0; i < n; i++) {
+        dst[i] = (float)src[i];
+    }
+}
+
+void i64_to_i32_vec(int64_t* src, int32_t* dst, int64_t n) {
+    // support inplace op
+    for (int64_t i = 0; i < n; i++) {
+        dst[i] = (int32_t)src[i];
+    }
+}
+
 void convert_tensor(void* src,
                     ggml_type src_type,
                     void* dst,
@@ -1050,10 +1096,14 @@ ggml_type str_to_ggml_type(const std::string& dtype) {
         ttype = GGML_TYPE_F32;
     } else if (dtype == "F32") {
         ttype = GGML_TYPE_F32;
+    } else if (dtype == "F64") {
+        ttype = GGML_TYPE_F32;
     } else if (dtype == "F8_E4M3") {
         ttype = GGML_TYPE_F16;
     } else if (dtype == "F8_E5M2") {
         ttype = GGML_TYPE_F16;
+    } else if (dtype == "I64") {
+        ttype = GGML_TYPE_I32;
     }
     return ttype;
 }
@@ -1071,6 +1121,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
     std::ifstream file(fpath, std::ios::binary);
     if (!file.is_open()) {
         LOG_ERROR("failed to open '%s'", file_path.c_str());
+        file_paths_.pop_back();
         return false;
     }
 
@@ -1082,6 +1133,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
     // read header size
     if (file_size_ <= ST_HEADER_SIZE_LEN) {
         LOG_ERROR("invalid safetensor file '%s'", file_path.c_str());
+        file_paths_.pop_back();
         return false;
     }
 
@@ -1095,6 +1147,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
     size_t header_size_ = read_u64(header_size_buf);
     if (header_size_ >= file_size_) {
         LOG_ERROR("invalid safetensor file '%s'", file_path.c_str());
+        file_paths_.pop_back();
         return false;
     }
 
@@ -1105,6 +1158,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
     file.read(header_buf.data(), header_size_);
     if (!file) {
         LOG_ERROR("read safetensors header failed: '%s'", file_path.c_str());
+        file_paths_.pop_back();
         return false;
     }
 
@@ -1176,6 +1230,14 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
             tensor_storage.is_f8_e5m2 = true;
             // f8 -> f16
             GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+        } else if (dtype == "F64") {
+            tensor_storage.is_f64 = true;
+            // f64 -> f32
+            GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
+        } else if (dtype == "I64") {
+            tensor_storage.is_i64 = true;
+            // i64 -> i32
+            GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
         } else {
             GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size);
         }
@@ -1192,18 +1254,45 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
 /*================================================= DiffusersModelLoader ==================================================*/
 
 bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const std::string& prefix) {
-    std::string unet_path = path_join(file_path, "unet/diffusion_pytorch_model.safetensors");
-    std::string vae_path  = path_join(file_path, "vae/diffusion_pytorch_model.safetensors");
-    std::string clip_path = path_join(file_path, "text_encoder/model.safetensors");
+    std::string unet_path   = path_join(file_path, "unet/diffusion_pytorch_model.safetensors");
+    std::string vae_path    = path_join(file_path, "vae/diffusion_pytorch_model.safetensors");
+    std::string clip_path   = path_join(file_path, "text_encoder/model.safetensors");
+    std::string clip_g_path = path_join(file_path, "text_encoder_2/model.safetensors");
 
     if (!init_from_safetensors_file(unet_path, "unet.")) {
         return false;
     }
+    for (auto ts : tensor_storages) {
+        if (ts.name.find("add_embedding") != std::string::npos || ts.name.find("label_emb") != std::string::npos) {
+            // probably SDXL
+            LOG_DEBUG("Fixing name for SDXL output blocks.2.2");
+            for (auto& tensor_storage : tensor_storages) {
+                int len  = 34;
+                auto pos = tensor_storage.name.find("unet.up_blocks.0.upsamplers.0.conv");
+                if (pos == std::string::npos) {
+                    len = 44;
+                    pos = tensor_storage.name.find("model.diffusion_model.output_blocks.2.1.conv");
+                }
+                if (pos != std::string::npos) {
+                    tensor_storage.name = "model.diffusion_model.output_blocks.2.2.conv" + tensor_storage.name.substr(len);
+                    LOG_DEBUG("NEW NAME: %s", tensor_storage.name.c_str());
+                    add_preprocess_tensor_storage_types(tensor_storages_types, tensor_storage.name, tensor_storage.type);
+                }
+            }
+            break;
+        }
+    }
+
     if (!init_from_safetensors_file(vae_path, "vae.")) {
-        return false;
+        LOG_WARN("Couldn't find working VAE in %s", file_path.c_str());
+        // return false;
     }
     if (!init_from_safetensors_file(clip_path, "te.")) {
-        return false;
+        LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str());
+        // return false;
+    }
+    if (!init_from_safetensors_file(clip_g_path, "te.1.")) {
+        LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str());
     }
     return true;
 }
@@ -1566,6 +1655,15 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
     return true;
 }
 
+bool ModelLoader::model_is_unet() {
+    for (auto& tensor_storage : tensor_storages) {
+        if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool ModelLoader::has_diffusion_model_tensors()
 {
     for (auto& tensor_storage : tensor_storages) {
@@ -1598,7 +1696,7 @@ SDVersion ModelLoader::get_sd_version() {
             if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) {
                 return VERSION_SD3;
             }
-            if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) {
+            if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos || tensor_storage.name.find("unet.down_blocks.") != std::string::npos) {
                 is_unet = true;
                 if (has_multiple_encoders) {
                     is_xl = true;
@@ -1607,7 +1705,7 @@ SDVersion ModelLoader::get_sd_version() {
                     }
                 }
             }
-            if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
+            if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos || tensor_storage.name.find("te.1") != std::string::npos) {
                 has_multiple_encoders = true;
                 if (is_unet) {
                     is_xl = true;
@@ -1629,7 +1727,7 @@ SDVersion ModelLoader::get_sd_version() {
             token_embedding_weight = tensor_storage;
             // break;
         }
-        if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight") {
+        if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight" || tensor_storage.name == "unet.conv_in.weight") {
             input_block_weight  = tensor_storage;
             input_block_checked = true;
             if (found_family) {
@@ -1638,10 +1736,14 @@ SDVersion ModelLoader::get_sd_version() {
         }
     }
     bool is_inpaint = input_block_weight.ne[2] == 9;
+    bool is_ip2p    = input_block_weight.ne[2] == 8;
     if (is_xl) {
         if (is_inpaint) {
             return VERSION_SDXL_INPAINT;
         }
+        if (is_ip2p) {
+            return VERSION_SDXL_PIX2PIX;
+        }
         return VERSION_SDXL;
     }
 
@@ -1657,6 +1759,9 @@ SDVersion ModelLoader::get_sd_version() {
         if (is_inpaint) {
             return VERSION_SD1_INPAINT;
         }
+        if (is_ip2p) {
+            return VERSION_SD1_PIX2PIX;
+        }
         return VERSION_SD1;
     } else if (token_embedding_weight.ne[0] == 1024) {
         if (is_inpaint) {
@@ -1714,7 +1819,7 @@ ggml_type ModelLoader::get_diffusion_model_wtype() {
             continue;
         }
 
-        if (tensor_storage.name.find("model.diffusion_model.") == std::string::npos) {
+        if (tensor_storage.name.find("model.diffusion_model.") == std::string::npos && tensor_storage.name.find("unet.") == std::string::npos) {
             continue;
         }
 
@@ -1883,6 +1988,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
         };
         int tensor_count = 0;
         int64_t t1       = ggml_time_ms();
+        bool partial     = false;
         for (auto& tensor_storage : processed_tensor_storages) {
             if (tensor_storage.file_index != file_index) {
                 ++tensor_count;
@@ -1907,7 +2013,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                 // for the CPU and Metal backend, we can copy directly into the tensor
                 if (tensor_storage.type == dst_tensor->type) {
                     GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
-                    read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
+                    if (tensor_storage.is_f64 || tensor_storage.is_i64) {
+                        read_buffer.resize(tensor_storage.nbytes_to_read());
+                        read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
+                    } else {
+                        read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
+                    }
 
                     if (tensor_storage.is_bf16) {
                         // inplace op
@@ -1918,9 +2029,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                     } else if (tensor_storage.is_f8_e5m2) {
                         // inplace op
                         f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
+                    } else if (tensor_storage.is_f64) {
+                        f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
+                    } else if (tensor_storage.is_i64) {
+                        i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
                     }
                 } else {
-                    read_buffer.resize(tensor_storage.nbytes());
+                    read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
                     read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
 
                     if (tensor_storage.is_bf16) {
@@ -1932,13 +2047,19 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                     } else if (tensor_storage.is_f8_e5m2) {
                         // inplace op
                         f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                    } else if (tensor_storage.is_f64) {
+                        // inplace op
+                        f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                    } else if (tensor_storage.is_i64) {
+                        // inplace op
+                        i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
                     }
 
                     convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
                                    dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
                 }
             } else {
-                read_buffer.resize(tensor_storage.nbytes());
+                read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
                 read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
 
                 if (tensor_storage.is_bf16) {
@@ -1950,6 +2071,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                 } else if (tensor_storage.is_f8_e5m2) {
                     // inplace op
                     f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                } else if (tensor_storage.is_f64) {
+                    // inplace op
+                    f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                } else if (tensor_storage.is_i64) {
+                    // inplace op
+                    i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
                 }
 
                 if (tensor_storage.type == dst_tensor->type) {
@@ -1964,20 +2091,26 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                     ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
                 }
             }
-            int64_t t2 = ggml_time_ms();
+            size_t tensor_max = processed_tensor_storages.size();
+            int64_t t2        = ggml_time_ms();
+            // kcpp throttle progress printing
             ++tensor_count;
-            if(tensor_count<2 || tensor_count%5==0 || (tensor_count+10) > processed_tensor_storages.size())
+            if(tensor_count<2 || tensor_count%5==0 || (tensor_count+10) > tensor_max)
             {
-                //throttle progress printing
-                pretty_progress(tensor_count, processed_tensor_storages.size(), (t2 - t1) / 1000.0f);
+                pretty_progress(tensor_count, tensor_max, (t2 - t1) / 1000.0f);
             }
-            t1 = t2;
+            t1      = t2;
+            partial = tensor_count != tensor_max;
         }
 
         if (zip != NULL) {
             zip_close(zip);
         }
 
+        if (partial) {
+            printf("\n");
+        }
+
         if (!success) {
             break;
         }
@@ -2055,6 +2188,41 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
     return true;
 }
 
+std::vector<std::pair<std::string, ggml_type>> parse_tensor_type_rules(const std::string& tensor_type_rules) {
+    std::vector<std::pair<std::string, ggml_type>> result;
+    for (const auto& item : splitString(tensor_type_rules, ',')) {
+        if (item.size() == 0)
+            continue;
+        std::string::size_type pos = item.find('=');
+        if (pos == std::string::npos) {
+            LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str());
+            continue;
+        }
+        std::string tensor_pattern = item.substr(0, pos);
+        std::string type_name      = item.substr(pos + 1);
+
+        ggml_type tensor_type = GGML_TYPE_COUNT;
+
+        if (type_name == "f32") {
+            tensor_type = GGML_TYPE_F32;
+        } else {
+            for (size_t i = 0; i < SD_TYPE_COUNT; i++) {
+                auto trait = ggml_get_type_traits((ggml_type)i);
+                if (trait->to_float && trait->type_size && type_name == trait->type_name) {
+                    tensor_type = (ggml_type)i;
+                }
+            }
+        }
+
+        if (tensor_type != GGML_TYPE_COUNT) {
+            result.emplace_back(tensor_pattern, tensor_type);
+        } else {
+            LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str());
+        }
+    }
+    return result;
+}
+
 bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
     const std::string& name = tensor_storage.name;
     if (type != GGML_TYPE_COUNT) {
@@ -2086,7 +2254,7 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
     return false;
 }
 
-bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
+bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules_str) {
     auto backend    = ggml_backend_cpu_init();
     size_t mem_size = 1 * 1024 * 1024;  // for padding
     mem_size += tensor_storages.size() * ggml_tensor_overhead();
@@ -2096,12 +2264,23 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
 
     gguf_context* gguf_ctx = gguf_init_empty();
 
+    auto tensor_type_rules = parse_tensor_type_rules(tensor_type_rules_str);
+
     auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
         const std::string& name = tensor_storage.name;
+        ggml_type tensor_type   = tensor_storage.type;
+        ggml_type dst_type      = type;
 
-        ggml_type tensor_type = tensor_storage.type;
-        if (tensor_should_be_converted(tensor_storage, type)) {
-            tensor_type = type;
+        for (const auto& tensor_type_rule : tensor_type_rules) {
+            std::regex pattern(tensor_type_rule.first);
+            if (std::regex_search(name, pattern)) {
+                dst_type = tensor_type_rule.second;
+                break;
+            }
+        }
+
+        if (tensor_should_be_converted(tensor_storage, dst_type)) {
+            tensor_type = dst_type;
         }
 
         ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
@@ -2160,7 +2339,7 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
     return mem_size;
 }
 
-bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) {
+bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, const char* tensor_type_rules) {
     ModelLoader model_loader;
 
     if (!model_loader.init_from_file(input_path)) {
@@ -2174,6 +2353,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
             return false;
         }
     }
-    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type);
+    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules);
     return success;
-}
\ No newline at end of file
+}
diff --git a/otherarch/sdcpp/model.h b/otherarch/sdcpp/model.h
index 9da9eb416..1c1468fc1 100644
--- a/otherarch/sdcpp/model.h
+++ b/otherarch/sdcpp/model.h
@@ -12,19 +12,21 @@
 
 #include "ggml-backend.h"
 #include "ggml.h"
+#include "gguf.h"
 #include <nlohmann/json.hpp>
 #include "zip.h"
-#include "gguf.h"
 
 #define SD_MAX_DIMS 5
 
 enum SDVersion {
     VERSION_SD1,
     VERSION_SD1_INPAINT,
+    VERSION_SD1_PIX2PIX,
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SDXL,
     VERSION_SDXL_INPAINT,
+    VERSION_SDXL_PIX2PIX,
     VERSION_SVD,
     VERSION_SD3,
     VERSION_FLUX,
@@ -47,7 +49,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
 }
 
 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
         return true;
     }
     return false;
@@ -61,7 +63,7 @@ static inline bool sd_version_is_sd2(SDVersion version) {
 }
 
 static inline bool sd_version_is_sdxl(SDVersion version) {
-    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) {
+    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) {
         return true;
     }
     return false;
@@ -81,6 +83,14 @@ static inline bool sd_version_is_dit(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_unet_edit(SDVersion version) {
+    return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX;
+}
+
+static bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {
+    return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version);
+}
+
 enum PMVersion {
     PM_VERSION_1,
     PM_VERSION_2,
@@ -92,6 +102,8 @@ struct TensorStorage {
     bool is_bf16            = false;
     bool is_f8_e4m3         = false;
     bool is_f8_e5m2         = false;
+    bool is_f64             = false;
+    bool is_i64             = false;
     int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
     int n_dims              = 0;
 
@@ -123,6 +135,8 @@ struct TensorStorage {
     int64_t nbytes_to_read() const {
         if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
             return nbytes() / 2;
+        } else if (is_f64 || is_i64) {
+            return nbytes() * 2;
         } else {
             return nbytes();
         }
@@ -173,6 +187,10 @@ struct TensorStorage {
             type_name = "f8_e4m3";
         } else if (is_f8_e5m2) {
             type_name = "f8_e5m2";
+        } else if (is_f64) {
+            type_name = "f64";
+        } else if (is_i64) {
+            type_name = "i64";
         }
         ss << name << " | " << type_name << " | ";
         ss << n_dims << " [";
@@ -189,6 +207,8 @@ struct TensorStorage {
 
 typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
 
+typedef std::map<std::string, enum ggml_type> String2GGMLType;
+
 class ModelLoader {
 protected:
     std::vector<std::string> file_paths_;
@@ -207,10 +227,11 @@ protected:
     bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
 
 public:
-    std::map<std::string, enum ggml_type> tensor_storages_types;
+    String2GGMLType tensor_storages_types;
 
     bool init_from_file(const std::string& file_path, const std::string& prefix = "");
     bool has_diffusion_model_tensors();
+    bool model_is_unet();
     SDVersion get_sd_version();
     ggml_type get_sd_wtype();
     ggml_type get_conditioner_wtype();
@@ -222,7 +243,7 @@ public:
                       ggml_backend_t backend,
                       std::set<std::string> ignore_tensors = {});
 
-    bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+    bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
     int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
     ~ModelLoader() = default;
@@ -231,4 +252,4 @@ public:
     static std::string load_t5_tokenizer_json();
 };
 
-#endif  // __MODEL_H__
\ No newline at end of file
+#endif  // __MODEL_H__
diff --git a/otherarch/sdcpp/pmid.hpp b/otherarch/sdcpp/pmid.hpp
index ea9f02eb6..e2a0f6282 100644
--- a/otherarch/sdcpp/pmid.hpp
+++ b/otherarch/sdcpp/pmid.hpp
@@ -623,7 +623,12 @@ public:
     std::vector<float> zeros_right;
 
 public:
-    PhotoMakerIDEncoder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f)
+    PhotoMakerIDEncoder(ggml_backend_t backend,
+                        const String2GGMLType& tensor_types,
+                        const std::string prefix,
+                        SDVersion version = VERSION_SDXL,
+                        PMVersion pm_v    = PM_VERSION_1,
+                        float sty         = 20.f)
         : GGMLRunner(backend),
           version(version),
           pm_version(pm_v),
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
index 7557814ea..d967bbf0e 100644
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -134,6 +134,27 @@ static bool sd_is_quiet = false;
 static std::string sdmodelfilename = "";
 static bool photomaker_enabled = false;
 
+static void set_sd_vae_tiling(sd_ctx_t* ctx, bool tiling)
+{
+    ctx->sd->vae_tiling = tiling;
+}
+
+static int get_loaded_sd_version(sd_ctx_t* ctx)
+{
+    return ctx->sd->version;
+}
+
+static bool loaded_model_is_chroma(sd_ctx_t* ctx)
+{
+    if (ctx != nullptr && ctx->sd != nullptr) {
+        auto maybe_flux = std::dynamic_pointer_cast<FluxModel>(ctx->sd->diffusion_model);
+        if (maybe_flux != nullptr) {
+            return maybe_flux->flux.flux_params.is_chroma;
+        }
+    }
+    return false;
+}
+
 bool sdtype_load_model(const sd_load_model_inputs inputs) {
     sd_is_quiet = inputs.quiet;
     set_sd_quiet(sd_is_quiet);
@@ -160,6 +181,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     {
         taesdpath = executable_path + "taesd.embd";
         printf("With TAE SD VAE: %s\n",taesdpath.c_str());
+        if (cfg_tiled_vae_threshold < 8192) {
+            printf("  disabling VAE tiling for TAESD\n");
+            cfg_tiled_vae_threshold = 8192;
+        }
     }
     else if(vaefilename!="")
     {
@@ -267,31 +292,35 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
         sd_params->control_net_cpu);
     }
 
-    sd_ctx = new_sd_ctx(sd_params->model_path.c_str(),
-                        sd_params->clip_l_path.c_str(),
-                        sd_params->clip_g_path.c_str(),
-                        sd_params->t5xxl_path.c_str(),
-                        sd_params->diffusion_model_path.c_str(),
-                        sd_params->vae_path.c_str(),
-                        sd_params->taesd_path.c_str(),
-                        sd_params->controlnet_path.c_str(),
-                        sd_params->lora_model_dir.c_str(),
-                        sd_params->embeddings_path.c_str(),
-                        sd_params->stacked_id_embeddings_path.c_str(),
-                        vae_decode_only,
-                        sd_params->vae_tiling,
-                        free_param,
-                        sd_params->n_threads,
-                        sd_params->wtype,
-                        sd_params->rng_type,
-                        sd_params->schedule,
-                        sd_params->clip_on_cpu,
-                        sd_params->control_net_cpu,
-                        sd_params->vae_on_cpu,
-                        sd_params->diffusion_flash_attn,
-                        sd_params->chroma_use_dit_mask,
-                        sd_params->chroma_use_t5_mask,
-                        sd_params->chroma_t5_mask_pad);
+    sd_ctx_params_t params;
+    sd_ctx_params_init(&params);
+    params.model_path = sd_params->model_path.c_str();
+    params.clip_l_path = sd_params->clip_l_path.c_str();
+    params.clip_g_path = sd_params->clip_g_path.c_str();
+    params.t5xxl_path = sd_params->t5xxl_path.c_str();
+    params.diffusion_model_path = sd_params->diffusion_model_path.c_str();
+    params.vae_path = sd_params->vae_path.c_str();
+    params.taesd_path = sd_params->taesd_path.c_str();
+    params.control_net_path = sd_params->controlnet_path.c_str();
+    params.lora_model_dir = sd_params->lora_model_dir.c_str();
+    params.embedding_dir = sd_params->embeddings_path.c_str();
+    params.stacked_id_embed_dir = sd_params->stacked_id_embeddings_path.c_str();
+    params.vae_decode_only = vae_decode_only;
+    params.vae_tiling = sd_params->vae_tiling;
+    params.free_params_immediately = free_param;
+    params.n_threads = sd_params->n_threads;
+    params.wtype = sd_params->wtype;
+    params.rng_type = sd_params->rng_type;
+    params.schedule = sd_params->schedule;
+    params.keep_clip_on_cpu = sd_params->clip_on_cpu;
+    params.keep_control_net_on_cpu = sd_params->control_net_cpu;
+    params.keep_vae_on_cpu = sd_params->vae_on_cpu;
+    params.diffusion_flash_attn = sd_params->diffusion_flash_attn;
+    params.chroma_use_dit_mask = sd_params->chroma_use_dit_mask;
+    params.chroma_use_t5_mask = sd_params->chroma_use_t5_mask;
+    params.chroma_t5_mask_pad = sd_params->chroma_t5_mask_pad;
+
+    sd_ctx = new_sd_ctx(&params);
 
     if (sd_ctx == NULL) {
         printf("\nError: KCPP SD Failed to create context!\nIf using Flux/SD3.5, make sure you have ALL files required (e.g. VAE, T5, Clip...) or baked in!\n");
@@ -305,7 +334,6 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     if(lorafilename!="" && inputs.lora_multiplier>0)
     {
         printf("\nApply LoRA...\n");
-       // sd_ctx->sd->set_pending_lora(lorafilename,inputs.lora_multiplier);
         sd_ctx->sd->apply_lora_from_file(lorafilename,inputs.lora_multiplier);
     }
 
@@ -482,11 +510,29 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
     auto loadedsdver = get_loaded_sd_version(sd_ctx);
     if (loadedsdver == SDVersion::VERSION_FLUX)
     {
-        if (!sd_loaded_chroma()) {
-            sd_params->cfg_scale = 1;  //non chroma clamp cfg scale
+        if (loaded_model_is_chroma(sd_ctx)) {
+            if (sd_params->diffusion_flash_attn && sd_params->chroma_use_dit_mask) {
+                if (!sd_is_quiet && sddebugmode) {
+                    printf("Chroma: flash attention is on, disabling DiT mask\n");
+                }
+                sd_params->chroma_use_dit_mask = false;
+            }
+        }
+        else {
+            if (sd_params->cfg_scale != 1.0f) {
+                //non chroma clamp cfg scale
+                if (!sd_is_quiet && sddebugmode) {
+                    printf("Flux: clamping CFG Scale to 1\n");
+                }
+                sd_params->cfg_scale = 1.0f;
+            }
         }
         if (sampler == "euler a" || sampler == "k_euler_a" || sampler == "euler_a") {
-            sampler = "euler";  //euler a broken on flux
+            //euler a broken on flux
+            if (!sd_is_quiet && sddebugmode) {
+                printf("Flux: switching Euler A to Euler\n");
+            }
+            sampler = "euler";
         }
     }
 
@@ -521,17 +567,6 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
     bool dotile = (sd_params->width*sd_params->height > cfg_tiled_vae_threshold*cfg_tiled_vae_threshold);
     set_sd_vae_tiling(sd_ctx,dotile); //changes vae tiling, prevents memory related crash/oom
 
-    if (sd_params->clip_skip <= 0) {
-        // workaround for clip_skip being "stuck" at the previous requested value
-        // 2 is the default for all recent base models (SD2, SDXL, Flux, SD3)
-        if (sd_version_is_sd1((SDVersion)loadedsdver)) {
-            sd_params->clip_skip = 1;
-        }
-        else {
-            sd_params->clip_skip = 2;
-        }
-    }
-
     //for img2img
     sd_image_t input_image = {0,0,0,nullptr};
     std::vector<sd_image_t> extraimage_references;
@@ -663,25 +698,25 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
         }
     }
 
-    std::vector<sd_image_t> kontext_imgs;
-    if(extra_image_data.size()>0 && loadedsdver==SDVersion::VERSION_FLUX && !sd_loaded_chroma())
+    std::vector<sd_image_t> reference_imgs;
+    if(extra_image_data.size()>0 && loadedsdver==SDVersion::VERSION_FLUX && !loaded_model_is_chroma(sd_ctx))
     {
         for(int i=0;i<extra_image_data.size();++i)
         {
-            kontext_imgs.push_back(extraimage_references[i]);
+            reference_imgs.push_back(extraimage_references[i]);
         }
         if(!sd_is_quiet && sddebugmode==1)
         {
-            printf("\nFlux Kontext: Using %d reference images\n",kontext_imgs.size());
+            printf("\nFlux Kontext: Using %d reference images\n",reference_imgs.size());
         }
     }
 
-    std::vector<sd_image_t*> photomaker_imgs;
+    std::vector<sd_image_t> photomaker_imgs;
     if(photomaker_enabled && extra_image_data.size()>0)
     {
         for(int i=0;i<extra_image_data.size();++i)
         {
-            photomaker_imgs.push_back(&extraimage_references[i]);
+            photomaker_imgs.push_back(extraimage_references[i]);
         }
         if(!sd_is_quiet && sddebugmode==1)
         {
@@ -689,6 +724,41 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
         }
     }
 
+    sd_img_gen_params_t params;
+    sd_img_gen_params_init (&params);
+
+    params.prompt = sd_params->prompt.c_str();
+    params.negative_prompt = sd_params->negative_prompt.c_str();
+    params.clip_skip = sd_params->clip_skip;
+    params.guidance.txt_cfg = sd_params->cfg_scale;
+    params.guidance.img_cfg = sd_params->cfg_scale;
+    params.guidance.distilled_guidance = sd_params->guidance;
+    params.eta = sd_params->eta;
+    params.width = sd_params->width;
+    params.height = sd_params->height;
+    params.sample_method = sd_params->sample_method;
+    params.sample_steps = sd_params->sample_steps;
+    params.seed = sd_params->seed;
+    params.batch_count = sd_params->batch_count;
+    params.control_cond = control_image;
+    params.control_strength = sd_params->control_strength;
+    params.style_strength = sd_params->style_ratio;
+    params.normalize_input = sd_params->normalize_input;
+    params.input_id_images_path = sd_params->input_id_images_path.c_str();
+
+    params.guidance.slg.layers = sd_params->skip_layers.data();
+    params.guidance.slg.layer_count = sd_params->skip_layers.size();
+    params.guidance.slg.layer_start = sd_params->skip_layer_start;
+    params.guidance.slg.layer_end = sd_params->skip_layer_end;
+    params.guidance.slg.scale = sd_params->slg_scale;
+
+    params.ref_images = reference_imgs.data();
+    params.ref_images_count = reference_imgs.size();
+
+    kcpp_img_gen_params_t extra_params = {};
+    extra_params.photomaker_references = photomaker_imgs.data();
+    extra_params.photomaker_reference_count = photomaker_imgs.size();
+
     if (sd_params->mode == TXT2IMG) {
 
         if(!sd_is_quiet && sddebugmode==1)
@@ -708,32 +778,8 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
             sd_params->control_strength);
         }
 
+        results = generate_image(sd_ctx, &params, &extra_params);
 
-        results = txt2img(sd_ctx,
-                          sd_params->prompt.c_str(),
-                          sd_params->negative_prompt.c_str(),
-                          sd_params->clip_skip,
-                          sd_params->cfg_scale,
-                          sd_params->guidance,
-                          sd_params->eta,
-                          sd_params->width,
-                          sd_params->height,
-                          sd_params->sample_method,
-                          sd_params->sample_steps,
-                          sd_params->seed,
-                          sd_params->batch_count,
-                          control_image,
-                          sd_params->control_strength,
-                          sd_params->style_ratio,
-                          sd_params->normalize_input,
-                          sd_params->input_id_images_path.c_str(),
-                          kontext_imgs.data(), kontext_imgs.size(),
-                          sd_params->skip_layers.data(),
-                          sd_params->skip_layers.size(),
-                          sd_params->slg_scale,
-                          sd_params->skip_layer_start,
-                          sd_params->skip_layer_end,
-                          photomaker_imgs);
     } else {
 
         if (sd_params->width <= 0 || sd_params->width % 64 != 0 || sd_params->height <= 0 || sd_params->height % 64 != 0) {
@@ -839,34 +885,12 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
             sd_params->strength);
         }
 
-        results = img2img(sd_ctx,
-                            input_image,
-                            mask_image,
-                            sd_params->prompt.c_str(),
-                            sd_params->negative_prompt.c_str(),
-                            sd_params->clip_skip,
-                            sd_params->cfg_scale,
-                            sd_params->guidance,
-                            sd_params->eta,
-                            sd_params->width,
-                            sd_params->height,
-                            sd_params->sample_method,
-                            sd_params->sample_steps,
-                            sd_params->strength,
-                            sd_params->seed,
-                            sd_params->batch_count,
-                            control_image,
-                            sd_params->control_strength,
-                            sd_params->style_ratio,
-                            sd_params->normalize_input,
-                            sd_params->input_id_images_path.c_str(),
-                            kontext_imgs.data(), kontext_imgs.size(),
-                            sd_params->skip_layers.data(),
-                            sd_params->skip_layers.size(),
-                            sd_params->slg_scale,
-                            sd_params->skip_layer_start,
-                            sd_params->skip_layer_end,
-                            photomaker_imgs);
+        params.strength = sd_params->strength;
+        params.init_image = input_image;
+        params.mask_image = mask_image;
+
+        results = generate_image(sd_ctx, &params, &extra_params);
+
     }
 
     if (results == NULL) {
diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp
index 322e888bd..ec9194d22 100644
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -16,21 +16,26 @@
 #include "tae.hpp"
 #include "vae.hpp"
 
+// #define STB_IMAGE_IMPLEMENTATION
+// #define STB_IMAGE_STATIC
 #include "stb_image.h"
 
 #include <inttypes.h>
 #include <cinttypes>
-static std::string pending_apply_lora_fname = "";
-static float pending_apply_lora_power = 1.0f;
-static bool is_loaded_chroma = false;
+
+// #define STB_IMAGE_WRITE_IMPLEMENTATION
+// #define STB_IMAGE_WRITE_STATIC
+// #include "stb_image_write.h"
 
 const char* model_version_to_str[] = {
     "SD 1.x",
     "SD 1.x Inpaint",
+    "Instruct-Pix2Pix",
     "SD 2.x",
     "SD 2.x Inpaint",
     "SDXL",
     "SDXL Inpaint",
+    "SDXL Instruct-Pix2Pix",
     "SVD",
     "SD3.x",
     "Flux",
@@ -48,8 +53,7 @@ const char* sampling_methods_str[] = {
     "iPNDM_v",
     "LCM",
     "DDIM \"trailing\"",
-    "TCD"
-};
+    "TCD"};
 
 /*================================================== Helper Functions ================================================*/
 
@@ -104,6 +108,9 @@ public:
     bool vae_tiling           = false;
     bool stacked_id           = false;
 
+    bool is_using_v_parameterization     = false;
+    bool is_using_edm_v_parameterization = false;
+
     std::map<std::string, struct ggml_tensor*> tensors;
 
     std::string lora_model_dir;
@@ -114,22 +121,6 @@ public:
 
     StableDiffusionGGML() = default;
 
-    StableDiffusionGGML(int n_threads,
-                        bool vae_decode_only,
-                        bool free_params_immediately,
-                        std::string lora_model_dir,
-                        rng_type_t rng_type)
-        : n_threads(n_threads),
-          vae_decode_only(vae_decode_only),
-          free_params_immediately(free_params_immediately),
-          lora_model_dir(lora_model_dir) {
-        if (rng_type == STD_DEFAULT_RNG) {
-            rng = std::make_shared<STDDefaultRNG>();
-        } else if (rng_type == CUDA_RNG) {
-            rng = std::make_shared<PhiloxRNG>();
-        }
-    }
-
     ~StableDiffusionGGML() {
         if (clip_backend != backend) {
             ggml_backend_free(clip_backend);
@@ -143,36 +134,14 @@ public:
         ggml_backend_free(backend);
     }
 
-    bool load_from_file(const std::string& model_path,
-                        const std::string& clip_l_path,
-                        const std::string& clip_g_path,
-                        const std::string& t5xxl_path,
-                        const std::string& diffusion_model_path,
-                        const std::string& vae_path,
-                        const std::string control_net_path,
-                        const std::string embeddings_path,
-                        const std::string id_embeddings_path_original,
-                        const std::string& taesd_path,
-                        bool vae_tiling_,
-                        ggml_type wtype,
-                        schedule_t schedule,
-                        bool clip_on_cpu,
-                        bool control_net_cpu,
-                        bool vae_on_cpu,
-                        bool diffusion_flash_attn,
-                        bool chroma_use_dit_mask,
-                        bool chroma_use_t5_mask,
-                        int chroma_t5_mask_pad) {
-        use_tiny_autoencoder = taesd_path.size() > 0;
-        std::string taesd_path_fixed = taesd_path;
-        is_loaded_chroma = false;
-        std::string id_embeddings_path = id_embeddings_path_original;
+    void init_backend() {
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
 #endif
 #ifdef SD_USE_METAL
         LOG_DEBUG("Using Metal backend");
+        ggml_log_set(ggml_log_callback_default, nullptr);
         backend = ggml_backend_metal_init();
 #endif
 #ifdef SD_USE_VULKAN
@@ -184,6 +153,14 @@ public:
             LOG_WARN("Failed to initialize Vulkan backend");
         }
 #endif
+#ifdef SD_USE_OPENCL
+        LOG_DEBUG("Using OpenCL backend");
+        // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs
+        backend = ggml_backend_opencl_init();
+        if (!backend) {
+            LOG_WARN("Failed to initialize OpenCL backend");
+        }
+#endif
 #ifdef SD_USE_SYCL
         LOG_DEBUG("Using SYCL backend");
         backend = ggml_backend_sycl_init(0);
@@ -193,80 +170,101 @@ public:
             LOG_DEBUG("Using CPU backend");
             backend = ggml_backend_cpu_init();
         }
+    }
+
+    bool init(const sd_ctx_params_t* sd_ctx_params) {
+        n_threads               = sd_ctx_params->n_threads;
+        vae_decode_only         = sd_ctx_params->vae_decode_only;
+        free_params_immediately = sd_ctx_params->free_params_immediately;
+        lora_model_dir          = SAFE_STR(sd_ctx_params->lora_model_dir);
+        taesd_path              = SAFE_STR(sd_ctx_params->taesd_path);
+        use_tiny_autoencoder    = taesd_path.size() > 0;
+        vae_tiling              = sd_ctx_params->vae_tiling;
+
+        if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) {
+            rng = std::make_shared<STDDefaultRNG>();
+        } else if (sd_ctx_params->rng_type == CUDA_RNG) {
+            rng = std::make_shared<PhiloxRNG>();
+        }
+
+        init_backend();
+
+        std::string taesd_path_fixed = taesd_path;
 
         ModelLoader model_loader;
 
-        vae_tiling = vae_tiling_;
-
-        if (model_path.size() > 0) {
-            LOG_INFO("loading model from '%s'", model_path.c_str());
-            if (!model_loader.init_from_file(model_path)) {
-                LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) {
+            LOG_INFO("loading model from '%s'", sd_ctx_params->model_path);
+            if (!model_loader.init_from_file(sd_ctx_params->model_path)) {
+                LOG_ERROR("init model loader from file failed: '%s'", sd_ctx_params->model_path);
             }
         }
 
-        if (clip_l_path.size() > 0) {
-            LOG_INFO("loading clip_l from '%s'", clip_l_path.c_str());
-            if (!model_loader.init_from_file(clip_l_path, "text_encoders.clip_l.transformer.")) {
-                LOG_WARN("loading clip_l from '%s' failed", clip_l_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) > 0) {
+            LOG_INFO("loading diffusion model from '%s'", sd_ctx_params->diffusion_model_path);
+            if (!model_loader.init_from_file(sd_ctx_params->diffusion_model_path, "model.diffusion_model.")) {
+                LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->diffusion_model_path);
             }
         }
 
-        if (clip_g_path.size() > 0) {
-            LOG_INFO("loading clip_g from '%s'", clip_g_path.c_str());
-            if (!model_loader.init_from_file(clip_g_path, "text_encoders.clip_g.transformer.")) {
-                LOG_WARN("loading clip_g from '%s' failed", clip_g_path.c_str());
+        bool is_unet = model_loader.model_is_unet();
+
+        if (strlen(SAFE_STR(sd_ctx_params->clip_l_path)) > 0) {
+            LOG_INFO("loading clip_l from '%s'", sd_ctx_params->clip_l_path);
+            std::string prefix = is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer.";
+            if (!model_loader.init_from_file(sd_ctx_params->clip_l_path, prefix)) {
+                LOG_WARN("loading clip_l from '%s' failed", sd_ctx_params->clip_l_path);
             }
         }
 
-        if (t5xxl_path.size() > 0) {
-            LOG_INFO("loading t5xxl from '%s'", t5xxl_path.c_str());
-            if (!model_loader.init_from_file(t5xxl_path, "text_encoders.t5xxl.transformer.")) {
-                LOG_WARN("loading t5xxl from '%s' failed", t5xxl_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->clip_g_path)) > 0) {
+            LOG_INFO("loading clip_g from '%s'", sd_ctx_params->clip_g_path);
+            std::string prefix = is_unet ? "cond_stage_model.1.transformer." : "text_encoders.clip_g.transformer.";
+            if (!model_loader.init_from_file(sd_ctx_params->clip_g_path, prefix)) {
+                LOG_WARN("loading clip_g from '%s' failed", sd_ctx_params->clip_g_path);
             }
         }
 
-        if (diffusion_model_path.size() > 0) {
-            LOG_INFO("loading diffusion model from '%s'", diffusion_model_path.c_str());
-            if (!model_loader.init_from_file(diffusion_model_path, "model.diffusion_model.")) {
-                LOG_WARN("loading diffusion model from '%s' failed", diffusion_model_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->t5xxl_path)) > 0) {
+            LOG_INFO("loading t5xxl from '%s'", sd_ctx_params->t5xxl_path);
+            if (!model_loader.init_from_file(sd_ctx_params->t5xxl_path, "text_encoders.t5xxl.transformer.")) {
+                LOG_WARN("loading t5xxl from '%s' failed", sd_ctx_params->t5xxl_path);
             }
         }
 
-        if (vae_path.size() > 0) {
-            LOG_INFO("loading vae from '%s'", vae_path.c_str());
-            if (!model_loader.init_from_file(vae_path, "vae.")) {
-                LOG_WARN("loading vae from '%s' failed", vae_path.c_str());
+        if (strlen(SAFE_STR(sd_ctx_params->vae_path)) > 0) {
+            LOG_INFO("loading vae from '%s'", sd_ctx_params->vae_path);
+            if (!model_loader.init_from_file(sd_ctx_params->vae_path, "vae.")) {
+                LOG_WARN("loading vae from '%s' failed", sd_ctx_params->vae_path);
             }
         }
 
         version = model_loader.get_sd_version();
 
-        if (version == VERSION_COUNT && model_path.size() > 0 && diffusion_model_path.size() == 0 && t5xxl_path.size() > 0) {
-            bool endswithsafetensors = (model_path.rfind(".safetensors") == model_path.size() - 12);
+        // kcpp fallback to separate diffusion model passed as model
+        if (version == VERSION_COUNT &&
+            strlen(SAFE_STR(sd_ctx_params->model_path)) > 0 &&
+            strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) == 0 &&
+            strlen(SAFE_STR(sd_ctx_params->t5xxl_path)) > 0 )
+        {
+            bool endswithsafetensors = ends_with(sd_ctx_params->model_path, ".safetensors");
             if(endswithsafetensors && !model_loader.has_diffusion_model_tensors())
             {
                 LOG_INFO("SD Diffusion Model tensors missing! Fallback trying alternative tensor names...\n");
-                if (!model_loader.init_from_file(model_path, "model.diffusion_model.")) {
-                    LOG_WARN("loading diffusion model from '%s' failed", model_path.c_str());
+                if (!model_loader.init_from_file(sd_ctx_params->model_path, "model.diffusion_model.")) {
+                    LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->model_path);
                 }
                 version = model_loader.get_sd_version();
             }
         }
 
         if (version == VERSION_COUNT) {
-            LOG_ERROR("Error: get SD version from file failed: '%s'", model_path.c_str());
+            LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path));
             return false;
         }
 
         LOG_INFO("Version: %s ", model_version_to_str[version]);
 
-        if(id_embeddings_path!="" && version!=VERSION_SDXL)
-        {
-            printf("\n!!!!\nWARNING: PhotoMaker is only compatible with SDXL models. PhotoMaker will be disabled!\n!!!!\n");
-            id_embeddings_path = "";
-        }
-
         if(use_tiny_autoencoder)
         {
             std::string to_search = "taesd.embd";
@@ -293,6 +291,7 @@ public:
             }
         }
 
+        ggml_type wtype = (ggml_type)sd_ctx_params->wtype;
         if (wtype == GGML_TYPE_COUNT) {
             model_wtype = model_loader.get_sd_wtype();
             if (model_wtype == GGML_TYPE_COUNT) {
@@ -325,16 +324,16 @@ public:
             model_loader.set_wtype_override(GGML_TYPE_F32, "vae.");
         }
 
-        LOG_INFO("Weight type:                 %s", model_wtype != SD_TYPE_COUNT ? ggml_type_name(model_wtype) : "??");
-        LOG_INFO("Conditioner weight type:     %s", conditioner_wtype != SD_TYPE_COUNT ? ggml_type_name(conditioner_wtype) : "??");
-        LOG_INFO("Diffusion model weight type: %s", diffusion_model_wtype != SD_TYPE_COUNT ? ggml_type_name(diffusion_model_wtype) : "??");
-        LOG_INFO("VAE weight type:             %s", vae_wtype != SD_TYPE_COUNT ? ggml_type_name(vae_wtype) : "??");
+        LOG_INFO("Weight type:                 %s", model_wtype != GGML_TYPE_COUNT ? ggml_type_name(model_wtype) : "??");
+        LOG_INFO("Conditioner weight type:     %s", conditioner_wtype != GGML_TYPE_COUNT ? ggml_type_name(conditioner_wtype) : "??");
+        LOG_INFO("Diffusion model weight type: %s", diffusion_model_wtype != GGML_TYPE_COUNT ? ggml_type_name(diffusion_model_wtype) : "??");
+        LOG_INFO("VAE weight type:             %s", vae_wtype != GGML_TYPE_COUNT ? ggml_type_name(vae_wtype) : "??");
 
         LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
 
         if (sd_version_is_sdxl(version)) {
             scale_factor = 0.13025f;
-            if (vae_path.size() == 0 && taesd_path_fixed.size() == 0) {
+            if (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 && taesd_path_fixed.size() == 0) {
                 LOG_WARN(
                     "!!!It looks like you are using SDXL model. "
                     "If you find that the generated images are completely black, "
@@ -348,6 +347,8 @@ public:
             // TODO: shift_factor
         }
 
+        bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;
+
         if (version == VERSION_SVD) {
             clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types);
             clip_vision->alloc_params_buffer();
@@ -375,11 +376,11 @@ public:
                 LOG_INFO("CLIP: Using CPU backend");
                 clip_backend = ggml_backend_cpu_init();
             }
-            if (diffusion_flash_attn) {
+            if (sd_ctx_params->diffusion_flash_attn) {
                 LOG_INFO("Using flash attention in the diffusion model");
             }
             if (sd_version_is_sd3(version)) {
-                if (diffusion_flash_attn) {
+                if (sd_ctx_params->diffusion_flash_attn) {
                     LOG_WARN("flash attention in this diffusion model is currently unsupported!");
                 }
                 cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
@@ -389,23 +390,44 @@ public:
                 for (auto pair : model_loader.tensor_storages_types) {
                     if (pair.first.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                         is_chroma = true;
-                        is_loaded_chroma = true;
                         break;
                     }
                 }
                 if (is_chroma) {
-                    cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, chroma_use_t5_mask, chroma_t5_mask_pad);
+                    cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend,
+                                                                            model_loader.tensor_storages_types,
+                                                                            -1,
+                                                                            sd_ctx_params->chroma_use_t5_mask,
+                                                                            sd_ctx_params->chroma_t5_mask_pad);
                 } else {
                     cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
                 }
-                diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn, chroma_use_dit_mask);
+                diffusion_model = std::make_shared<FluxModel>(backend,
+                                                              model_loader.tensor_storages_types,
+                                                              version,
+                                                              sd_ctx_params->diffusion_flash_attn,
+                                                              sd_ctx_params->chroma_use_dit_mask);
             } else {
-                if (id_embeddings_path.find("v2") != std::string::npos) {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
+                if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                                                                                           model_loader.tensor_storages_types,
+                                                                                           SAFE_STR(sd_ctx_params->embedding_dir),
+                                                                                           version,
+                                                                                           PM_VERSION_2);
                 } else {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version);
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
+                                                                                           model_loader.tensor_storages_types,
+                                                                                           SAFE_STR(sd_ctx_params->embedding_dir),
+                                                                                           version);
+                }
+                diffusion_model = std::make_shared<UNetModel>(backend,
+                                                              model_loader.tensor_storages_types,
+                                                              version,
+                                                              sd_ctx_params->diffusion_flash_attn);
+                if (sd_ctx_params->diffusion_conv_direct) {
+                    LOG_INFO("Using Conv2d direct in the diffusion model");
+                    std::dynamic_pointer_cast<UNetModel>(diffusion_model)->unet.enable_conv2d_direct();
                 }
-                diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
             }
 
             cond_stage_model->alloc_params_buffer();
@@ -415,49 +437,74 @@ public:
             diffusion_model->get_param_tensors(tensors);
 
             if (!use_tiny_autoencoder) {
-                if (vae_on_cpu && !ggml_backend_is_cpu(backend)) {
+                if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
                     LOG_INFO("VAE Autoencoder: Using CPU backend");
                     vae_backend = ggml_backend_cpu_init();
                 } else {
                     vae_backend = backend;
                 }
-                first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version);
+                first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
+                                                                    model_loader.tensor_storages_types,
+                                                                    "first_stage_model",
+                                                                    vae_decode_only,
+                                                                    false,
+                                                                    version);
+                if (sd_ctx_params->vae_conv_direct) {
+                    LOG_INFO("Using Conv2d direct in the vae model");
+                    first_stage_model->enable_conv2d_direct();
+                }
                 first_stage_model->alloc_params_buffer();
                 first_stage_model->get_param_tensors(tensors, "first_stage_model");
             } else {
-                tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version);
+                tae_first_stage = std::make_shared<TinyAutoEncoder>(backend,
+                                                                    model_loader.tensor_storages_types,
+                                                                    "decoder.layers",
+                                                                    vae_decode_only,
+                                                                    version);
+                if (sd_ctx_params->vae_conv_direct) {
+                    LOG_INFO("Using Conv2d direct in the tae model");
+                    tae_first_stage->enable_conv2d_direct();
+                }
             }
             // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
 
-            if (control_net_path.size() > 0) {
+            if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) {
                 ggml_backend_t controlnet_backend = NULL;
-                if (control_net_cpu && !ggml_backend_is_cpu(backend)) {
+                if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) {
                     LOG_DEBUG("ControlNet: Using CPU backend");
                     controlnet_backend = ggml_backend_cpu_init();
                 } else {
                     controlnet_backend = backend;
                 }
                 control_net = std::make_shared<ControlNet>(controlnet_backend, model_loader.tensor_storages_types, version);
+                if (sd_ctx_params->diffusion_conv_direct) {
+                    LOG_INFO("Using Conv2d direct in the control net");
+                    control_net->enable_conv2d_direct();
+                }
             }
 
-            if (id_embeddings_path.find("v2") != std::string::npos) {
+            if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) {
                 pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2);
                 LOG_INFO("using PhotoMaker Version 2");
             } else {
                 pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version);
             }
-            if (id_embeddings_path.size() > 0) {
-                pmid_lora = std::make_shared<LoraModel>(backend, id_embeddings_path, "");
+            if (strlen(SAFE_STR(sd_ctx_params->stacked_id_embed_dir)) > 0) {
+              if (version != VERSION_SDXL) {
+                printf("\n!!!!\nWARNING: PhotoMaker is only compatible with SDXL models. PhotoMaker will be disabled!\n!!!!\n");
+              } else {
+                pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->stacked_id_embed_dir, "");
                 if (!pmid_lora->load_from_file(true)) {
-                    LOG_WARN("load photomaker lora tensors from %s failed", id_embeddings_path.c_str());
+                    LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->stacked_id_embed_dir);
                     return false;
                 }
-                LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", id_embeddings_path.c_str());
-                if (!model_loader.init_from_file(id_embeddings_path, "pmid.")) {
-                    LOG_WARN("loading stacked ID embedding from '%s' failed", id_embeddings_path.c_str());
+                LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", sd_ctx_params->stacked_id_embed_dir);
+                if (!model_loader.init_from_file(sd_ctx_params->stacked_id_embed_dir, "pmid.")) {
+                    LOG_WARN("loading stacked ID embedding from '%s' failed", sd_ctx_params->stacked_id_embed_dir);
                 } else {
                     stacked_id = true;
                 }
+              }
             }
             if (stacked_id) {
                 if (!pmid_model->alloc_params_buffer()) {
@@ -526,7 +573,7 @@ public:
             }
             size_t control_net_params_mem_size = 0;
             if (control_net) {
-                if (!control_net->load_from_file(control_net_path)) {
+                if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path))) {
                     return false;
                 }
                 control_net_params_mem_size = control_net->get_params_buffer_size();
@@ -582,15 +629,20 @@ public:
         }
 
         int64_t t1 = ggml_time_ms();
-        LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000);
+        LOG_INFO("loading model from '%s' completed, taking %.2fs", SAFE_STR(sd_ctx_params->model_path), (t1 - t0) * 1.0f / 1000);
 
         // check is_using_v_parameterization_for_sd2
-        bool is_using_v_parameterization = false;
+
         if (sd_version_is_sd2(version)) {
             if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
                 is_using_v_parameterization = true;
             }
         } else if (sd_version_is_sdxl(version)) {
+            if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
+                // CosXL models
+                // TODO: get sigma_min and sigma_max values from file
+                is_using_edm_v_parameterization = true;
+            }
             if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
                 is_using_v_parameterization = true;
             }
@@ -615,12 +667,15 @@ public:
         } else if (is_using_v_parameterization) {
             LOG_INFO("running in v-prediction mode");
             denoiser = std::make_shared<CompVisVDenoiser>();
+        } else if (is_using_edm_v_parameterization) {
+            LOG_INFO("running in v-prediction EDM mode");
+            denoiser = std::make_shared<EDMVDenoiser>();
         } else {
             LOG_INFO("running in eps-prediction mode");
         }
 
-        if (schedule != DEFAULT) {
-            switch (schedule) {
+        if (sd_ctx_params->schedule != DEFAULT) {
+            switch (sd_ctx_params->schedule) {
                 case DISCRETE:
                     LOG_INFO("running with discrete schedule");
                     denoiser->schedule = std::make_shared<DiscreteSchedule>();
@@ -647,7 +702,7 @@ public:
                     // Don't touch anything.
                     break;
                 default:
-                    LOG_ERROR("Unknown schedule %i", schedule);
+                    LOG_ERROR("Unknown schedule %i", sd_ctx_params->schedule);
                     abort();
             }
         }
@@ -701,11 +756,6 @@ public:
         return result < -1;
     }
 
-    void set_pending_lora(const std::string& lora_path, float multiplier) {
-        pending_apply_lora_fname = lora_path;
-        pending_apply_lora_power = multiplier;
-    }
-
     void apply_lora_from_file(const std::string& lora_path, float multiplier) {
         int64_t t0                 = ggml_time_ms();
         std::string st_file_path   = lora_path;
@@ -876,7 +926,7 @@ public:
             set_timestep_embedding(timesteps, y, out_dim);
         }
         int64_t t1 = ggml_time_ms();
-        LOG_DEBUG("computing svd condition graph completed, taking %d ms", (int)(t1 - t0));
+        LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0);
         return {c_crossattn, y, c_concat};
     }
 
@@ -885,22 +935,30 @@ public:
                         ggml_tensor* noise,
                         SDCondition cond,
                         SDCondition uncond,
+                        SDCondition img_cond,
                         ggml_tensor* control_hint,
                         float control_strength,
-                        float min_cfg,
-                        float cfg_scale,
-                        float guidance,
+                        sd_guidance_params_t guidance,
                         float eta,
                         sample_method_t method,
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
                         std::vector<ggml_tensor*> ref_latents = {},
-                        std::vector<int> skip_layers = {},
-                        float slg_scale              = 0,
-                        float skip_layer_start       = 0.01,
-                        float skip_layer_end         = 0.2,
-                        ggml_tensor* noise_mask      = nullptr) {
+                        ggml_tensor* denoise_mask             = nullptr) {
+        std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
+
+        float cfg_scale     = guidance.txt_cfg;
+        float img_cfg_scale = guidance.img_cfg;
+        float slg_scale     = guidance.slg.scale;
+
+        float min_cfg = guidance.min_cfg;
+
+        if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) {
+            LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance...");
+            img_cfg_scale = cfg_scale;
+        }
+
         LOG_DEBUG("Sample");
         struct ggml_init_params params;
         size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@@ -922,13 +980,15 @@ public:
 
         struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);
 
-        bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
+        bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL;
+        bool has_img_cond      = cfg_scale != img_cfg_scale && img_cond.c_crossattn != NULL;
         bool has_skiplayer     = slg_scale != 0.0 && skip_layers.size() > 0;
 
         // denoise wrapper
-        struct ggml_tensor* out_cond   = ggml_dup_tensor(work_ctx, x);
-        struct ggml_tensor* out_uncond = NULL;
-        struct ggml_tensor* out_skip   = NULL;
+        struct ggml_tensor* out_cond     = ggml_dup_tensor(work_ctx, x);
+        struct ggml_tensor* out_uncond   = NULL;
+        struct ggml_tensor* out_skip     = NULL;
+        struct ggml_tensor* out_img_cond = NULL;
 
         if (has_unconditioned) {
             out_uncond = ggml_dup_tensor(work_ctx, x);
@@ -941,6 +1001,9 @@ public:
                 LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
             }
         }
+        if (has_img_cond) {
+            out_img_cond = ggml_dup_tensor(work_ctx, x);
+        }
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@@ -958,7 +1021,7 @@ public:
             float t = denoiser->sigma_to_t(sigma);
             std::vector<float> timesteps_vec(x->ne[3], t);  // [N, ]
             auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
-            std::vector<float> guidance_vec(x->ne[3], guidance);
+            std::vector<float> guidance_vec(x->ne[3], guidance.distilled_guidance);
             auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec);
 
             copy_ggml_tensor(noised_input, input);
@@ -1025,8 +1088,25 @@ public:
                 negative_data = (float*)out_uncond->data;
             }
 
+            float* img_cond_data = NULL;
+            if (has_img_cond) {
+                diffusion_model->compute(n_threads,
+                                         noised_input,
+                                         timesteps,
+                                         img_cond.c_crossattn,
+                                         img_cond.c_concat,
+                                         img_cond.c_vector,
+                                         guidance_tensor,
+                                         ref_latents,
+                                         -1,
+                                         controls,
+                                         control_strength,
+                                         &out_img_cond);
+                img_cond_data = (float*)out_img_cond->data;
+            }
+
             int step_count         = sigmas.size();
-            bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count);
+            bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count);
             float* skip_layer_data = NULL;
             if (is_skiplayer_step) {
                 LOG_DEBUG("Skipping layers at step %d\n", step);
@@ -1060,8 +1140,17 @@ public:
                         int64_t i3  = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2];
                         float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
                     } else {
-                        latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
+                        if (has_img_cond) {
+                            // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
+                            latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
+                        } else {
+                            // img_cfg_scale == cfg_scale
+                            latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
+                        }
                     }
+                } else if (has_img_cond) {
+                    // img_cfg_scale == 1
+                    latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]);
                 }
                 if (is_skiplayer_step) {
                     latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
@@ -1075,10 +1164,10 @@ public:
                 pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
                 // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
             }
-            if (noise_mask != nullptr) {
+            if (denoise_mask != nullptr) {
                 for (int64_t x = 0; x < denoised->ne[0]; x++) {
                     for (int64_t y = 0; y < denoised->ne[1]; y++) {
-                        float mask = ggml_tensor_get_f32(noise_mask, x, y);
+                        float mask = ggml_tensor_get_f32(denoise_mask, x, y);
                         for (int64_t k = 0; k < denoised->ne[2]; k++) {
                             float init = ggml_tensor_get_f32(init_latent, x, y, k);
                             float den  = ggml_tensor_get_f32(denoised, x, y, k);
@@ -1267,8 +1356,7 @@ public:
                 ggml_tensor_scale_output(result);
             }
         } else {
-            //koboldcpp never use tiling with taesd
-            if (false && vae_tiling) {  // TODO: support tiling vae encode
+            if (vae_tiling) {
                 // split latent in 64x64 tiles and compute in several steps
                 auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                     tae_first_stage->compute(n_threads, in, decode, &out);
@@ -1299,96 +1387,301 @@ public:
 
 /*================================================= SD API ==================================================*/
 
+#define NONE_STR "NONE"
+
+const char* sd_type_name(enum sd_type_t type) {
+    return ggml_type_name((ggml_type)type);
+}
+
+enum sd_type_t str_to_sd_type(const char* str) {
+    for (int i = 0; i < SD_TYPE_COUNT; i++) {
+        auto trait = ggml_get_type_traits((ggml_type)i);
+        if (!strcmp(str, trait->type_name)) {
+            return (enum sd_type_t)i;
+        }
+    }
+    return SD_TYPE_COUNT;
+}
+
+const char* rng_type_to_str[] = {
+    "std_default",
+    "cuda",
+};
+
+const char* sd_rng_type_name(enum rng_type_t rng_type) {
+    if (rng_type < RNG_TYPE_COUNT) {
+        return rng_type_to_str[rng_type];
+    }
+    return NONE_STR;
+}
+
+enum rng_type_t str_to_rng_type(const char* str) {
+    for (int i = 0; i < RNG_TYPE_COUNT; i++) {
+        if (!strcmp(str, rng_type_to_str[i])) {
+            return (enum rng_type_t)i;
+        }
+    }
+    return RNG_TYPE_COUNT;
+}
+
+const char* sample_method_to_str[] = {
+    "euler_a",
+    "euler",
+    "heun",
+    "dpm2",
+    "dpm++2s_a",
+    "dpm++2m",
+    "dpm++2mv2",
+    "ipndm",
+    "ipndm_v",
+    "lcm",
+    "ddim_trailing",
+    "tcd",
+};
+
+const char* sd_sample_method_name(enum sample_method_t sample_method) {
+    if (sample_method < SAMPLE_METHOD_COUNT) {
+        return sample_method_to_str[sample_method];
+    }
+    return NONE_STR;
+}
+
+enum sample_method_t str_to_sample_method(const char* str) {
+    for (int i = 0; i < SAMPLE_METHOD_COUNT; i++) {
+        if (!strcmp(str, sample_method_to_str[i])) {
+            return (enum sample_method_t)i;
+        }
+    }
+    return SAMPLE_METHOD_COUNT;
+}
+
+const char* schedule_to_str[] = {
+    "default",
+    "discrete",
+    "karras",
+    "exponential",
+    "ays",
+    "gits",
+};
+
+const char* sd_schedule_name(enum schedule_t schedule) {
+    if (schedule < SCHEDULE_COUNT) {
+        return schedule_to_str[schedule];
+    }
+    return NONE_STR;
+}
+
+enum schedule_t str_to_schedule(const char* str) {
+    for (int i = 0; i < SCHEDULE_COUNT; i++) {
+        if (!strcmp(str, schedule_to_str[i])) {
+            return (enum schedule_t)i;
+        }
+    }
+    return SCHEDULE_COUNT;
+}
+
+void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
+    memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t));
+    sd_ctx_params->vae_decode_only         = true;
+    sd_ctx_params->vae_tiling              = false;
+    sd_ctx_params->free_params_immediately = true;
+    sd_ctx_params->n_threads               = sd_get_num_physical_cores();
+    sd_ctx_params->wtype                   = SD_TYPE_COUNT;
+    sd_ctx_params->rng_type                = CUDA_RNG;
+    sd_ctx_params->schedule                = DEFAULT;
+    sd_ctx_params->keep_clip_on_cpu        = false;
+    sd_ctx_params->keep_control_net_on_cpu = false;
+    sd_ctx_params->keep_vae_on_cpu         = false;
+    sd_ctx_params->diffusion_flash_attn    = false;
+    sd_ctx_params->chroma_use_dit_mask     = true;
+    sd_ctx_params->chroma_use_t5_mask      = false;
+    sd_ctx_params->chroma_t5_mask_pad      = 1;
+}
+
+char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
+    char* buf = (char*)malloc(4096);
+    if (!buf)
+        return NULL;
+    buf[0] = '\0';
+
+    snprintf(buf + strlen(buf), 4096 - strlen(buf),
+             "model_path: %s\n"
+             "clip_l_path: %s\n"
+             "clip_g_path: %s\n"
+             "t5xxl_path: %s\n"
+             "diffusion_model_path: %s\n"
+             "vae_path: %s\n"
+             "taesd_path: %s\n"
+             "control_net_path: %s\n"
+             "lora_model_dir: %s\n"
+             "embedding_dir: %s\n"
+             "stacked_id_embed_dir: %s\n"
+             "vae_decode_only: %s\n"
+             "vae_tiling: %s\n"
+             "free_params_immediately: %s\n"
+             "n_threads: %d\n"
+             "wtype: %s\n"
+             "rng_type: %s\n"
+             "schedule: %s\n"
+             "keep_clip_on_cpu: %s\n"
+             "keep_control_net_on_cpu: %s\n"
+             "keep_vae_on_cpu: %s\n"
+             "diffusion_flash_attn: %s\n"
+             "chroma_use_dit_mask: %s\n"
+             "chroma_use_t5_mask: %s\n"
+             "chroma_t5_mask_pad: %d\n",
+             SAFE_STR(sd_ctx_params->model_path),
+             SAFE_STR(sd_ctx_params->clip_l_path),
+             SAFE_STR(sd_ctx_params->clip_g_path),
+             SAFE_STR(sd_ctx_params->t5xxl_path),
+             SAFE_STR(sd_ctx_params->diffusion_model_path),
+             SAFE_STR(sd_ctx_params->vae_path),
+             SAFE_STR(sd_ctx_params->taesd_path),
+             SAFE_STR(sd_ctx_params->control_net_path),
+             SAFE_STR(sd_ctx_params->lora_model_dir),
+             SAFE_STR(sd_ctx_params->embedding_dir),
+             SAFE_STR(sd_ctx_params->stacked_id_embed_dir),
+             BOOL_STR(sd_ctx_params->vae_decode_only),
+             BOOL_STR(sd_ctx_params->vae_tiling),
+             BOOL_STR(sd_ctx_params->free_params_immediately),
+             sd_ctx_params->n_threads,
+             sd_type_name(sd_ctx_params->wtype),
+             sd_rng_type_name(sd_ctx_params->rng_type),
+             sd_schedule_name(sd_ctx_params->schedule),
+             BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
+             BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
+             BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
+             BOOL_STR(sd_ctx_params->diffusion_flash_attn),
+             BOOL_STR(sd_ctx_params->chroma_use_dit_mask),
+             BOOL_STR(sd_ctx_params->chroma_use_t5_mask),
+             sd_ctx_params->chroma_t5_mask_pad);
+
+    return buf;
+}
+
+void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
+    memset((void*)sd_img_gen_params, 0, sizeof(sd_img_gen_params_t));
+    sd_img_gen_params->clip_skip                   = -1;
+    sd_img_gen_params->guidance.txt_cfg            = 7.0f;
+    sd_img_gen_params->guidance.min_cfg            = 1.0f;
+    sd_img_gen_params->guidance.img_cfg            = INFINITY;
+    sd_img_gen_params->guidance.distilled_guidance = 3.5f;
+    sd_img_gen_params->guidance.slg.layer_count    = 0;
+    sd_img_gen_params->guidance.slg.layer_start    = 0.01f;
+    sd_img_gen_params->guidance.slg.layer_end      = 0.2f;
+    sd_img_gen_params->guidance.slg.scale          = 0.f;
+    sd_img_gen_params->ref_images_count            = 0;
+    sd_img_gen_params->width                       = 512;
+    sd_img_gen_params->height                      = 512;
+    sd_img_gen_params->sample_method               = EULER_A;
+    sd_img_gen_params->sample_steps                = 20;
+    sd_img_gen_params->eta                         = 0.f;
+    sd_img_gen_params->strength                    = 0.75f;
+    sd_img_gen_params->seed                        = -1;
+    sd_img_gen_params->batch_count                 = 1;
+    sd_img_gen_params->control_strength            = 0.9f;
+    sd_img_gen_params->style_strength              = 20.f;
+    sd_img_gen_params->normalize_input             = false;
+}
+
+char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
+    char* buf = (char*)malloc(4096);
+    if (!buf)
+        return NULL;
+    buf[0] = '\0';
+
+    snprintf(buf + strlen(buf), 4096 - strlen(buf),
+             "prompt: %s\n"
+             "negative_prompt: %s\n"
+             "clip_skip: %d\n"
+             "txt_cfg: %.2f\n"
+             "img_cfg: %.2f\n"
+             "min_cfg: %.2f\n"
+             "distilled_guidance: %.2f\n"
+             "slg.layer_count: %zu\n"
+             "slg.layer_start: %.2f\n"
+             "slg.layer_end: %.2f\n"
+             "slg.scale: %.2f\n"
+             "width: %d\n"
+             "height: %d\n"
+             "sample_method: %s\n"
+             "sample_steps: %d\n"
+             "eta: %.2f\n"
+             "strength: %.2f\n"
+             "seed: %" PRId64
+             "\n"
+             "batch_count: %d\n"
+             "ref_images_count: %d\n"
+             "control_strength: %.2f\n"
+             "style_strength: %.2f\n"
+             "normalize_input: %s\n"
+             "input_id_images_path: %s\n",
+             SAFE_STR(sd_img_gen_params->prompt),
+             SAFE_STR(sd_img_gen_params->negative_prompt),
+             sd_img_gen_params->clip_skip,
+             sd_img_gen_params->guidance.txt_cfg,
+             sd_img_gen_params->guidance.img_cfg,
+             sd_img_gen_params->guidance.min_cfg,
+             sd_img_gen_params->guidance.distilled_guidance,
+             sd_img_gen_params->guidance.slg.layer_count,
+             sd_img_gen_params->guidance.slg.layer_start,
+             sd_img_gen_params->guidance.slg.layer_end,
+             sd_img_gen_params->guidance.slg.scale,
+             sd_img_gen_params->width,
+             sd_img_gen_params->height,
+             sd_sample_method_name(sd_img_gen_params->sample_method),
+             sd_img_gen_params->sample_steps,
+             sd_img_gen_params->eta,
+             sd_img_gen_params->strength,
+             sd_img_gen_params->seed,
+             sd_img_gen_params->batch_count,
+             sd_img_gen_params->ref_images_count,
+             sd_img_gen_params->control_strength,
+             sd_img_gen_params->style_strength,
+             BOOL_STR(sd_img_gen_params->normalize_input),
+             SAFE_STR(sd_img_gen_params->input_id_images_path));
+
+    return buf;
+}
+
+void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
+    memset((void*)sd_vid_gen_params, 0, sizeof(sd_vid_gen_params_t));
+    sd_vid_gen_params->guidance.txt_cfg            = 7.0f;
+    sd_vid_gen_params->guidance.min_cfg            = 1.0f;
+    sd_vid_gen_params->guidance.img_cfg            = INFINITY;
+    sd_vid_gen_params->guidance.distilled_guidance = 3.5f;
+    sd_vid_gen_params->guidance.slg.layer_count    = 0;
+    sd_vid_gen_params->guidance.slg.layer_start    = 0.01f;
+    sd_vid_gen_params->guidance.slg.layer_end      = 0.2f;
+    sd_vid_gen_params->guidance.slg.scale          = 0.f;
+    sd_vid_gen_params->width                       = 512;
+    sd_vid_gen_params->height                      = 512;
+    sd_vid_gen_params->sample_method               = EULER_A;
+    sd_vid_gen_params->sample_steps                = 20;
+    sd_vid_gen_params->strength                    = 0.75f;
+    sd_vid_gen_params->seed                        = -1;
+    sd_vid_gen_params->video_frames                = 6;
+    sd_vid_gen_params->motion_bucket_id            = 127;
+    sd_vid_gen_params->fps                         = 6;
+    sd_vid_gen_params->augmentation_level          = 0.f;
+}
+
 struct sd_ctx_t {
     StableDiffusionGGML* sd = NULL;
 };
 
-void set_sd_vae_tiling(sd_ctx_t* ctx, bool tiling)
-{
-    ctx->sd->vae_tiling = tiling;
-}
-
-int get_loaded_sd_version(sd_ctx_t* ctx)
-{
-    return ctx->sd->version;
-}
-
-//kcpp hack to check if chroma
-bool sd_loaded_chroma()
-{
-    return is_loaded_chroma;
-}
-
-sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
-                     const char* clip_l_path_c_str,
-                     const char* clip_g_path_c_str,
-                     const char* t5xxl_path_c_str,
-                     const char* diffusion_model_path_c_str,
-                     const char* vae_path_c_str,
-                     const char* taesd_path_c_str,
-                     const char* control_net_path_c_str,
-                     const char* lora_model_dir_c_str,
-                     const char* embed_dir_c_str,
-                     const char* id_embed_dir_c_str,
-                     bool vae_decode_only,
-                     bool vae_tiling,
-                     bool free_params_immediately,
-                     int n_threads,
-                     enum sd_type_t wtype,
-                     enum rng_type_t rng_type,
-                     enum schedule_t s,
-                     bool keep_clip_on_cpu,
-                     bool keep_control_net_cpu,
-                     bool keep_vae_on_cpu,
-                     bool diffusion_flash_attn,
-                     bool chroma_use_dit_mask,
-                     bool chroma_use_t5_mask,
-                     int chroma_t5_mask_pad) {
+sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
     if (sd_ctx == NULL) {
         return NULL;
     }
-    std::string model_path(model_path_c_str);
-    std::string clip_l_path(clip_l_path_c_str);
-    std::string clip_g_path(clip_g_path_c_str);
-    std::string t5xxl_path(t5xxl_path_c_str);
-    std::string diffusion_model_path(diffusion_model_path_c_str);
-    std::string vae_path(vae_path_c_str);
-    std::string taesd_path(taesd_path_c_str);
-    std::string control_net_path(control_net_path_c_str);
-    std::string embd_path(embed_dir_c_str);
-    std::string id_embd_path(id_embed_dir_c_str);
-    std::string lora_model_dir(lora_model_dir_c_str);
 
-    sd_ctx->sd = new StableDiffusionGGML(n_threads,
-                                         vae_decode_only,
-                                         free_params_immediately,
-                                         lora_model_dir,
-                                         rng_type);
+    sd_ctx->sd = new StableDiffusionGGML();
     if (sd_ctx->sd == NULL) {
         return NULL;
     }
 
-    if (!sd_ctx->sd->load_from_file(model_path,
-                                    clip_l_path,
-                                    clip_g_path,
-                                    t5xxl_path_c_str,
-                                    diffusion_model_path,
-                                    vae_path,
-                                    control_net_path,
-                                    embd_path,
-                                    id_embd_path,
-                                    taesd_path,
-                                    vae_tiling,
-                                    (ggml_type)wtype,
-                                    s,
-                                    keep_clip_on_cpu,
-                                    keep_control_net_cpu,
-                                    keep_vae_on_cpu,
-                                    diffusion_flash_attn,
-                                    chroma_use_dit_mask,
-                                    chroma_use_t5_mask,
-                                    chroma_t5_mask_pad)) {
+    if (!sd_ctx->sd->init(sd_ctx_params)) {
         delete sd_ctx->sd;
         sd_ctx->sd = NULL;
         free(sd_ctx);
@@ -1405,33 +1698,29 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
     free(sd_ctx);
 }
 
-sd_image_t* generate_image(sd_ctx_t* sd_ctx,
-                           struct ggml_context* work_ctx,
-                           ggml_tensor* init_latent,
-                           std::string prompt,
-                           std::string negative_prompt,
-                           int clip_skip,
-                           float cfg_scale,
-                           float guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           const std::vector<float>& sigmas,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_ratio,
-                           bool normalize_input,
-                           std::string input_id_images_path,
-                           std::vector<ggml_tensor*> ref_latents,
-                           std::vector<int> skip_layers = {},
-                           float slg_scale              = 0,
-                           float skip_layer_start       = 0.01,
-                           float skip_layer_end         = 0.2,
-                           ggml_tensor* masked_image    = NULL,
-                           const std::vector<sd_image_t*> photomaker_references = std::vector<sd_image_t*>()) {
+sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
+                                    struct ggml_context* work_ctx,
+                                    ggml_tensor* init_latent,
+                                    std::string prompt,
+                                    std::string negative_prompt,
+                                    int clip_skip,
+                                    sd_guidance_params_t guidance,
+                                    float eta,
+                                    int width,
+                                    int height,
+                                    enum sample_method_t sample_method,
+                                    const std::vector<float>& sigmas,
+                                    int64_t seed,
+                                    int batch_count,
+                                    const sd_image_t* control_cond,
+                                    float control_strength,
+                                    float style_ratio,
+                                    bool normalize_input,
+                                    std::string input_id_images_path,
+                                    std::vector<ggml_tensor*> ref_latents,
+                                    ggml_tensor* concat_latent = NULL,
+                                    ggml_tensor* denoise_mask  = NULL,
+                                    const kcpp_img_gen_params_t* kcpp_img_gen_params = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1458,14 +1747,14 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     prompt = result_pair.second;
     LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str());
 
-    int64_t t0 = ggml_time_ms();
-    // sd_ctx->sd->apply_loras(lora_f2m); //only use hardcoded lora for kcpp
-    if(pending_apply_lora_fname!="" && pending_apply_lora_power>0)
-    {
-        printf("\nApplying LoRA now...\n");
-        sd_ctx->sd->apply_lora_from_file(pending_apply_lora_fname,pending_apply_lora_power);
-        pending_apply_lora_fname = "";
+    //only use hardcoded lora for kcpp
+    if (!lora_f2m.empty()) {
+        lora_f2m.clear();
+        printf("\nWarning: not applying LoRAs requested by prompt!\n");
     }
+
+    int64_t t0 = ggml_time_ms();
+    sd_ctx->sd->apply_loras(lora_f2m);
     int64_t t1 = ggml_time_ms();
     LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
 
@@ -1474,7 +1763,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     ggml_tensor* init_img = NULL;
     SDCondition id_cond;
     std::vector<bool> class_tokens_mask;
-    if (sd_ctx->sd->pmid_model && photomaker_references.size()>0)
+    if (sd_ctx->sd->pmid_lora && kcpp_img_gen_params && kcpp_img_gen_params->photomaker_reference_count>0)
     {
         sd_ctx->sd->stacked_id = true; //turn on photomaker if needed
     }
@@ -1522,16 +1811,16 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         }
 
         // handle multiple photomaker image passed in by kcpp
-        if (sd_ctx->sd->pmid_model && photomaker_references.size()>0)
+        if (sd_ctx->sd->pmid_lora && kcpp_img_gen_params)
         {
-            for(int i=0;i<photomaker_references.size();++i)
+            for(int i=0;i<kcpp_img_gen_params->photomaker_reference_count;++i)
             {
                 int c = 0;
                 int width, height;
-                width = photomaker_references[i]->width;
-                height = photomaker_references[i]->height;
-                c = photomaker_references[i]->channel;
-                uint8_t* input_image_buffer = photomaker_references[i]->data;
+                width = kcpp_img_gen_params->photomaker_references[i].width;
+                height = kcpp_img_gen_params->photomaker_references[i].height;
+                c = kcpp_img_gen_params->photomaker_references[i].channel;
+                uint8_t* input_image_buffer = kcpp_img_gen_params->photomaker_references[i].data;
                 sd_image_t* input_image = NULL;
                 input_image  = new sd_image_t{(uint32_t)width,
                                                 (uint32_t)height,
@@ -1616,9 +1905,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                                            sd_ctx->sd->diffusion_model->get_adm_in_channels());
 
     SDCondition uncond;
-    if (cfg_scale != 1.0) {
+    if (guidance.txt_cfg != 1.0 ||
+        (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
         bool force_zero_embeddings = false;
-        if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0) {
+        if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) {
             force_zero_embeddings = true;
         }
         uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
@@ -1631,7 +1921,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                                      force_zero_embeddings);
     }
     t1 = ggml_time_ms();
-    LOG_INFO("get_learned_condition completed, taking %d ms", t1 - t0);
+    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
 
     if (sd_ctx->sd->free_params_immediately) {
         sd_ctx->sd->cond_stage_model->free_params_buffer();
@@ -1655,38 +1945,50 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     int W = width / 8;
     int H = height / 8;
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
-    ggml_tensor* noise_mask = nullptr;
     if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-        if (masked_image == NULL) {
-            int64_t mask_channels = 1;
-            if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                mask_channels = 8 * 8;  // flatten the whole mask
-            }
-            // no mask, set the whole image as masked
-            masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
-            for (int64_t x = 0; x < masked_image->ne[0]; x++) {
-                for (int64_t y = 0; y < masked_image->ne[1]; y++) {
-                    if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                        // TODO: this might be wrong
-                        for (int64_t c = 0; c < init_latent->ne[2]; c++) {
-                            ggml_tensor_set_f32(masked_image, 0, x, y, c);
-                        }
-                        for (int64_t c = init_latent->ne[2]; c < masked_image->ne[2]; c++) {
-                            ggml_tensor_set_f32(masked_image, 1, x, y, c);
-                        }
-                    } else {
-                        ggml_tensor_set_f32(masked_image, 1, x, y, 0);
-                        for (int64_t c = 1; c < masked_image->ne[2]; c++) {
-                            ggml_tensor_set_f32(masked_image, 0, x, y, c);
-                        }
+        int64_t mask_channels = 1;
+        if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+            mask_channels = 8 * 8;  // flatten the whole mask
+        }
+        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
+        // no mask, set the whole image as masked
+        for (int64_t x = 0; x < empty_latent->ne[0]; x++) {
+            for (int64_t y = 0; y < empty_latent->ne[1]; y++) {
+                if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+                    // TODO: this might be wrong
+                    for (int64_t c = 0; c < init_latent->ne[2]; c++) {
+                        ggml_tensor_set_f32(empty_latent, 0, x, y, c);
+                    }
+                    for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) {
+                        ggml_tensor_set_f32(empty_latent, 1, x, y, c);
+                    }
+                } else {
+                    ggml_tensor_set_f32(empty_latent, 1, x, y, 0);
+                    for (int64_t c = 1; c < empty_latent->ne[2]; c++) {
+                        ggml_tensor_set_f32(empty_latent, 0, x, y, c);
                     }
                 }
             }
         }
-        cond.c_concat   = masked_image;
-        uncond.c_concat = masked_image;
-    } else {
-        noise_mask = masked_image;
+        if (concat_latent == NULL) {
+            concat_latent = empty_latent;
+        }
+        cond.c_concat   = concat_latent;
+        uncond.c_concat = empty_latent;
+        denoise_mask    = NULL;
+    } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) {
+        auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
+        ggml_set_f32(empty_latent, 0);
+        uncond.c_concat = empty_latent;
+        if (concat_latent == NULL) {
+            concat_latent = empty_latent;
+        }
+        cond.c_concat = ref_latents[0];
+    }
+    SDCondition img_cond;
+    if (uncond.c_crossattn != NULL &&
+        (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
+        img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat);
     }
     for (int b = 0; b < batch_count; b++) {
         int64_t sampling_start = ggml_time_ms();
@@ -1706,15 +2008,17 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
             LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
         }
 
+        // Disable min_cfg
+        guidance.min_cfg = guidance.txt_cfg;
+
         struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
                                                      x_t,
                                                      noise,
                                                      cond,
                                                      uncond,
+                                                     img_cond,
                                                      image_hint,
                                                      control_strength,
-                                                     cfg_scale,
-                                                     cfg_scale,
                                                      guidance,
                                                      eta,
                                                      sample_method,
@@ -1722,11 +2026,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                      start_merge_step,
                                                      id_cond,
                                                      ref_latents,
-                                                     skip_layers,
-                                                     slg_scale,
-                                                     skip_layer_start,
-                                                     skip_layer_end,
-                                                     noise_mask);
+                                                     denoise_mask);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
@@ -1739,7 +2039,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         sd_ctx->sd->diffusion_model->free_params_buffer();
     }
     int64_t t3 = ggml_time_ms();
-    LOG_INFO("generating %d latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000);
+    LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000);
 
     // Decode to image
     LOG_INFO("decoding %zu latents", final_latents.size());
@@ -1752,7 +2052,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
             decoded_images.push_back(img);
         }
         int64_t t2 = ggml_time_ms();
-        LOG_INFO("latent %d decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
+        LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
     }
 
     int64_t t4 = ggml_time_ms();
@@ -1777,65 +2077,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     return result_images;
 }
 
-sd_image_t* txt2img(sd_ctx_t* sd_ctx,
-                    const char* prompt_c_str,
-                    const char* negative_prompt_c_str,
-                    int clip_skip,
-                    float cfg_scale,
-                    float guidance,
-                    float eta,
-                    int width,
-                    int height,
-                    enum sample_method_t sample_method,
-                    int sample_steps,
-                    int64_t seed,
-                    int batch_count,
-                    const sd_image_t* control_cond,
-                    float control_strength,
-                    float style_ratio,
-                    bool normalize_input,
-                    const char* input_id_images_path_c_str,
-                    sd_image_t* kontext_imgs,
-                    int kontext_img_count,
-                    int* skip_layers         = NULL,
-                    size_t skip_layers_count = 0,
-                    float slg_scale          = 0,
-                    float skip_layer_start   = 0.01,
-                    float skip_layer_end     = 0.2,
-                    const std::vector<sd_image_t*> photomaker_references = std::vector<sd_image_t*>()) {
-    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
-    LOG_DEBUG("txt2img %dx%d", width, height);
-    if (sd_ctx == NULL) {
-        return NULL;
-    }
-
-    struct ggml_init_params params;
-    params.mem_size = static_cast<size_t>(20 * 1024 * 1024);  // 20 MB increased by kcpp
-    if (sd_version_is_sd3(sd_ctx->sd->version)) {
-        params.mem_size *= 2; //readjust by kcpp as above changed
-    }
-    if (sd_version_is_flux(sd_ctx->sd->version)) {
-        params.mem_size *= 3; //readjust by kcpp as above changed
-    }
-    if (sd_ctx->sd->stacked_id) {
-        params.mem_size += static_cast<size_t>(15 * 1024 * 1024);  // 10 MB
-    }
-    params.mem_size += width * height * 3 * sizeof(float);
-    params.mem_size *= batch_count;
-    params.mem_buffer = NULL;
-    params.no_alloc   = false;
-    // LOG_DEBUG("mem_size %u ", params.mem_size);
-
-    struct ggml_context* work_ctx = ggml_init(params);
-    if (!work_ctx) {
-        LOG_ERROR("ggml_init() failed");
-        return NULL;
-    }
-
-    size_t t0 = ggml_time_ms();
-
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
-
+ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
+                                  ggml_context* work_ctx,
+                                  int width,
+                                  int height) {
     int C = 4;
     if (sd_version_is_sd3(sd_ctx->sd->version)) {
         C = 16;
@@ -1852,110 +2097,40 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
     } else {
         ggml_set_f32(init_latent, 0.f);
     }
-
-    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-        LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
-    }
-    std::vector<struct ggml_tensor*> kontext_latents = std::vector<struct ggml_tensor*>();
-    if (kontext_imgs) {
-        for (int i = 0; i < kontext_img_count; i++) {
-            ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kontext_imgs[i].width, kontext_imgs[i].height, 3, 1);
-            sd_image_to_tensor(kontext_imgs[i].data, img);
-
-            ggml_tensor* latent = NULL;
-            if (!sd_ctx->sd->use_tiny_autoencoder) {
-                ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
-                latent               = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-            } else {
-                latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
-            }
-            kontext_latents.push_back(latent);
-        }
-    }
-
-    sd_image_t* result_images = generate_image(sd_ctx,
-                                               work_ctx,
-                                               init_latent,
-                                               prompt_c_str,
-                                               negative_prompt_c_str,
-                                               clip_skip,
-                                               cfg_scale,
-                                               guidance,
-                                               eta,
-                                               width,
-                                               height,
-                                               sample_method,
-                                               sigmas,
-                                               seed,
-                                               batch_count,
-                                               control_cond,
-                                               control_strength,
-                                               style_ratio,
-                                               normalize_input,
-                                               input_id_images_path_c_str,
-                                               kontext_latents,
-                                               skip_layers_vec,
-                                               slg_scale,
-                                               skip_layer_start,
-                                               skip_layer_end,
-                                               nullptr,
-                                               photomaker_references);
-
-    size_t t1 = ggml_time_ms();
-
-    LOG_INFO("txt2img completed in %.2fs", (t1 - t0) * 1.0f / 1000);
-
-    return result_images;
+    return init_latent;
 }
 
-sd_image_t* img2img(sd_ctx_t* sd_ctx,
-                    sd_image_t init_image,
-                    sd_image_t mask,
-                    const char* prompt_c_str,
-                    const char* negative_prompt_c_str,
-                    int clip_skip,
-                    float cfg_scale,
-                    float guidance,
-                    float eta,
-                    int width,
-                    int height,
-                    sample_method_t sample_method,
-                    int sample_steps,
-                    float strength,
-                    int64_t seed,
-                    int batch_count,
-                    const sd_image_t* control_cond,
-                    float control_strength,
-                    float style_ratio,
-                    bool normalize_input,
-                    const char* input_id_images_path_c_str,
-                    sd_image_t* kontext_imgs,
-                    int kontext_img_count,
-                    int* skip_layers         = NULL,
-                    size_t skip_layers_count = 0,
-                    float slg_scale          = 0,
-                    float skip_layer_start   = 0.01,
-                    float skip_layer_end     = 0.2,
-                    const std::vector<sd_image_t*> photomaker_references = std::vector<sd_image_t*>()) {
-    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
-    LOG_DEBUG("img2img %dx%d", width, height);
-    if (sd_ctx == NULL) {
+sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, const kcpp_img_gen_params_t* kcpp_img_gen_params) {
+    int width  = sd_img_gen_params->width;
+    int height = sd_img_gen_params->height;
+    if (sd_version_is_dit(sd_ctx->sd->version)) {
+        if (width % 16 || height % 16) {
+            LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)", model_version_to_str[sd_ctx->sd->version], width, height);
+            return NULL;
+        }
+    } else if (width % 64 || height % 64) {
+        LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)", model_version_to_str[sd_ctx->sd->version], width, height);
+        return NULL;
+    }
+    LOG_DEBUG("generate_image %dx%d", width, height);
+    if (sd_ctx == NULL || sd_img_gen_params == NULL) {
         return NULL;
     }
 
     struct ggml_init_params params;
-    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+    params.mem_size = static_cast<size_t>(20 * 1024 * 1024);  // 20 MB increased by kcpp
     if (sd_version_is_sd3(sd_ctx->sd->version)) {
-        params.mem_size *= 2;
+        params.mem_size *= 2; //readjust by kcpp as above changed
     }
     if (sd_version_is_flux(sd_ctx->sd->version)) {
-        params.mem_size *= 3;
+        params.mem_size *= 3; //readjust by kcpp as above changed
     }
     if (sd_ctx->sd->stacked_id) {
-        params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+        params.mem_size += static_cast<size_t>(15 * 1024 * 1024);  // 15 MB increased by kcpp
     }
     params.mem_size += width * height * 3 * sizeof(float) * 3;
-    params.mem_size *= batch_count;
+    params.mem_size += width * height * 3 * sizeof(float) * 3 * sd_img_gen_params->ref_images_count;
+    params.mem_size *= sd_img_gen_params->batch_count;
     params.mem_buffer = NULL;
     params.no_alloc   = false;
     // LOG_DEBUG("mem_size %u ", params.mem_size);
@@ -1966,175 +2141,198 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         return NULL;
     }
 
-    size_t t0 = ggml_time_ms();
-
+    int64_t seed = sd_img_gen_params->seed;
     if (seed < 0) {
         srand((int)time(NULL));
         seed = rand();
     }
     sd_ctx->sd->rng->manual_seed(seed);
 
-    ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-    ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
+    size_t t0 = ggml_time_ms();
 
-    sd_mask_to_tensor(mask.data, mask_img);
+    ggml_tensor* init_latent   = NULL;
+    ggml_tensor* concat_latent = NULL;
+    ggml_tensor* denoise_mask  = NULL;
+    std::vector<float> sigmas  = sd_ctx->sd->denoiser->get_sigmas(sd_img_gen_params->sample_steps);
 
-    sd_image_to_tensor(init_image.data, init_img);
+    if (sd_img_gen_params->init_image.data) {
+        LOG_INFO("IMG2IMG");
 
-    ggml_tensor* masked_image;
+        size_t t_enc = static_cast<size_t>(sd_img_gen_params->sample_steps * sd_img_gen_params->strength);
+        if (t_enc == sd_img_gen_params->sample_steps)
+            t_enc--;
+        LOG_INFO("target t_enc is %zu steps", t_enc);
+        std::vector<float> sigma_sched;
+        sigma_sched.assign(sigmas.begin() + sd_img_gen_params->sample_steps - t_enc - 1, sigmas.end());
+        sigmas = sigma_sched;
 
-    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
-        int64_t mask_channels = 1;
-        if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-            mask_channels = 8 * 8;  // flatten the whole mask
-        }
-        ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-        sd_apply_mask(init_img, mask_img, masked_img);
-        ggml_tensor* masked_image_0 = NULL;
-        if (!sd_ctx->sd->use_tiny_autoencoder) {
-            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-            masked_image_0       = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-        } else {
-            masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-        }
-        masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], mask_channels + masked_image_0->ne[2], 1);
-        for (int ix = 0; ix < masked_image_0->ne[0]; ix++) {
-            for (int iy = 0; iy < masked_image_0->ne[1]; iy++) {
-                int mx = ix * 8;
-                int my = iy * 8;
-                if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
-                    for (int k = 0; k < masked_image_0->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
-                        ggml_tensor_set_f32(masked_image, v, ix, iy, k);
-                    }
-                    // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
-                    for (int x = 0; x < 8; x++) {
-                        for (int y = 0; y < 8; y++) {
-                            float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
-                            // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
-                            // python code was using "b (h 8) (w 8) -> b (8 8) h w"
-                            ggml_tensor_set_f32(masked_image, m, ix, iy, masked_image_0->ne[2] + x * 8 + y);
+        ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+        ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
+
+        sd_mask_to_tensor(sd_img_gen_params->mask_image.data, mask_img);
+        sd_image_to_tensor(sd_img_gen_params->init_image.data, init_img);
+
+        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+            int64_t mask_channels = 1;
+            if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+                mask_channels = 8 * 8;  // flatten the whole mask
+            }
+            ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+            sd_apply_mask(init_img, mask_img, masked_img);
+            ggml_tensor* masked_latent = NULL;
+            if (!sd_ctx->sd->use_tiny_autoencoder) {
+                ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+                masked_latent        = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+            } else {
+                masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+            }
+            concat_latent = ggml_new_tensor_4d(work_ctx,
+                                               GGML_TYPE_F32,
+                                               masked_latent->ne[0],
+                                               masked_latent->ne[1],
+                                               mask_channels + masked_latent->ne[2],
+                                               1);
+            for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
+                for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
+                    int mx = ix * 8;
+                    int my = iy * 8;
+                    if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+                        for (int k = 0; k < masked_latent->ne[2]; k++) {
+                            float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                            ggml_tensor_set_f32(concat_latent, v, ix, iy, k);
+                        }
+                        // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
+                        for (int x = 0; x < 8; x++) {
+                            for (int y = 0; y < 8; y++) {
+                                float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
+                                // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
+                                // python code was using "b (h 8) (w 8) -> b (8 8) h w"
+                                ggml_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * 8 + y);
+                            }
+                        }
+                    } else {
+                        float m = ggml_tensor_get_f32(mask_img, mx, my);
+                        ggml_tensor_set_f32(concat_latent, m, ix, iy, 0);
+                        for (int k = 0; k < masked_latent->ne[2]; k++) {
+                            float v = ggml_tensor_get_f32(masked_latent, ix, iy, k);
+                            ggml_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
                         }
-                    }
-                } else {
-                    float m = ggml_tensor_get_f32(mask_img, mx, my);
-                    ggml_tensor_set_f32(masked_image, m, ix, iy, 0);
-                    for (int k = 0; k < masked_image_0->ne[2]; k++) {
-                        float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
-                        ggml_tensor_set_f32(masked_image, v, ix, iy, k + mask_channels);
                     }
                 }
             }
         }
-    } else {
-        // LOG_WARN("Inpainting with a base model is not great");
-        masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
-        for (int ix = 0; ix < masked_image->ne[0]; ix++) {
-            for (int iy = 0; iy < masked_image->ne[1]; iy++) {
-                int mx  = ix * 8;
-                int my  = iy * 8;
-                float m = ggml_tensor_get_f32(mask_img, mx, my);
-                ggml_tensor_set_f32(masked_image, m, ix, iy);
+
+        {
+            // LOG_WARN("Inpainting with a base model is not great");
+            denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
+            for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
+                for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
+                    int mx  = ix * 8;
+                    int my  = iy * 8;
+                    float m = ggml_tensor_get_f32(mask_img, mx, my);
+                    ggml_tensor_set_f32(denoise_mask, m, ix, iy);
+                }
             }
         }
-    }
 
-    ggml_tensor* init_latent = NULL;
-    if (!sd_ctx->sd->use_tiny_autoencoder) {
-        ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
-        init_latent          = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-    } else {
-        init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
-    }
-    std::vector<struct ggml_tensor*> kontext_latents = std::vector<struct ggml_tensor*>();
-    if (kontext_imgs) {
-        for (int i = 0; i < kontext_img_count; i++) {
-            ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-            sd_image_to_tensor(kontext_imgs[i].data, img);
-
-            ggml_tensor* latent = NULL;
-            if (!sd_ctx->sd->use_tiny_autoencoder) {
-                ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
-                latent               = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-            } else {
-                latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
-            }
-            kontext_latents.push_back(latent);
+        if (!sd_ctx->sd->use_tiny_autoencoder) {
+            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
+            init_latent          = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+        } else {
+            init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
         }
+    } else {
+        LOG_INFO("TXT2IMG");
+        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+            LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
+        }
+        init_latent = generate_init_latent(sd_ctx, work_ctx, width, height);
     }
 
-    // print_ggml_tensor(init_latent, true);
-    size_t t1 = ggml_time_ms();
-    LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+    if (sd_img_gen_params->ref_images_count > 0) {
+        LOG_INFO("EDIT mode");
+    }
 
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
-    size_t t_enc              = static_cast<size_t>(sample_steps * strength);
-    if (t_enc == sample_steps)
-        t_enc--;
-    LOG_INFO("target t_enc is %zu steps", t_enc);
-    std::vector<float> sigma_sched;
-    sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
+    std::vector<struct ggml_tensor*> ref_latents;
+    for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
+        ggml_tensor* img = ggml_new_tensor_4d(work_ctx,
+                                              GGML_TYPE_F32,
+                                              sd_img_gen_params->ref_images[i].width,
+                                              sd_img_gen_params->ref_images[i].height,
+                                              3,
+                                              1);
+        sd_image_to_tensor(sd_img_gen_params->ref_images[i].data, img);
 
-    sd_image_t* result_images = generate_image(sd_ctx,
-                                               work_ctx,
-                                               init_latent,
-                                               prompt_c_str,
-                                               negative_prompt_c_str,
-                                               clip_skip,
-                                               cfg_scale,
-                                               guidance,
-                                               eta,
-                                               width,
-                                               height,
-                                               sample_method,
-                                               sigma_sched,
-                                               seed,
-                                               batch_count,
-                                               control_cond,
-                                               control_strength,
-                                               style_ratio,
-                                               normalize_input,
-                                               input_id_images_path_c_str,
-                                               kontext_latents,
-                                               skip_layers_vec,
-                                               slg_scale,
-                                               skip_layer_start,
-                                               skip_layer_end,
-                                               masked_image,
-                                               photomaker_references);
+        ggml_tensor* latent = NULL;
+        if (sd_ctx->sd->use_tiny_autoencoder) {
+            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
+        } else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX) {
+            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
+            latent = ggml_view_3d(work_ctx,
+                                  latent,
+                                  latent->ne[0],
+                                  latent->ne[1],
+                                  latent->ne[2] / 2,
+                                  latent->nb[1],
+                                  latent->nb[2],
+                                  0);
+        } else {
+            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
+            latent               = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+        }
+        ref_latents.push_back(latent);
+    }
+
+    if (sd_img_gen_params->init_image.data != NULL || sd_img_gen_params->ref_images_count > 0) {
+        size_t t1 = ggml_time_ms();
+        LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+    }
+
+    sd_image_t* result_images = generate_image_internal(sd_ctx,
+                                                        work_ctx,
+                                                        init_latent,
+                                                        SAFE_STR(sd_img_gen_params->prompt),
+                                                        SAFE_STR(sd_img_gen_params->negative_prompt),
+                                                        sd_img_gen_params->clip_skip,
+                                                        sd_img_gen_params->guidance,
+                                                        sd_img_gen_params->eta,
+                                                        width,
+                                                        height,
+                                                        sd_img_gen_params->sample_method,
+                                                        sigmas,
+                                                        seed,
+                                                        sd_img_gen_params->batch_count,
+                                                        sd_img_gen_params->control_cond,
+                                                        sd_img_gen_params->control_strength,
+                                                        sd_img_gen_params->style_strength,
+                                                        sd_img_gen_params->normalize_input,
+                                                        sd_img_gen_params->input_id_images_path,
+                                                        ref_latents,
+                                                        concat_latent,
+                                                        denoise_mask,
+                                                        kcpp_img_gen_params);
 
     size_t t2 = ggml_time_ms();
 
-    LOG_INFO("img2img completed in %.2fs", (t2 - t0) * 1.0f / 1000);
+    LOG_INFO("generate_image completed in %.2fs", (t2 - t0) * 1.0f / 1000);
 
     return result_images;
 }
 
-SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           int width,
-                           int height,
-                           int video_frames,
-                           int motion_bucket_id,
-                           int fps,
-                           float augmentation_level,
-                           float min_cfg,
-                           float cfg_scale,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed) {
-    if (sd_ctx == NULL) {
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
+    if (sd_ctx == NULL || sd_vid_gen_params == NULL) {
         return NULL;
     }
 
+    int width  = sd_vid_gen_params->width;
+    int height = sd_vid_gen_params->height;
     LOG_INFO("img2vid %dx%d", width, height);
 
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sd_vid_gen_params->sample_steps);
 
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10 MB
-    params.mem_size += width * height * 3 * sizeof(float) * video_frames;
+    params.mem_size += width * height * 3 * sizeof(float) * sd_vid_gen_params->video_frames;
     params.mem_buffer = NULL;
     params.no_alloc   = false;
     // LOG_DEBUG("mem_size %u ", params.mem_size);
@@ -2146,6 +2344,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
         return NULL;
     }
 
+    int64_t seed = sd_vid_gen_params->seed;
     if (seed < 0) {
         seed = (int)time(NULL);
     }
@@ -2155,12 +2354,12 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
     int64_t t0 = ggml_time_ms();
 
     SDCondition cond = sd_ctx->sd->get_svd_condition(work_ctx,
-                                                     init_image,
+                                                     sd_vid_gen_params->init_image,
                                                      width,
                                                      height,
-                                                     fps,
-                                                     motion_bucket_id,
-                                                     augmentation_level);
+                                                     sd_vid_gen_params->fps,
+                                                     sd_vid_gen_params->motion_bucket_id,
+                                                     sd_vid_gen_params->augmentation_level);
 
     auto uc_crossattn = ggml_dup_tensor(work_ctx, cond.c_crossattn);
     ggml_set_f32(uc_crossattn, 0.f);
@@ -2173,7 +2372,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
     SDCondition uncond = SDCondition(uc_crossattn, uc_vector, uc_concat);
 
     int64_t t1 = ggml_time_ms();
-    LOG_INFO("get_learned_condition completed, taking %d ms", t1 - t0);
+    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
     if (sd_ctx->sd->free_params_immediately) {
         sd_ctx->sd->clip_vision->free_params_buffer();
     }
@@ -2182,25 +2381,24 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
     int C                   = 4;
     int W                   = width / 8;
     int H                   = height / 8;
-    struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
+    struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames);
     ggml_set_f32(x_t, 0.f);
 
-    struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, video_frames);
+    struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, sd_vid_gen_params->video_frames);
     ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
 
-    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
+    LOG_INFO("sampling using %s method", sampling_methods_str[sd_vid_gen_params->sample_method]);
     struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
                                                  x_t,
                                                  noise,
                                                  cond,
                                                  uncond,
                                                  {},
+                                                 {},
                                                  0.f,
-                                                 min_cfg,
-                                                 cfg_scale,
+                                                 sd_vid_gen_params->guidance,
                                                  0.f,
-                                                 0.f,
-                                                 sample_method,
+                                                 sd_vid_gen_params->sample_method,
                                                  sigmas,
                                                  -1,
                                                  SDCondition(NULL, NULL, NULL));
@@ -2220,13 +2418,13 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
         return NULL;
     }
 
-    sd_image_t* result_images = (sd_image_t*)calloc(video_frames, sizeof(sd_image_t));
+    sd_image_t* result_images = (sd_image_t*)calloc(sd_vid_gen_params->video_frames, sizeof(sd_image_t));
     if (result_images == NULL) {
         ggml_free(work_ctx);
         return NULL;
     }
 
-    for (size_t i = 0; i < video_frames; i++) {
+    for (size_t i = 0; i < sd_vid_gen_params->video_frames; i++) {
         auto img_i = ggml_view_3d(work_ctx, img, img->ne[0], img->ne[1], img->ne[2], img->nb[1], img->nb[2], img->nb[3] * i);
 
         result_images[i].width   = width;
@@ -2242,131 +2440,3 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
 
     return result_images;
 }
-
-sd_image_t* edit(sd_ctx_t* sd_ctx,
-                 sd_image_t* ref_images,
-                 int ref_images_count,
-                 const char* prompt_c_str,
-                 const char* negative_prompt_c_str,
-                 int clip_skip,
-                 float cfg_scale,
-                 float guidance,
-                 float eta,
-                 int width,
-                 int height,
-                 sample_method_t sample_method,
-                 int sample_steps,
-                 float strength,
-                 int64_t seed,
-                 int batch_count,
-                 const sd_image_t* control_cond,
-                 float control_strength,
-                 float style_ratio,
-                 bool normalize_input,
-                 int* skip_layers         = NULL,
-                 size_t skip_layers_count = 0,
-                 float slg_scale          = 0,
-                 float skip_layer_start   = 0.01,
-                 float skip_layer_end     = 0.2) {
-    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
-    LOG_DEBUG("edit %dx%d", width, height);
-    if (sd_ctx == NULL) {
-        return NULL;
-    }
-    if (ref_images_count <= 0) {
-        LOG_ERROR("ref images count should > 0");
-        return NULL;
-    }
-
-    struct ggml_init_params params;
-    params.mem_size = static_cast<size_t>(30 * 1024 * 1024);  // 10 MB
-    params.mem_size += width * height * 3 * sizeof(float) * 3 * ref_images_count;
-    params.mem_size *= batch_count;
-    params.mem_buffer = NULL;
-    params.no_alloc   = false;
-    // LOG_DEBUG("mem_size %u ", params.mem_size);
-
-    struct ggml_context* work_ctx = ggml_init(params);
-    if (!work_ctx) {
-        LOG_ERROR("ggml_init() failed");
-        return NULL;
-    }
-
-    if (seed < 0) {
-        srand((int)time(NULL));
-        seed = rand();
-    }
-    sd_ctx->sd->rng->manual_seed(seed);
-
-    int C = 4;
-    if (sd_version_is_sd3(sd_ctx->sd->version)) {
-        C = 16;
-    } else if (sd_version_is_flux(sd_ctx->sd->version)) {
-        C = 16;
-    }
-    int W                    = width / 8;
-    int H                    = height / 8;
-    ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
-    if (sd_version_is_sd3(sd_ctx->sd->version)) {
-        ggml_set_f32(init_latent, 0.0609f);
-    } else if (sd_version_is_flux(sd_ctx->sd->version)) {
-        ggml_set_f32(init_latent, 0.1159f);
-    } else {
-        ggml_set_f32(init_latent, 0.f);
-    }
-
-    size_t t0 = ggml_time_ms();
-
-    std::vector<struct ggml_tensor*> ref_latents;
-    for (int i = 0; i < ref_images_count; i++) {
-        ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, ref_images[i].width, ref_images[i].height, 3, 1);
-        sd_image_to_tensor(ref_images[i].data, img);
-
-        ggml_tensor* latent = NULL;
-        if (!sd_ctx->sd->use_tiny_autoencoder) {
-            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
-            latent               = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-        } else {
-            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
-        }
-        ref_latents.push_back(latent);
-    }
-
-    size_t t1 = ggml_time_ms();
-    LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
-
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
-
-    sd_image_t* result_images = generate_image(sd_ctx,
-                                               work_ctx,
-                                               init_latent,
-                                               prompt_c_str,
-                                               negative_prompt_c_str,
-                                               clip_skip,
-                                               cfg_scale,
-                                               guidance,
-                                               eta,
-                                               width,
-                                               height,
-                                               sample_method,
-                                               sigmas,
-                                               seed,
-                                               batch_count,
-                                               control_cond,
-                                               control_strength,
-                                               style_ratio,
-                                               normalize_input,
-                                               "",
-                                               ref_latents,
-                                               skip_layers_vec,
-                                               slg_scale,
-                                               skip_layer_start,
-                                               skip_layer_end,
-                                               NULL);
-
-    size_t t2 = ggml_time_ms();
-
-    LOG_INFO("edit completed in %.2fs", (t2 - t0) * 1.0f / 1000);
-
-    return result_images;
-}
\ No newline at end of file
diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h
index 9d4fcda6e..57603441a 100644
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@@ -30,7 +30,8 @@ extern "C" {
 
 enum rng_type_t {
     STD_DEFAULT_RNG,
-    CUDA_RNG
+    CUDA_RNG,
+    RNG_TYPE_COUNT
 };
 
 enum sample_method_t {
@@ -46,7 +47,7 @@ enum sample_method_t {
     LCM,
     DDIM_TRAILING,
     TCD,
-    N_SAMPLE_METHODS
+    SAMPLE_METHOD_COUNT
 };
 
 enum schedule_t {
@@ -56,15 +57,15 @@ enum schedule_t {
     EXPONENTIAL,
     AYS,
     GITS,
-    N_SCHEDULES
+    SCHEDULE_COUNT
 };
 
 // same as enum ggml_type
 enum sd_type_t {
-    SD_TYPE_F32     = 0,
-    SD_TYPE_F16     = 1,
-    SD_TYPE_Q4_0    = 2,
-    SD_TYPE_Q4_1    = 3,
+    SD_TYPE_F32  = 0,
+    SD_TYPE_F16  = 1,
+    SD_TYPE_Q4_0 = 2,
+    SD_TYPE_Q4_1 = 3,
     // SD_TYPE_Q4_2 = 4, support has been removed
     // SD_TYPE_Q4_3 = 5, support has been removed
     SD_TYPE_Q5_0    = 6,
@@ -92,19 +93,17 @@ enum sd_type_t {
     SD_TYPE_F64     = 28,
     SD_TYPE_IQ1_M   = 29,
     SD_TYPE_BF16    = 30,
-    SD_TYPE_Q4_0_4_4 = 31,
-    SD_TYPE_Q4_0_4_8 = 32,
-    SD_TYPE_Q4_0_8_8 = 33,
-    SD_TYPE_TQ1_0   = 34,
-    SD_TYPE_TQ2_0   = 35,
-    SD_TYPE_IQ4_NL_4_4 = 36,
+    // SD_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+    // SD_TYPE_Q4_0_4_8 = 32,
+    // SD_TYPE_Q4_0_8_8 = 33,
+    SD_TYPE_TQ1_0 = 34,
+    SD_TYPE_TQ2_0 = 35,
+    // SD_TYPE_IQ4_NL_4_4 = 36,
     // SD_TYPE_IQ4_NL_4_8 = 37,
     // SD_TYPE_IQ4_NL_8_8 = 38,
     SD_TYPE_COUNT   = 40,
 };
 
-SD_API const char* sd_type_name(enum sd_type_t type);
-
 enum sd_log_level_t {
     SD_LOG_DEBUG,
     SD_LOG_INFO,
@@ -112,6 +111,105 @@ enum sd_log_level_t {
     SD_LOG_ERROR
 };
 
+typedef struct {
+    const char* model_path;
+    const char* clip_l_path;
+    const char* clip_g_path;
+    const char* t5xxl_path;
+    const char* diffusion_model_path;
+    const char* vae_path;
+    const char* taesd_path;
+    const char* control_net_path;
+    const char* lora_model_dir;
+    const char* embedding_dir;
+    const char* stacked_id_embed_dir;
+    bool vae_decode_only;
+    bool vae_tiling;
+    bool free_params_immediately;
+    int n_threads;
+    enum sd_type_t wtype;
+    enum rng_type_t rng_type;
+    enum schedule_t schedule;
+    bool keep_clip_on_cpu;
+    bool keep_control_net_on_cpu;
+    bool keep_vae_on_cpu;
+    bool diffusion_flash_attn;
+    bool diffusion_conv_direct;
+    bool vae_conv_direct;
+    bool chroma_use_dit_mask;
+    bool chroma_use_t5_mask;
+    int chroma_t5_mask_pad;
+} sd_ctx_params_t;
+
+typedef struct {
+    uint32_t width;
+    uint32_t height;
+    uint32_t channel;
+    uint8_t* data;
+} sd_image_t;
+
+typedef struct {
+    int* layers;
+    size_t layer_count;
+    float layer_start;
+    float layer_end;
+    float scale;
+} sd_slg_params_t;
+
+typedef struct {
+    float txt_cfg;
+    float img_cfg;
+    float min_cfg;
+    float distilled_guidance;
+    sd_slg_params_t slg;
+} sd_guidance_params_t;
+
+typedef struct {
+    const char* prompt;
+    const char* negative_prompt;
+    int clip_skip;
+    sd_guidance_params_t guidance;
+    sd_image_t init_image;
+    sd_image_t* ref_images;
+    int ref_images_count;
+    sd_image_t mask_image;
+    int width;
+    int height;
+    enum sample_method_t sample_method;
+    int sample_steps;
+    float eta;
+    float strength;
+    int64_t seed;
+    int batch_count;
+    const sd_image_t* control_cond;
+    float control_strength;
+    float style_strength;
+    bool normalize_input;
+    const char* input_id_images_path;
+} sd_img_gen_params_t;
+
+typedef struct {
+    sd_image_t* photomaker_references;
+    int photomaker_reference_count;
+} kcpp_img_gen_params_t;
+
+typedef struct {
+    sd_image_t init_image;
+    int width;
+    int height;
+    sd_guidance_params_t guidance;
+    enum sample_method_t sample_method;
+    int sample_steps;
+    float strength;
+    int64_t seed;
+    int video_frames;
+    int motion_bucket_id;
+    int fps;
+    float augmentation_level;
+} sd_vid_gen_params_t;
+
+typedef struct sd_ctx_t sd_ctx_t;
+
 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
 
@@ -120,154 +218,42 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
 SD_API int32_t sd_get_num_physical_cores();
 SD_API const char* sd_get_system_info();
 
-typedef struct {
-    uint32_t width;
-    uint32_t height;
-    uint32_t channel;
-    uint8_t* data;
-} sd_image_t;
+SD_API const char* sd_type_name(enum sd_type_t type);
+SD_API enum sd_type_t str_to_sd_type(const char* str);
+SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
+SD_API enum rng_type_t str_to_rng_type(const char* str);
+SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
+SD_API enum sample_method_t str_to_sample_method(const char* str);
+SD_API const char* sd_schedule_name(enum schedule_t schedule);
+SD_API enum schedule_t str_to_schedule(const char* str);
 
-typedef struct sd_ctx_t sd_ctx_t;
-
-SD_API void set_sd_vae_tiling(sd_ctx_t* ctx, bool tiling);
-SD_API int get_loaded_sd_version(sd_ctx_t* ctx);
-SD_API bool sd_loaded_chroma();
-
-SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
-                            const char* clip_l_path,
-                            const char* clip_g_path,
-                            const char* t5xxl_path,
-                            const char* diffusion_model_path,
-                            const char* vae_path,
-                            const char* taesd_path,
-                            const char* control_net_path_c_str,
-                            const char* lora_model_dir,
-                            const char* embed_dir_c_str,
-                            const char* stacked_id_embed_dir_c_str,
-                            bool vae_decode_only,
-                            bool vae_tiling,
-                            bool free_params_immediately,
-                            int n_threads,
-                            enum sd_type_t wtype,
-                            enum rng_type_t rng_type,
-                            enum schedule_t s,
-                            bool keep_clip_on_cpu,
-                            bool keep_control_net_cpu,
-                            bool keep_vae_on_cpu,
-                            bool diffusion_flash_attn,
-                            bool chroma_use_dit_mask,
-                            bool chroma_use_t5_mask,
-                            int chroma_t5_mask_pad);
+SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
+SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
 
+SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
 
-SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
-                           const char* prompt,
-                           const char* negative_prompt,
-                           int clip_skip,
-                           float cfg_scale,
-                           float guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_strength,
-                           bool normalize_input,
-                           const char* input_id_images_path,
-                           sd_image_t* kontext_imgs,
-                           int kontext_img_count,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end,
-                           const std::vector<sd_image_t*> photomaker_references);
+SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
+SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
+SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, const kcpp_img_gen_params_t* kcpp_img_gen_params);
 
-SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           sd_image_t mask_image,
-                           const char* prompt,
-                           const char* negative_prompt,
-                           int clip_skip,
-                           float cfg_scale,
-                           float guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_strength,
-                           bool normalize_input,
-                           const char* input_id_images_path,
-                           sd_image_t* kontext_imgs,
-                           int kontext_img_count,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end,
-                           const std::vector<sd_image_t*> photomaker_references);
-
-SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           int width,
-                           int height,
-                           int video_frames,
-                           int motion_bucket_id,
-                           int fps,
-                           float augmentation_level,
-                           float min_cfg,
-                           float cfg_scale,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed);
-
-SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
-                        sd_image_t* ref_images,
-                        int ref_images_count,
-                        const char* prompt,
-                        const char* negative_prompt,
-                        int clip_skip,
-                        float cfg_scale,
-                        float guidance,
-                        float eta,
-                        int width,
-                        int height,
-                        enum sample_method_t sample_method,
-                        int sample_steps,
-                        float strength,
-                        int64_t seed,
-                        int batch_count,
-                        const sd_image_t* control_cond,
-                        float control_strength,
-                        float style_strength,
-                        bool normalize_input,
-                        int* skip_layers,
-                        size_t skip_layers_count,
-                        float slg_scale,
-                        float skip_layer_start,
-                        float skip_layer_end);
+SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params);  // broken
 
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
-                                        int n_threads);
+                                        int n_threads,
+                                        bool direct);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
 
 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
 
-SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type);
+SD_API bool convert(const char* input_path,
+                    const char* vae_path,
+                    const char* output_path,
+                    enum sd_type_t output_type,
+                    const char* tensor_type_rules);
 
 SD_API uint8_t* preprocess_canny(uint8_t* img,
                                  int width,
@@ -282,4 +268,4 @@ SD_API uint8_t* preprocess_canny(uint8_t* img,
 }
 #endif
 
-#endif  // __STABLE_DIFFUSION_H__
\ No newline at end of file
+#endif  // __STABLE_DIFFUSION_H__
diff --git a/otherarch/sdcpp/t5.hpp b/otherarch/sdcpp/t5.hpp
index 1861ad478..253b3fbcd 100644
--- a/otherarch/sdcpp/t5.hpp
+++ b/otherarch/sdcpp/t5.hpp
@@ -457,8 +457,8 @@ protected:
     int64_t hidden_size;
     float eps;
 
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F32;
         params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
     }
 
@@ -735,7 +735,7 @@ struct T5Runner : public GGMLRunner {
     std::vector<int> relative_position_bucket_vec;
 
     T5Runner(ggml_backend_t backend,
-             std::map<std::string, enum ggml_type>& tensor_types,
+             const String2GGMLType& tensor_types,
              const std::string prefix,
              int64_t num_layers = 24,
              int64_t model_dim  = 4096,
@@ -876,16 +876,14 @@ struct T5Embedder {
     T5UniGramTokenizer tokenizer;
     T5Runner model;
 
-    static std::map<std::string, enum ggml_type> empty_tensor_types;
-
     T5Embedder(ggml_backend_t backend,
-               std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
-               const std::string prefix                            = "",
-               int64_t num_layers                                  = 24,
-               int64_t model_dim                                   = 4096,
-               int64_t ff_dim                                      = 10240,
-               int64_t num_heads                                   = 64,
-               int64_t vocab_size                                  = 32128)
+               const String2GGMLType& tensor_types = {},
+               const std::string prefix            = "",
+               int64_t num_layers                  = 24,
+               int64_t model_dim                   = 4096,
+               int64_t ff_dim                      = 10240,
+               int64_t num_heads                   = 64,
+               int64_t vocab_size                  = 32128)
         : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) {
     }
 
diff --git a/otherarch/sdcpp/tae.hpp b/otherarch/sdcpp/tae.hpp
index 4c822eaf9..4959bbd08 100644
--- a/otherarch/sdcpp/tae.hpp
+++ b/otherarch/sdcpp/tae.hpp
@@ -149,7 +149,7 @@ public:
                 if (i == 1) {
                     h = ggml_relu_inplace(ctx, h);
                 } else {
-                    h = ggml_upscale(ctx, h, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
+                    h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
                 }
                 continue;
             }
@@ -196,7 +196,7 @@ struct TinyAutoEncoder : public GGMLRunner {
     bool decode_only = false;
 
     TinyAutoEncoder(ggml_backend_t backend,
-                    std::map<std::string, enum ggml_type>& tensor_types,
+                    const String2GGMLType& tensor_types,
                     const std::string prefix,
                     bool decoder_only = true,
                     SDVersion version = VERSION_SD1)
@@ -206,6 +206,17 @@ struct TinyAutoEncoder : public GGMLRunner {
         taesd.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        taesd.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "taesd";
     }
diff --git a/otherarch/sdcpp/unet.hpp b/otherarch/sdcpp/unet.hpp
index d52d71285..696bc6dfa 100644
--- a/otherarch/sdcpp/unet.hpp
+++ b/otherarch/sdcpp/unet.hpp
@@ -166,7 +166,6 @@ public:
 // ldm.modules.diffusionmodules.openaimodel.UNetModel
 class UnetModelBlock : public GGMLBlock {
 protected:
-    static std::map<std::string, enum ggml_type> empty_tensor_types;
     SDVersion version = VERSION_SD1;
     // network hparams
     int in_channels                        = 4;
@@ -184,7 +183,7 @@ public:
     int model_channels  = 320;
     int adm_in_channels = 2816;  // only for VERSION_SDXL/SVD
 
-    UnetModelBlock(SDVersion version = VERSION_SD1, std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types, bool flash_attn = false)
+    UnetModelBlock(SDVersion version = VERSION_SD1, const String2GGMLType& tensor_types = {}, bool flash_attn = false)
         : version(version) {
         if (sd_version_is_sd2(version)) {
             context_dim       = 1024;
@@ -207,6 +206,8 @@ public:
         }
         if (sd_version_is_inpaint(version)) {
             in_channels = 9;
+        } else if (sd_version_is_unet_edit(version)) {
+            in_channels = 8;
         }
 
         // dims is always 2
@@ -537,7 +538,7 @@ struct UNetModelRunner : public GGMLRunner {
     UnetModelBlock unet;
 
     UNetModelRunner(ggml_backend_t backend,
-                    std::map<std::string, enum ggml_type>& tensor_types,
+                    const String2GGMLType& tensor_types,
                     const std::string prefix,
                     SDVersion version = VERSION_SD1,
                     bool flash_attn   = false)
@@ -545,6 +546,18 @@ struct UNetModelRunner : public GGMLRunner {
         unet.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        unet.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                LOG_DEBUG("block %s", block->get_desc().c_str());
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "unet";
     }
@@ -657,4 +670,4 @@ struct UNetModelRunner : public GGMLRunner {
     }
 };
 
-#endif  // __UNET_HPP__
\ No newline at end of file
+#endif  // __UNET_HPP__
diff --git a/otherarch/sdcpp/upscaler.cpp b/otherarch/sdcpp/upscaler.cpp
index 8907e69c3..599f263f9 100644
--- a/otherarch/sdcpp/upscaler.cpp
+++ b/otherarch/sdcpp/upscaler.cpp
@@ -9,9 +9,12 @@ struct UpscalerGGML {
     std::shared_ptr<ESRGAN> esrgan_upscaler;
     std::string esrgan_path;
     int n_threads;
+    bool direct = false;
 
-    UpscalerGGML(int n_threads)
-        : n_threads(n_threads) {
+    UpscalerGGML(int n_threads,
+                 bool direct = false)
+        : n_threads(n_threads),
+          direct(direct) {
     }
 
     bool load_from_file(const std::string& esrgan_path) {
@@ -21,12 +24,17 @@ struct UpscalerGGML {
 #endif
 #ifdef SD_USE_METAL
         LOG_DEBUG("Using Metal backend");
+        ggml_log_set(ggml_log_callback_default, nullptr);
         backend = ggml_backend_metal_init();
 #endif
 #ifdef SD_USE_VULKAN
         LOG_DEBUG("Using Vulkan backend");
         backend = ggml_backend_vk_init(0);
 #endif
+#ifdef SD_USE_OPENCL
+        LOG_DEBUG("Using OpenCL backend");
+        backend = ggml_backend_opencl_init();
+#endif
 #ifdef SD_USE_SYCL
         LOG_DEBUG("Using SYCL backend");
         backend = ggml_backend_sycl_init(0);
@@ -42,6 +50,9 @@ struct UpscalerGGML {
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
         esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_loader.tensor_storages_types);
+        if (direct) {
+            esrgan_upscaler->enable_conv2d_direct();
+        }
         if (!esrgan_upscaler->load_from_file(esrgan_path)) {
             return false;
         }
@@ -99,14 +110,15 @@ struct upscaler_ctx_t {
 };
 
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
-                                 int n_threads) {
+                                 int n_threads,
+                                 bool direct = false) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == NULL) {
         return NULL;
     }
     std::string esrgan_path(esrgan_path_c_str);
 
-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
     if (upscaler_ctx->upscaler == NULL) {
         return NULL;
     }
diff --git a/otherarch/sdcpp/util.cpp b/otherarch/sdcpp/util.cpp
index d5cd8e229..829099487 100644
--- a/otherarch/sdcpp/util.cpp
+++ b/otherarch/sdcpp/util.cpp
@@ -102,19 +102,32 @@ std::vector<std::string> get_files_from_dir(const std::string& dir) {
     sprintf(directoryPath, "%s\\%s\\*", currentDirectory, dir.c_str());
 
     // Find the first file in the directory
-    hFind = FindFirstFile(directoryPath, &findFileData);
-
+    hFind               = FindFirstFile(directoryPath, &findFileData);
+    bool isAbsolutePath = false;
     // Check if the directory was found
     if (hFind == INVALID_HANDLE_VALUE) {
-        printf("Unable to find directory.\n");
-        return files;
+        printf("Unable to find directory. Try with original path \n");
+
+        char directoryPathAbsolute[MAX_PATH];
+        sprintf(directoryPathAbsolute, "%s*", dir.c_str());
+
+        hFind          = FindFirstFile(directoryPathAbsolute, &findFileData);
+        isAbsolutePath = true;
+        if (hFind == INVALID_HANDLE_VALUE) {
+            printf("Absolute path was also wrong.\n");
+            return files;
+        }
     }
 
     // Loop through all files in the directory
     do {
         // Check if the found file is a regular file (not a directory)
         if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
-            files.push_back(std::string(currentDirectory) + "\\" + dir + "\\" + std::string(findFileData.cFileName));
+            if (isAbsolutePath) {
+                files.push_back(dir + "\\" + std::string(findFileData.cFileName));
+            } else {
+                files.push_back(std::string(currentDirectory) + "\\" + dir + "\\" + std::string(findFileData.cFileName));
+            }
         }
     } while (FindNextFile(hFind, &findFileData) != 0);
 
@@ -447,10 +460,6 @@ const char* sd_get_system_info() {
     return buffer;
 }
 
-const char* sd_type_name(enum sd_type_t type) {
-    return ggml_type_name((ggml_type)type);
-}
-
 sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
     sd_image_f32_t converted_image;
     converted_image.width   = image.width;
diff --git a/otherarch/sdcpp/util.h b/otherarch/sdcpp/util.h
index 607ea1bb1..7f0cb337d 100644
--- a/otherarch/sdcpp/util.h
+++ b/otherarch/sdcpp/util.h
@@ -7,6 +7,9 @@
 
 #include "stable-diffusion.h"
 
+#define SAFE_STR(s) ((s) ? (s) : "")
+#define BOOL_STR(b) ((b) ? "true" : "false")
+
 bool ends_with(const std::string& str, const std::string& ending);
 bool starts_with(const std::string& str, const std::string& start);
 bool contains(const std::string& str, const std::string& substr);
diff --git a/otherarch/sdcpp/vae.hpp b/otherarch/sdcpp/vae.hpp
index 4add881f6..bdf160bb8 100644
--- a/otherarch/sdcpp/vae.hpp
+++ b/otherarch/sdcpp/vae.hpp
@@ -163,8 +163,8 @@ public:
 
 class VideoResnetBlock : public ResnetBlock {
 protected:
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = (tensor_types.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_types, GGML_TYPE_F32);
         params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
     }
 
@@ -525,7 +525,7 @@ struct AutoEncoderKL : public GGMLRunner {
     AutoencodingEngine ae;
 
     AutoEncoderKL(ggml_backend_t backend,
-                  std::map<std::string, enum ggml_type>& tensor_types,
+                  const String2GGMLType& tensor_types,
                   const std::string prefix,
                   bool decode_only       = false,
                   bool use_video_decoder = false,
@@ -534,6 +534,17 @@ struct AutoEncoderKL : public GGMLRunner {
         ae.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        ae.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "vae";
     }