WIP: update stable-diffusion.cpp to 5900ef6605c6 (new API) (#1669)

* Update stable-diffusion.cpp to 5900ef6605c6 (new API) * Clean up pending LoRA code and simplify LoRA changes to upstream * Move VAE tiling disabling for TAESD to sdtype_adapter.cpp * Move auxiliary ctx functions to sdtype_adapter.cpp * Use ref_images parameter for Kontext images * Drop clip skip workaround (fixed upstream) * Workaround for flash attention with img2img leejet/stable-diffusion.cpp#756 * Workaround for Chroma with flash attention, debug prints * Disable forcing CLIP weights to F32 for reduced memory usage
2026-05-21 18:52:02 +00:00 · 2025-08-12 12:25:02 -03:00 · 2025-08-12 12:25:02 -03:00 · 5de7ed3d56
commit 5de7ed3d56
parent 7b5cf7143f
26 changed files with 2255 additions and 1870 deletions
--- a/otherarch/sdcpp/clip.hpp
+++ b/otherarch/sdcpp/clip.hpp
@ -545,9 +545,15 @@ protected:
    int64_t vocab_size;
    int64_t num_positions;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type token_wtype    = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
-        enum ggml_type position_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type token_wtype    = GGML_TYPE_F32;
+        #if 1
+        // kcpp reduce memory usage (reverts https://github.com/leejet/stable-diffusion.cpp/pull/601)
+        auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+        if (tensor_type != tensor_types.end())
+            token_wtype = tensor_type->second;
+        #endif
+        enum ggml_type position_wtype = GGML_TYPE_F32;

        params["token_embedding.weight"]    = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
        params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
@ -594,10 +600,10 @@ protected:
    int64_t image_size;
    int64_t num_patches;
    int64_t num_positions;
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type patch_wtype    = GGML_TYPE_F16;  // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
-        enum ggml_type class_wtype    = GGML_TYPE_F32;  // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
-        enum ggml_type position_wtype = GGML_TYPE_F32;  // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type patch_wtype    = GGML_TYPE_F16;
+        enum ggml_type class_wtype    = GGML_TYPE_F32;
+        enum ggml_type position_wtype = GGML_TYPE_F32;

        params["patch_embedding.weight"]    = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
        params["class_embedding"]           = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
@ -657,9 +663,9 @@ enum CLIPVersion {

 class CLIPTextModel : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
        if (version == OPEN_CLIP_VIT_BIGG_14) {
-            enum ggml_type wtype      = GGML_TYPE_F32;  // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
+            enum ggml_type wtype      = GGML_TYPE_F32;
            params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
        }
    }
@ -678,8 +684,8 @@ public:
    bool with_final_ln        = true;

    CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  int clip_skip_value = -1,
-                  bool with_final_ln  = true)
+                  bool with_final_ln  = true,
+                  int clip_skip_value = -1)
        : version(version), with_final_ln(with_final_ln) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size       = 1024;
@ -701,7 +707,7 @@ public:

    void set_clip_skip(int skip) {
        if (skip <= 0) {
-            return;
+            skip = -1;
        }
        clip_skip = skip;
    }
@ -805,8 +811,8 @@ protected:
    int64_t out_features;
    bool transpose_weight;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
        if (transpose_weight) {
            params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
        } else {
@ -868,12 +874,12 @@ struct CLIPTextModelRunner : public GGMLRunner {
    CLIPTextModel model;

    CLIPTextModelRunner(ggml_backend_t backend,
-                        std::map<std::string, enum ggml_type>& tensor_types,
+                        const String2GGMLType& tensor_types,
                        const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        int clip_skip_value = 1,
-                        bool with_final_ln  = true)
-        : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
+                        bool with_final_ln  = true,
+                        int clip_skip_value = -1)
+        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
        model.init(params_ctx, tensor_types, prefix);
    }

@ -949,4 +955,4 @@ struct CLIPTextModelRunner : public GGMLRunner {
    }
 };

-#endif  // __CLIP_HPP__
+#endif  // __CLIP_HPP__
--- a/otherarch/sdcpp/common.hpp
+++ b/otherarch/sdcpp/common.hpp
@ -56,8 +56,8 @@ public:
        // x: [N, channels, h, w]
        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

-        x = ggml_upscale(ctx, x, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
-        x = conv->forward(ctx, x);    // [N, out_channels, h*2, w*2]
+        x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
+        x = conv->forward(ctx, x);                             // [N, out_channels, h*2, w*2]
        return x;
    }
 };
@ -182,9 +182,9 @@ protected:
    int64_t dim_in;
    int64_t dim_out;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
-        enum ggml_type wtype      = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32;
-        enum ggml_type bias_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {  // registers the "proj.weight" and "proj.bias" tensors in params
+        enum ggml_type wtype      = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);  // dtype resolved from the prefixed tensor name; F32 is passed last (presumably the fallback when the name is absent — confirm in get_type)
+        enum ggml_type bias_wtype = GGML_TYPE_F32;  // bias dtype is fixed to F32 and never looked up in tensor_types
        params["proj.weight"]     = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);  // 2-D tensor with dims (dim_in, dim_out * 2)
        params["proj.bias"]       = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);  // 1-D tensor with dim_out * 2 elements
    }
@ -440,9 +440,9 @@ public:

 class AlphaBlender : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
        // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.ypes.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
+        enum ggml_type wtype = GGML_TYPE_F32;
        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
    }

--- a/otherarch/sdcpp/conditioner.hpp
+++ b/otherarch/sdcpp/conditioner.hpp
@ -57,29 +57,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
    std::vector<std::string> readed_embeddings;

    FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
-                                      std::map<std::string, enum ggml_type>& tensor_types,
+                                      const String2GGMLType& tensor_types,
                                      const std::string& embd_dir,
                                      SDVersion version = VERSION_SD1,
                                      PMVersion pv      = PM_VERSION_1,
                                      int clip_skip     = -1)
        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        if (sd_version_is_sd1(version)) {
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+        } else if (sd_version_is_sd2(version)) {
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+        } else if (sd_version_is_sdxl(version)) {
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+        }
+        set_clip_skip(clip_skip);
+    }
+
+    void set_clip_skip(int clip_skip) {
        if (clip_skip <= 0) {
            clip_skip = 1;
            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
                clip_skip = 2;
            }
        }
-        if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
-        } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
-        } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        }
-    }
-
-    void set_clip_skip(int clip_skip) {
        text_model->set_clip_skip(clip_skip);
        if (sd_version_is_sdxl(version)) {
            text_model2->set_clip_skip(clip_skip);
@ -458,8 +459,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                if (sd_version_is_sdxl(version)) {
                    text_model2->compute(n_threads,
                                         input_ids2,
-                                         0,
-                                         NULL,
+                                         num_custom_embeddings,
+                                         token_embed_custom.data(),
                                         max_token_idx,
                                         false,
                                         &chunk_hidden_states2, work_ctx);
@ -469,8 +470,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                    if (chunk_idx == 0) {
                        text_model2->compute(n_threads,
                                             input_ids2,
-                                             0,
-                                             NULL,
+                                             num_custom_embeddings,
+                                             token_embed_custom.data(),
                                             max_token_idx,
                                             true,
                                             &pooled,
@ -617,7 +618,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
    CLIPVisionModelProjection vision_model;

-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
        : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
        vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
    }
@ -662,18 +663,19 @@ struct SD3CLIPEmbedder : public Conditioner {
    std::shared_ptr<T5Runner> t5;

    SD3CLIPEmbedder(ggml_backend_t backend,
-                    std::map<std::string, enum ggml_type>& tensor_types,
-                    int clip_skip = -1)
+                    const String2GGMLType& tensor_types = {},
+                    int clip_skip                       = -1)
        : clip_g_tokenizer(0) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        set_clip_skip(clip_skip);
    }

    void set_clip_skip(int clip_skip) {  // forwards the clip-skip setting to both CLIP text encoders (clip_l and clip_g)
+        if (clip_skip <= 0) {  // non-positive input (e.g. the constructor default of -1) is replaced by 2
+            clip_skip = 2;
+        }
        clip_l->set_clip_skip(clip_skip);
        clip_g->set_clip_skip(clip_skip);  // t5 is not touched by this method
    }
@ -1008,16 +1010,17 @@ struct FluxCLIPEmbedder : public Conditioner {
    size_t chunk_len = 256;

    FluxCLIPEmbedder(ggml_backend_t backend,
-                     std::map<std::string, enum ggml_type>& tensor_types,
-                     int clip_skip = -1) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
+                     const String2GGMLType& tensor_types = {},
+                     int clip_skip                       = -1) {
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
        t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        set_clip_skip(clip_skip);
    }

    void set_clip_skip(int clip_skip) {  // forwards the clip-skip setting to the clip_l text encoder
+        if (clip_skip <= 0) {  // non-positive input (e.g. the constructor default of -1) is replaced by 2
+            clip_skip = 2;
+        }
        clip_l->set_clip_skip(clip_skip);  // t5 is not touched by this method
    }

@ -1228,10 +1231,10 @@ struct PixArtCLIPEmbedder : public Conditioner {
    int mask_pad     = 1;

    PixArtCLIPEmbedder(ggml_backend_t backend,
-                       std::map<std::string, enum ggml_type>& tensor_types,
-                       int clip_skip = -1,
-                       bool use_mask = false,
-                       int mask_pad  = 1)
+                       const String2GGMLType& tensor_types = {},
+                       int clip_skip                       = -1,
+                       bool use_mask                       = false,
+                       int mask_pad                        = 1)
        : use_mask(use_mask), mask_pad(mask_pad) {
        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
    }
@ -1422,4 +1425,4 @@ struct PixArtCLIPEmbedder : public Conditioner {
    }
 };

-#endif
+#endif
--- a/otherarch/sdcpp/control.hpp
+++ b/otherarch/sdcpp/control.hpp
@ -317,12 +317,23 @@ struct ControlNet : public GGMLRunner {
    bool guided_hint_cached         = false;

    ControlNet(ggml_backend_t backend,
-               std::map<std::string, enum ggml_type>& tensor_types,
-               SDVersion version = VERSION_SD1)
+               const String2GGMLType& tensor_types = {},
+               SDVersion version                   = VERSION_SD1)
        : GGMLRunner(backend), control_net(version) {
        control_net.init(params_ctx, tensor_types, "");
    }

+    void enable_conv2d_direct() {  // calls enable_direct() on every block of control_net whose description is "Conv2d"
+        std::vector<GGMLBlock*> blocks;
+        control_net.get_all_blocks(blocks);  // presumably appends the network's blocks to `blocks` — confirm in get_all_blocks
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {  // blocks are selected by their description string, not by dynamic type
+                auto conv_block = (Conv2d*)block;  // NOTE(review): unchecked C-style downcast; safe only if get_desc() returns "Conv2d" exclusively for Conv2d instances — confirm
+                conv_block->enable_direct();
+            }
+        }
+    }
+
    ~ControlNet() {
        free_control_ctx();
    }
--- a/otherarch/sdcpp/denoiser.hpp
+++ b/otherarch/sdcpp/denoiser.hpp
@ -168,24 +168,21 @@ struct AYSSchedule : SigmaSchedule {
        std::vector<float> inputs;
        std::vector<float> results(n + 1);

-        switch (version) {
-            case VERSION_SD2: /* fallthrough */
-                LOG_WARN("AYS not designed for SD2.X models");
-            case VERSION_SD1:
-                LOG_INFO("AYS using SD1.5 noise levels");
-                inputs = noise_levels[0];
-                break;
-            case VERSION_SDXL:
-                LOG_INFO("AYS using SDXL noise levels");
-                inputs = noise_levels[1];
-                break;
-            case VERSION_SVD:
-                LOG_INFO("AYS using SVD noise levels");
-                inputs = noise_levels[2];
-                break;
-            default:
-                LOG_ERROR("Version not compatable with AYS scheduler");
-                return results;
+        if (sd_version_is_sd2((SDVersion)version)) {  // SD2.x has no dedicated AYS reference table
+            LOG_WARN("AYS not designed for SD2.X models");
+        }  // warning only: SD2.x then takes the SD1.5 branch below, matching the fallthrough of the switch this chain replaced
+        if (sd_version_is_sd2((SDVersion)version) || sd_version_is_sd1((SDVersion)version)) {
+            LOG_INFO("AYS using SD1.5 noise levels");
+            inputs = noise_levels[0];
+        } else if (sd_version_is_sdxl((SDVersion)version)) {
+            LOG_INFO("AYS using SDXL noise levels");
+            inputs = noise_levels[1];
+        } else if (version == VERSION_SVD) {
+            LOG_INFO("AYS using SVD noise levels");
+            inputs = noise_levels[2];
+        } else {  // no reference levels exist for this model family
+            LOG_ERROR("Version not compatible with AYS scheduler");
+            return results;  // give up early, returning the n + 1 value-initialized entries
+        }

        /* Stretches those pre-calculated reference levels out to the desired
@ -346,6 +343,32 @@ struct CompVisVDenoiser : public CompVisDenoiser {
    }
 };

+struct EDMVDenoiser : public CompVisVDenoiser {  // CompVisVDenoiser variant that installs an ExponentialSchedule and uses configurable sigma bounds
+    float min_sigma = 0.002;  // lower bound, returned by sigma_min()
+    float max_sigma = 120.0;  // upper bound, returned by sigma_max()
+
+    EDMVDenoiser(float min_sigma = 0.002, float max_sigma = 120.0)  // parameter defaults repeat the in-class initializers above
+        : min_sigma(min_sigma), max_sigma(max_sigma) {
+        schedule = std::make_shared<ExponentialSchedule>();  // `schedule` is not declared in this struct (presumably inherited — confirm in the base class)
+    }
+
+    float t_to_sigma(float t) {  // sigma = exp(4 * t / TIMESTEPS)
+        return std::exp(t * 4 / (float)TIMESTEPS);
+    }
+
+    float sigma_to_t(float s) {  // t = 0.25 * ln(s); NOTE(review): this is not the inverse of t_to_sigma unless TIMESTEPS == 1 — confirm the asymmetry is intended
+        return 0.25 * std::log(s);
+    }
+
+    float sigma_min() {  // smallest sigma, as configured at construction
+        return min_sigma;
+    }
+
+    float sigma_max() {  // largest sigma, as configured at construction
+        return max_sigma;
+    }
+};
+
 float time_snr_shift(float alpha, float t) {
    if (alpha == 1.0f) {
        return t;
@ -1019,7 +1042,7 @@ static void sample_k_diffusion(sample_method_t method,
            // also needed to invert the behavior of CompVisDenoiser
            // (k-diffusion's LMSDiscreteScheduler)
            float beta_start = 0.00085f;
-            float beta_end = 0.0120f;
+            float beta_end   = 0.0120f;
            std::vector<double> alphas_cumprod;
            std::vector<double> compvis_sigmas;

@ -1030,8 +1053,9 @@ static void sample_k_diffusion(sample_method_t method,
                    (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
                    (1.0f -
                     std::pow(sqrtf(beta_start) +
-                              (sqrtf(beta_end) - sqrtf(beta_start)) *
-                              ((float)i / (TIMESTEPS - 1)), 2));
+                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
+                                      ((float)i / (TIMESTEPS - 1)),
+                              2));
                compvis_sigmas[i] =
                    std::sqrt((1 - alphas_cumprod[i]) /
                              alphas_cumprod[i]);
@ -1061,7 +1085,8 @@ static void sample_k_diffusion(sample_method_t method,
                // - pred_prev_sample -> "x_t-1"
                int timestep =
                    roundf(TIMESTEPS -
-                           i * ((float)TIMESTEPS / steps)) - 1;
+                           i * ((float)TIMESTEPS / steps)) -
+                    1;
                // 1. get previous step value (=t-1)
                int prev_timestep = timestep - TIMESTEPS / steps;
                // The sigma here is chosen to cause the
@ -1086,10 +1111,9 @@ static void sample_k_diffusion(sample_method_t method,
                    float* vec_x = (float*)x->data;
                    for (int j = 0; j < ggml_nelements(x); j++) {
                        vec_x[j] *= std::sqrt(sigma * sigma + 1) /
-                            sigma;
+                                    sigma;
                    }
-                }
-                else {
+                } else {
                    // For the subsequent steps after the first one,
                    // at this point x = latents or x = sample, and
                    // needs to be prescaled with x <- sample / c_in
@ -1127,9 +1151,8 @@ static void sample_k_diffusion(sample_method_t method,
                float alpha_prod_t = alphas_cumprod[timestep];
                // Note final_alpha_cumprod = alphas_cumprod[0] due to
                // trailing timestep spacing
-                float alpha_prod_t_prev = prev_timestep >= 0 ?
-                    alphas_cumprod[prev_timestep] : alphas_cumprod[0];
-                float beta_prod_t = 1 - alpha_prod_t;
+                float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
+                float beta_prod_t       = 1 - alpha_prod_t;
                // 3. compute predicted original sample from predicted
                // noise also called "predicted x_0" of formula (12)
                // from https://arxiv.org/pdf/2010.02502.pdf
@ -1145,7 +1168,7 @@ static void sample_k_diffusion(sample_method_t method,
                        vec_pred_original_sample[j] =
                            (vec_x[j] / std::sqrt(sigma * sigma + 1) -
                             std::sqrt(beta_prod_t) *
-                             vec_model_output[j]) *
+                                 vec_model_output[j]) *
                            (1 / std::sqrt(alpha_prod_t));
                    }
                }
@ -1159,8 +1182,8 @@ static void sample_k_diffusion(sample_method_t method,
                // sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) *
                // sqrt(1 - alpha_t/alpha_t-1)
                float beta_prod_t_prev = 1 - alpha_prod_t_prev;
-                float variance = (beta_prod_t_prev / beta_prod_t) *
-                    (1 - alpha_prod_t / alpha_prod_t_prev);
+                float variance         = (beta_prod_t_prev / beta_prod_t) *
+                                 (1 - alpha_prod_t / alpha_prod_t_prev);
                float std_dev_t = eta * std::sqrt(variance);
                // 6. compute "direction pointing to x_t" of formula
                // (12) from https://arxiv.org/pdf/2010.02502.pdf
@ -1179,8 +1202,8 @@ static void sample_k_diffusion(sample_method_t method,
                                      std::pow(std_dev_t, 2)) *
                            vec_model_output[j];
                        vec_x[j] = std::sqrt(alpha_prod_t_prev) *
-                            vec_pred_original_sample[j] +
-                            pred_sample_direction;
+                                       vec_pred_original_sample[j] +
+                                   pred_sample_direction;
                    }
                }
                if (eta > 0) {
@ -1208,7 +1231,7 @@ static void sample_k_diffusion(sample_method_t method,
            // by Semi-Linear Consistency Function with Trajectory
            // Mapping", arXiv:2402.19159 [cs.CV]
            float beta_start = 0.00085f;
-            float beta_end = 0.0120f;
+            float beta_end   = 0.0120f;
            std::vector<double> alphas_cumprod;
            std::vector<double> compvis_sigmas;

@ -1219,8 +1242,9 @@ static void sample_k_diffusion(sample_method_t method,
                    (i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
                    (1.0f -
                     std::pow(sqrtf(beta_start) +
-                              (sqrtf(beta_end) - sqrtf(beta_start)) *
-                              ((float)i / (TIMESTEPS - 1)), 2));
+                                  (sqrtf(beta_end) - sqrtf(beta_start)) *
+                                      ((float)i / (TIMESTEPS - 1)),
+                              2));
                compvis_sigmas[i] =
                    std::sqrt((1 - alphas_cumprod[i]) /
                              alphas_cumprod[i]);
@ -1235,13 +1259,10 @@ static void sample_k_diffusion(sample_method_t method,
            for (int i = 0; i < steps; i++) {
                // Analytic form for TCD timesteps
                int timestep = TIMESTEPS - 1 -
-                    (TIMESTEPS / original_steps) *
-                    (int)floor(i * ((float)original_steps / steps));
+                               (TIMESTEPS / original_steps) *
+                                   (int)floor(i * ((float)original_steps / steps));
                // 1. get previous step value
-                int prev_timestep = i >= steps - 1 ? 0 :
-                    TIMESTEPS - 1 - (TIMESTEPS / original_steps) *
-                    (int)floor((i + 1) *
-                               ((float)original_steps / steps));
+                int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps));
                // Here timestep_s is tau_n' in Algorithm 4. The _s
                // notation appears to be that from C. Lu,
                // "DPM-Solver: A Fast ODE Solver for Diffusion
@ -1258,10 +1279,9 @@ static void sample_k_diffusion(sample_method_t method,
                    float* vec_x = (float*)x->data;
                    for (int j = 0; j < ggml_nelements(x); j++) {
                        vec_x[j] *= std::sqrt(sigma * sigma + 1) /
-                            sigma;
+                                    sigma;
                    }
-                }
-                else {
+                } else {
                    float* vec_x = (float*)x->data;
                    for (int j = 0; j < ggml_nelements(x); j++) {
                        vec_x[j] *= std::sqrt(sigma * sigma + 1);
@ -1294,15 +1314,14 @@ static void sample_k_diffusion(sample_method_t method,
                // DPM-Solver. In fact, we have alpha_{t_n} =
                // \sqrt{\hat{alpha_n}}, [...]"
                float alpha_prod_t = alphas_cumprod[timestep];
-                float beta_prod_t = 1 - alpha_prod_t;
+                float beta_prod_t  = 1 - alpha_prod_t;
                // Note final_alpha_cumprod = alphas_cumprod[0] since
                // TCD is always "trailing"
-                float alpha_prod_t_prev = prev_timestep >= 0 ?
-                    alphas_cumprod[prev_timestep] : alphas_cumprod[0];
+                float alpha_prod_t_prev = prev_timestep >= 0 ? alphas_cumprod[prev_timestep] : alphas_cumprod[0];
                // The subscript _s are the only portion in this
                // section (2) unique to TCD
                float alpha_prod_s = alphas_cumprod[timestep_s];
-                float beta_prod_s = 1 - alpha_prod_s;
+                float beta_prod_s  = 1 - alpha_prod_s;
                // 3. Compute the predicted noised sample x_s based on
                // the model parameterization
                //
@ -1317,7 +1336,7 @@ static void sample_k_diffusion(sample_method_t method,
                        vec_pred_original_sample[j] =
                            (vec_x[j] / std::sqrt(sigma * sigma + 1) -
                             std::sqrt(beta_prod_t) *
-                             vec_model_output[j]) *
+                                 vec_model_output[j]) *
                            (1 / std::sqrt(alpha_prod_t));
                    }
                }
@ -1339,9 +1358,9 @@ static void sample_k_diffusion(sample_method_t method,
                        // pred_epsilon = model_output
                        vec_x[j] =
                            std::sqrt(alpha_prod_s) *
-                            vec_pred_original_sample[j] +
+                                vec_pred_original_sample[j] +
                            std::sqrt(beta_prod_s) *
-                            vec_model_output[j];
+                                vec_model_output[j];
                    }
                }
                // 4. Sample and inject noise z ~ N(0, I) for
@ -1357,7 +1376,7 @@ static void sample_k_diffusion(sample_method_t method,
                    // In this case, x is still pred_noised_sample,
                    // continue in-place
                    ggml_tensor_set_f32_randn(noise, rng);
-                    float* vec_x = (float*)x->data;
+                    float* vec_x     = (float*)x->data;
                    float* vec_noise = (float*)noise->data;
                    for (int j = 0; j < ggml_nelements(x); j++) {
                        // Corresponding to (35) in Zheng et
@ -1366,10 +1385,10 @@ static void sample_k_diffusion(sample_method_t method,
                        vec_x[j] =
                            std::sqrt(alpha_prod_t_prev /
                                      alpha_prod_s) *
-                            vec_x[j] +
+                                vec_x[j] +
                            std::sqrt(1 - alpha_prod_t_prev /
-                                      alpha_prod_s) *
-                            vec_noise[j];
+                                              alpha_prod_s) *
+                                vec_noise[j];
                    }
                }
            }
@ -1381,4 +1400,4 @@ static void sample_k_diffusion(sample_method_t method,
    }
 }

-#endif  // __DENOISER_HPP__
+#endif  // __DENOISER_HPP__
--- a/otherarch/sdcpp/diffusion_model.hpp
+++ b/otherarch/sdcpp/diffusion_model.hpp
@ -13,7 +13,7 @@ struct DiffusionModel {
                         struct ggml_tensor* c_concat,
                         struct ggml_tensor* y,
                         struct ggml_tensor* guidance,
-                         std::vector<ggml_tensor*> ref_latents = {},
+                         std::vector<ggml_tensor*> ref_latents     = {},
                         int num_video_frames                      = -1,
                         std::vector<struct ggml_tensor*> controls = {},
                         float control_strength                    = 0.f,
@ -32,9 +32,9 @@ struct UNetModel : public DiffusionModel {
    UNetModelRunner unet;

    UNetModel(ggml_backend_t backend,
-              std::map<std::string, enum ggml_type>& tensor_types,
-              SDVersion version = VERSION_SD1,
-              bool flash_attn   = false)
+              const String2GGMLType& tensor_types = {},
+              SDVersion version                   = VERSION_SD1,
+              bool flash_attn                     = false)
        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
    }

@ -69,7 +69,7 @@ struct UNetModel : public DiffusionModel {
                 struct ggml_tensor* c_concat,
                 struct ggml_tensor* y,
                 struct ggml_tensor* guidance,
-                 std::vector<ggml_tensor*> ref_latents = {},
+                 std::vector<ggml_tensor*> ref_latents     = {},
                 int num_video_frames                      = -1,
                 std::vector<struct ggml_tensor*> controls = {},
                 float control_strength                    = 0.f,
@ -85,7 +85,7 @@ struct MMDiTModel : public DiffusionModel {
    MMDiTRunner mmdit;

    MMDiTModel(ggml_backend_t backend,
-               std::map<std::string, enum ggml_type>& tensor_types)
+               const String2GGMLType& tensor_types = {})
        : mmdit(backend, tensor_types, "model.diffusion_model") {
    }

@ -120,7 +120,7 @@ struct MMDiTModel : public DiffusionModel {
                 struct ggml_tensor* c_concat,
                 struct ggml_tensor* y,
                 struct ggml_tensor* guidance,
-                 std::vector<ggml_tensor*> ref_latents = {},
+                 std::vector<ggml_tensor*> ref_latents     = {},
                 int num_video_frames                      = -1,
                 std::vector<struct ggml_tensor*> controls = {},
                 float control_strength                    = 0.f,
@ -135,10 +135,10 @@ struct FluxModel : public DiffusionModel {
    Flux::FluxRunner flux;

    FluxModel(ggml_backend_t backend,
-              std::map<std::string, enum ggml_type>& tensor_types,
-              SDVersion version = VERSION_FLUX,
-              bool flash_attn   = false,
-              bool use_mask     = false)
+              const String2GGMLType& tensor_types = {},
+              SDVersion version                   = VERSION_FLUX,
+              bool flash_attn                     = false,
+              bool use_mask                       = false)
        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
    }

@ -173,7 +173,7 @@ struct FluxModel : public DiffusionModel {
                 struct ggml_tensor* c_concat,
                 struct ggml_tensor* y,
                 struct ggml_tensor* guidance,
-                 std::vector<ggml_tensor*> ref_latents = {},
+                 std::vector<ggml_tensor*> ref_latents     = {},
                 int num_video_frames                      = -1,
                 std::vector<struct ggml_tensor*> controls = {},
                 float control_strength                    = 0.f,
@ -184,4 +184,4 @@ struct FluxModel : public DiffusionModel {
    }
 };

-#endif
+#endif
--- a/otherarch/sdcpp/esrgan.hpp
+++ b/otherarch/sdcpp/esrgan.hpp
@ -130,8 +130,8 @@ public:
        body_feat = conv_body->forward(ctx, body_feat);
        feat      = ggml_add(ctx, feat, body_feat);
        // upsample
-        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST)));
-        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST)));
+        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
        auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
        return out;
    }
@ -142,11 +142,22 @@ struct ESRGAN : public GGMLRunner {
    int scale     = 4;
    int tile_size = 128;  // avoid cuda OOM for 4gb VRAM

-    ESRGAN(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+    ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
        : GGMLRunner(backend) {
        rrdb_net.init(params_ctx, tensor_types, "");
    }

+    void enable_conv2d_direct() {  // switch every Conv2d block found under rrdb_net to the direct conv path
+        std::vector<GGMLBlock*> blocks;
+        rrdb_net.get_all_blocks(blocks);  // gathers rrdb_net itself plus its nested child blocks
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {  // block kind is detected via its description string
+                auto conv_block = (Conv2d*)block;  // NOTE(review): assumes only Conv2d reports "Conv2d" - confirm
+                conv_block->enable_direct();
+            }
+        }
+    }
+
    std::string get_desc() {
        return "esrgan";
    }
--- a/otherarch/sdcpp/flux.hpp
+++ b/otherarch/sdcpp/flux.hpp
@ -35,8 +35,8 @@ namespace Flux {
        int64_t hidden_size;
        float eps;

-        void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-            ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "scale") != tensor_types.end()) ? tensor_types[prefix + "scale"] : GGML_TYPE_F32;
+        void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+            ggml_type wtype = GGML_TYPE_F32;
            params["scale"] = ggml_new_tensor_1d(ctx, wtype, hidden_size);
        }

@ -512,7 +512,8 @@ namespace Flux {
        LastLayer(int64_t hidden_size,
                  int64_t patch_size,
                  int64_t out_channels,
-                  bool prune_mod = false) : prune_mod(prune_mod) {
+                  bool prune_mod = false)
+            : prune_mod(prune_mod) {
            blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
            blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
            if (!prune_mod) {
@ -723,7 +724,7 @@ namespace Flux {
            auto txt_ids = gen_txt_ids(bs, context_len);
            auto img_ids = gen_img_ids(h, w, patch_size, bs);

-            auto ids = concat_ids(txt_ids, img_ids, bs);
+            auto ids               = concat_ids(txt_ids, img_ids, bs);
            uint64_t curr_h_offset = 0;
            uint64_t curr_w_offset = 0;
            for (ggml_tensor* ref : ref_latents) {
@ -736,7 +737,7 @@ namespace Flux {
                }

                auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset);
-                ids = concat_ids(ids, ref_ids, bs);
+                ids          = concat_ids(ids, ref_ids, bs);

                curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
                curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
@ -744,7 +745,6 @@ namespace Flux {
            return ids;
        }

-
        // Generate positional embeddings
        std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents, int theta, const std::vector<int>& axes_dim) {
            std::vector<std::vector<float>> ids       = gen_ids(h, w, patch_size, bs, context_len, ref_latents);
@ -872,8 +872,8 @@ namespace Flux {
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
                                         struct ggml_tensor* pe,
-                                         struct ggml_tensor* mod_index_arange   = NULL,
-                                         std::vector<int> skip_layers = {}) {
+                                         struct ggml_tensor* mod_index_arange = NULL,
+                                         std::vector<int> skip_layers         = {}) {
            auto img_in      = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
            auto txt_in      = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
            auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
@ -962,7 +962,6 @@ namespace Flux {

        struct ggml_tensor* process_img(struct ggml_context* ctx,
                                        struct ggml_tensor* x) {
-
            int64_t W          = x->ne[0];
            int64_t H          = x->ne[1];
            int64_t patch_size = 2;
@ -983,9 +982,9 @@ namespace Flux {
                                    struct ggml_tensor* y,
                                    struct ggml_tensor* guidance,
                                    struct ggml_tensor* pe,
-                                    struct ggml_tensor* mod_index_arange   = NULL,
+                                    struct ggml_tensor* mod_index_arange  = NULL,
                                    std::vector<ggml_tensor*> ref_latents = {},
-                                    std::vector<int> skip_layers = {}) {
+                                    std::vector<int> skip_layers          = {}) {
            // Forward pass of DiT.
            // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
            // timestep: (N,) tensor of diffusion timesteps
@ -1005,7 +1004,7 @@ namespace Flux {
            int pad_h          = (patch_size - H % patch_size) % patch_size;
            int pad_w          = (patch_size - W % patch_size) % patch_size;

-            auto img = process_img(ctx, x);
+            auto img            = process_img(ctx, x);
            uint64_t img_tokens = img->ne[1];

            if (c_concat != NULL) {
@ -1013,7 +1012,7 @@ namespace Flux {
                ggml_tensor* mask   = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);

                masked = process_img(ctx, masked);
-                mask = process_img(ctx, mask);
+                mask   = process_img(ctx, mask);

                img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
            }
@ -1027,9 +1026,9 @@ namespace Flux {

            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers);  // [N, num_tokens, C * patch_size * patch_size]
            if (out->ne[1] > img_tokens) {
-                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
+                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));  // [num_tokens, N, C * patch_size * patch_size]
                out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
-                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
+                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));  // [N, h*w, C * patch_size * patch_size]
            }

            // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
@ -1040,8 +1039,6 @@ namespace Flux {
    };

    struct FluxRunner : public GGMLRunner {
-        static std::map<std::string, enum ggml_type> empty_tensor_types;
-
    public:
        FluxParams flux_params;
        Flux flux;
@ -1051,11 +1048,11 @@ namespace Flux {
        bool use_mask = false;

        FluxRunner(ggml_backend_t backend,
-                   std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
-                   const std::string prefix                            = "",
-                   SDVersion version                                   = VERSION_FLUX,
-                   bool flash_attn                                     = false,
-                   bool use_mask                                       = false)
+                   const String2GGMLType& tensor_types = {},
+                   const std::string prefix            = "",
+                   SDVersion version                   = VERSION_FLUX,
+                   bool flash_attn                     = false,
+                   bool use_mask                       = false)
            : GGMLRunner(backend), use_mask(use_mask) {
            flux_params.flash_attn          = flash_attn;
            flux_params.guidance_embed      = false;
@ -1120,7 +1117,7 @@ namespace Flux {
                                        struct ggml_tensor* y,
                                        struct ggml_tensor* guidance,
                                        std::vector<ggml_tensor*> ref_latents = {},
-                                        std::vector<int> skip_layers = {}) {
+                                        std::vector<int> skip_layers          = {}) {
            GGML_ASSERT(x->ne[3] == 1);
            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);

@ -1139,8 +1136,8 @@ namespace Flux {
                }

                // ggml_arange is not working on some backends, precompute it
-                mod_index_arange_vec  = arange(0, 344);
-                mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
+                mod_index_arange_vec = arange(0, 344);
+                mod_index_arange     = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
                set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
            }
            y = to_backend(y);
@ -1187,9 +1184,9 @@ namespace Flux {
                     struct ggml_tensor* y,
                     struct ggml_tensor* guidance,
                     std::vector<ggml_tensor*> ref_latents = {},
-                     struct ggml_tensor** output     = NULL,
-                     struct ggml_context* output_ctx = NULL,
-                     std::vector<int> skip_layers    = std::vector<int>()) {
+                     struct ggml_tensor** output           = NULL,
+                     struct ggml_context* output_ctx       = NULL,
+                     std::vector<int> skip_layers          = std::vector<int>()) {
            // x: [N, in_channels, h, w]
            // timesteps: [N, ]
            // context: [N, max_position, hidden_size]
@ -1277,4 +1274,4 @@ namespace Flux {

 }  // namespace Flux

-#endif  // __FLUX_HPP__
+#endif  // __FLUX_HPP__
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@ -40,6 +40,10 @@
 #include "ggml-vulkan.h"
 #endif

+#ifdef SD_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
 #ifdef SD_USE_SYCL
 #include "ggml-sycl.h"
 #endif
@ -53,6 +57,8 @@
 #define __STATIC_INLINE__ static inline
 #endif

+static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");
+
 // n-mode trensor-matrix product
 // example: 2-mode product
 // A: [ne03, k, ne01, ne00]
@ -109,13 +115,13 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct
 // [ne03,ne02,ne01,ne00] x [ne13,ne12,ne11,ne10] => [ne03*ne13,ne02*ne12,ne01*ne11,ne00*ne10]
 __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b) {
    return ggml_mul(ctx,
-                    ggml_upscale_ext(ctx,
+                    ggml_interpolate(ctx,
                                     a,
                                     a->ne[0] * b->ne[0],
                                     a->ne[1] * b->ne[1],
                                     a->ne[2] * b->ne[2],
                                     a->ne[3] * b->ne[3],
-                                     ggml_scale_mode::GGML_SCALE_MODE_NEAREST),
+                                     GGML_SCALE_MODE_NEAREST),
                    b);
 }

@ -811,6 +817,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
    return x;
 }

+__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,  // conv via ggml_conv_2d_direct plus optional bias add
+                                                             struct ggml_tensor* x,  // input; reassigned to the conv result and returned
+                                                             struct ggml_tensor* w,  // kernel
+                                                             struct ggml_tensor* b,  // bias, may be NULL
+                                                             int s0 = 1,  // s*, p*, d* are forwarded unchanged to ggml_conv_2d_direct
+                                                             int s1 = 1,
+                                                             int p0 = 0,
+                                                             int p1 = 0,
+                                                             int d0 = 1,
+                                                             int d1 = 1) {
+    x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);  // note the argument order: kernel first, then input
+    if (b != NULL) {
+        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);  // view the bias as ne = [1, 1, b->ne[0], 1]
+        // b = ggml_repeat(ctx, b, x);
+        x = ggml_add(ctx, x, b);  // added without the explicit repeat that is disabled above
+    }
+    return x;
+}
+
 // w: [OC，IC, KD, 1 * 1]
 // x: [N, IC, IH, IW]
 // b: [OC,]
@ -945,18 +970,33 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*

    float scale = (1.0f / sqrt((float)d_head));

+    int kv_pad = 0;
    // if (flash_attn) {
    //     LOG_DEBUG("attention_ext L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
    // }
-    //  is there anything oddly shaped?? ping Green-Sky if you can trip this assert
+    //   is there anything oddly shaped?? ping Green-Sky if you can trip this assert
    GGML_ASSERT(((L_k % 256 == 0) && L_q == L_k) || !(L_k % 256 == 0));

    bool can_use_flash_attn = true;
+    can_use_flash_attn      = can_use_flash_attn && (d_head == 64 ||
+                                                d_head == 80 ||
+                                                d_head == 96 ||
+                                                d_head == 112 ||
+                                                d_head == 128 ||
+                                                d_head == 256);
+// kcpp disable kv_pad (leejet/stable-diffusion.cpp#756)
+#if 1
    can_use_flash_attn      = can_use_flash_attn && L_k % 256 == 0;
-    can_use_flash_attn      = can_use_flash_attn && d_head % 64 == 0;  // double check
-
-    // cuda max d_head seems to be 256, cpu does seem to work with 512
-    can_use_flash_attn = can_use_flash_attn && d_head <= 256;  // double check
+#else
+    if (can_use_flash_attn && L_k % 256 != 0) {
+        // TODO(Green-Sky): might be worth just padding by default
+        if (L_k == 77 || L_k == 4208 || L_k == 3952) {
+            kv_pad = GGML_PAD(L_k, 256) - L_k;
+        } else {
+            can_use_flash_attn = false;
+        }
+    }
+#endif

    if (mask != nullptr) {
        // TODO(Green-Sky): figure out if we can bend t5 to work too
@ -969,11 +1009,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
    ggml_tensor* kqv = nullptr;
    // GGML_ASSERT((flash_attn && can_use_flash_attn) || !flash_attn);
    if (can_use_flash_attn && flash_attn) {
-        // LOG_DEBUG("using flash attention");
+        // LOG_DEBUG(" uses flash attention");
+        if (kv_pad != 0) {
+            // LOG_DEBUG(" padding k and v dim1 by %d", kv_pad);
+            k = ggml_pad(ctx, k, 0, kv_pad, 0, 0);
+        }
        k = ggml_cast(ctx, k, GGML_TYPE_F16);

        v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3));  // [N, n_head, L_k, d_head]
        v = ggml_reshape_3d(ctx, v, d_head, L_k, n_head * N);  // [N * n_head, L_k, d_head]
+        if (kv_pad != 0) {
+            v = ggml_pad(ctx, v, 0, kv_pad, 0, 0);
+        }
        v = ggml_cast(ctx, v, GGML_TYPE_F16);

        if (mask != nullptr) {
@ -1181,6 +1228,8 @@ __STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) {
 #define MAX_PARAMS_TENSOR_NUM 32768
 #define MAX_GRAPH_SIZE 32768

+typedef std::map<std::string, enum ggml_type> String2GGMLType;  // tensor name -> ggml type; looked up by the blocks' init_params
+
 struct GGMLRunner {
 protected:
    typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;
@ -1365,13 +1414,7 @@ public:
            ggml_backend_cpu_set_n_threads(backend, n_threads);
        }

-// #ifdef SD_USE_METAL
-//         if (ggml_backend_is_metal(backend)) {
-//             ggml_backend_metal_set_n_cb(backend, n_threads);
-//         }
-// #endif
        ggml_backend_graph_compute(backend, gf);
-
 #ifdef GGML_PERF
        ggml_graph_print(gf);
 #endif
@ -1398,17 +1441,25 @@ protected:
    GGMLBlockMap blocks;
    ParameterMap params;

-    void init_blocks(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+    ggml_type get_type(const std::string& name, const String2GGMLType& tensor_types, ggml_type default_type) {  // type registered for `name`, else default_type
+        auto iter = tensor_types.find(name);  // find() instead of operator[] so the const map is never modified
+        if (iter != tensor_types.end()) {
+            return iter->second;
+        }
+        return default_type;  // no entry for this tensor name
+    }
+
+    void init_blocks(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {  // init each child block with prefix + its map key
        for (auto& pair : blocks) {
            auto& block = pair.second;
            block->init(ctx, tensor_types, prefix + pair.first);
        }
    }

-    virtual void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {}
+    virtual void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {}

 public:
-    void init(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
+    void init(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
        if (prefix.size() > 0) {
            prefix = prefix + ".";
        }
@ -1455,6 +1506,19 @@ public:
            tensors[prefix + pair.first] = pair.second;
        }
    }
+
+    virtual std::string get_desc() {  // description string naming the block kind; virtual so subclasses can report their own
+        return "GGMLBlock";
+    }
+
+    void get_all_blocks(std::vector<GGMLBlock*>& result) {  // appends this block and, recursively, all descendants to result
+        result.push_back(this);  // a parent is recorded before any of its children
+        for (auto& block_iter : blocks) {
+            if (block_iter.second) {  // skip null child entries
+                block_iter.second->get_all_blocks(result);
+            }
+        }
+    }
 };

 class UnaryBlock : public GGMLBlock {
@ -1469,8 +1533,8 @@ protected:
    bool bias;
    bool force_f32;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = (tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
            wtype = GGML_TYPE_F32;
        }
@ -1505,8 +1569,8 @@ class Embedding : public UnaryBlock {
 protected:
    int64_t embedding_dim;
    int64_t num_embeddings;
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = (tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
        params["weight"]     = ggml_new_tensor_2d(ctx, wtype, embedding_dim, num_embeddings);
    }

@ -1544,12 +1608,13 @@ protected:
    std::pair<int, int> padding;
    std::pair<int, int> dilation;
    bool bias;
+    bool direct = false;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F16;  //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels);
        if (bias) {
-            enum ggml_type wtype = GGML_TYPE_F32;  // (tensor_types.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32;
+            enum ggml_type wtype = GGML_TYPE_F32;
            params["bias"]       = ggml_new_tensor_1d(ctx, wtype, out_channels);
        }
    }
@ -1570,13 +1635,25 @@ public:
          dilation(dilation),
          bias(bias) {}

+    void enable_direct() {  // make forward() call ggml_nn_conv_2d_direct instead of ggml_nn_conv_2d
+        direct = true;
+    }
+
+    std::string get_desc() {  // this exact string is what ESRGAN::enable_conv2d_direct matches against
+        return "Conv2d";
+    }
+
    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = NULL;
        if (bias) {
            b = params["bias"];
        }
-        return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        if (direct) {  // flag set by enable_direct(); both branches receive identical arguments
+            return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        } else {  // default: the pre-existing ggml_nn_conv_2d path
+            return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+        }
    }
 };

@ -1590,11 +1667,11 @@ protected:
    int64_t dilation;
    bool bias;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F16;                                                              //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx, wtype, 1, kernel_size, in_channels, out_channels);  // 5d => 4d
        if (bias) {
-            enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32;
+            enum ggml_type wtype = GGML_TYPE_F32;
            params["bias"]       = ggml_new_tensor_1d(ctx, wtype, out_channels);
        }
    }
@ -1634,12 +1711,12 @@ protected:
    bool elementwise_affine;
    bool bias;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
        if (elementwise_affine) {
-            enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.ypes.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+            enum ggml_type wtype = GGML_TYPE_F32;
            params["weight"]     = ggml_new_tensor_1d(ctx, wtype, normalized_shape);
            if (bias) {
-                enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.ypes.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32;
+                enum ggml_type wtype = GGML_TYPE_F32;
                params["bias"]       = ggml_new_tensor_1d(ctx, wtype, normalized_shape);
            }
        }
@ -1676,10 +1753,10 @@ protected:
    float eps;
    bool affine;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
        if (affine) {
-            enum ggml_type wtype      = GGML_TYPE_F32;  //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
-            enum ggml_type bias_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32;
+            enum ggml_type wtype      = GGML_TYPE_F32;
+            enum ggml_type bias_wtype = GGML_TYPE_F32;
            params["weight"]          = ggml_new_tensor_1d(ctx, wtype, num_channels);
            params["bias"]            = ggml_new_tensor_1d(ctx, bias_wtype, num_channels);
        }
@ -1760,4 +1837,4 @@ public:
    }
 };

-#endif  // __GGML_EXTEND__HPP__
+#endif  // __GGML_EXTEND__HPP__
--- a/otherarch/sdcpp/gits_noise.inl
+++ b/otherarch/sdcpp/gits_noise.inl
@ -346,4 +346,4 @@ const std::vector<const std::vector<std::vector<float>>*> GITS_NOISE = {
    &GITS_NOISE_1_50
 };

-#endif // GITS_NOISE_INL
+#endif // GITS_NOISE_INL
--- a/otherarch/sdcpp/lora.hpp
+++ b/otherarch/sdcpp/lora.hpp
@ -3,7 +3,7 @@

 #include "ggml_extend.hpp"

-#define LORA_GRAPH_SIZE 20480
+#define LORA_GRAPH_BASE_SIZE 10240

 struct LoraModel : public GGMLRunner {
    enum lora_t {
@ -238,7 +238,8 @@ struct LoraModel : public GGMLRunner {
    }

    struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);
+        size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10;
+        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false);

        zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
        set_backend_tensor_data(zero_index, zero_index_vec.data());
@ -291,7 +292,6 @@ struct LoraModel : public GGMLRunner {
                    std::string hada_2_down_name = "";
                    std::string hada_2_up_name   = "";

-
                    hada_1_down_name = fk + ".hada_w1_b";
                    hada_1_up_name   = fk + ".hada_w1_a";
                    hada_1_mid_name  = fk + ".hada_t1";
@ -843,4 +843,4 @@ struct LoraModel : public GGMLRunner {
    }
 };

-#endif  // __LORA_HPP__
+#endif  // __LORA_HPP__
--- a/otherarch/sdcpp/main.cpp
+++ b/otherarch/sdcpp/main.cpp
--- a/otherarch/sdcpp/mmdit.hpp
+++ b/otherarch/sdcpp/mmdit.hpp
@ -147,8 +147,8 @@ protected:
    int64_t hidden_size;
    float eps;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F32;
        params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
    }

@ -652,13 +652,13 @@ protected:
    int64_t hidden_size;
    std::string qk_norm;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "pos_embed") != tensor_types.end()) ? tensor_types[prefix + "pos_embed"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F32;  // "pos_embed" is always created as F32; tensor_types and prefix are not consulted here
        params["pos_embed"]  = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
    }

 public:
-    MMDiT(std::map<std::string, enum ggml_type>& tensor_types) {
+    MMDiT(const String2GGMLType& tensor_types = {}) {
        // input_size is always None
        // learn_sigma is always False
        // register_length is alwalys 0
@ -869,11 +869,9 @@ public:
 struct MMDiTRunner : public GGMLRunner {
    MMDiT mmdit;

-    static std::map<std::string, enum ggml_type> empty_tensor_types;
-
    MMDiTRunner(ggml_backend_t backend,
-                std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
-                const std::string prefix                            = "")
+                const String2GGMLType& tensor_types = {},
+                const std::string prefix            = "")
        : GGMLRunner(backend), mmdit(tensor_types) {
        mmdit.init(params_ctx, tensor_types, prefix);
    }
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
@ -16,7 +16,6 @@
 #include "ggml-backend.h"
 #include "ggml-cpu.h"
 #include "ggml.h"
-#include "gguf.h"

 #include "stable-diffusion.h"

@ -28,6 +27,10 @@
 #include "ggml-vulkan.h"
 #endif

+#ifdef SD_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
 #define ST_HEADER_SIZE_LEN 8

 static std::string format(const char* fmt, ...) {
@ -111,6 +114,7 @@ const char* unused_tensors[] = {
    "model_ema.diffusion_model",
    "embedding_manager",
    "denoiser.sigmas",
+    "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
 };

 bool is_unused_tensor(std::string name) {
@ -192,7 +196,7 @@ std::unordered_map<std::string, std::string> pmid_v2_name_map = {
 std::string convert_open_clip_to_hf_clip(const std::string& name) {
    std::string new_name = name;
    std::string prefix;
-     if (contains(new_name, ".enc.")) {
+    if (contains(new_name, ".enc.")) {
        // llama.cpp naming convention for T5
        size_t pos = new_name.find(".enc.");
        if (pos != std::string::npos) {
@ -348,6 +352,10 @@ std::unordered_map<std::string, std::unordered_map<std::string, std::string>> su
            {"to_v", "v"},
            {"to_out_0", "proj_out"},
            {"group_norm", "norm"},
+            {"key", "k"},
+            {"query", "q"},
+            {"value", "v"},
+            {"proj_attn", "proj_out"},
        },
    },
    {
@ -372,6 +380,10 @@ std::unordered_map<std::string, std::unordered_map<std::string, std::string>> su
            {"to_v", "v"},
            {"to_out.0", "proj_out"},
            {"group_norm", "norm"},
+            {"key", "k"},
+            {"query", "q"},
+            {"value", "v"},
+            {"proj_attn", "proj_out"},
        },
    },
    {
@ -443,6 +455,10 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
        return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
    }

+    if (match(m, std::regex(format("unet%cadd_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) {
+        return format("model%cdiffusion_model%clabel_emb%c0%c", seq, seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
+    }
+
    if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
        std::string suffix = get_converted_suffix(m[1], m[3]);
        // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str());
@ -480,6 +496,19 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
        return format("cond_stage_model%ctransformer%ctext_model", seq, seq) + m[0];
    }

+    // clip-g
+    if (match(m, std::regex(format("te%c1%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
+        return format("cond_stage_model%c1%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq, seq) + m[0] + seq + m[1];
+    }
+
+    if (match(m, std::regex(format("te%c1%ctext_model(.*)", seq, seq)), key)) {
+        return format("cond_stage_model%c1%ctransformer%ctext_model", seq, seq, seq) + m[0];
+    }
+
+    if (match(m, std::regex(format("te%c1%ctext_projection", seq, seq)), key)) {
+        return format("cond_stage_model%c1%ctransformer%ctext_model%ctext_projection", seq, seq, seq, seq);
+    }
+
    // vae
    if (match(m, std::regex(format("vae%c(.*)%cconv_norm_out(.*)", seq, seq)), key)) {
        return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str());
@ -616,6 +645,8 @@ std::string convert_tensor_name(std::string name) {
            std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.');
            if (new_key.empty()) {
                new_name = name;
+            } else if (new_key == "cond_stage_model.1.transformer.text_model.text_projection") {
+                new_name = new_key;
            } else {
                new_name = new_key + "." + network_part;
            }
@ -631,7 +662,7 @@ std::string convert_tensor_name(std::string name) {
    return new_name;
 }

-void add_preprocess_tensor_storage_types(std::map<std::string, enum ggml_type>& tensor_storages_types, std::string name, enum ggml_type type) {
+void add_preprocess_tensor_storage_types(String2GGMLType& tensor_storages_types, std::string name, enum ggml_type type) {
    std::string new_name = convert_tensor_name(name);

    if (new_name.find("cond_stage_model") != std::string::npos && ends_with(new_name, "attn.in_proj_weight")) {
@ -798,6 +829,7 @@ void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
        dst[i] = f8_e4m3_to_f16(src[i]);
    }
 }
+
 void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
    // support inplace op
    for (int64_t i = n - 1; i >= 0; i--) {
@ -805,6 +837,20 @@ void f8_e5m2_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
    }
 }

+void f64_to_f32_vec(double* src, float* dst, int64_t n) {
+    // in-place safe (dst may alias src): walking forward, the narrower float written at dst[i] lands at or before src[i], so no unread double is overwritten
+    for (int64_t i = 0; i < n; i++) {
+        dst[i] = (float)src[i];
+    }
+}
+
+void i64_to_i32_vec(int64_t* src, int32_t* dst, int64_t n) {
+    // in-place safe (dst may alias src): walking forward, the narrower write at dst[i] never reaches an unread src element; values outside the int32_t range are not preserved by the cast
+    for (int64_t i = 0; i < n; i++) {
+        dst[i] = (int32_t)src[i];
+    }
+}
+
 void convert_tensor(void* src,
                    ggml_type src_type,
                    void* dst,
@ -1050,10 +1096,14 @@ ggml_type str_to_ggml_type(const std::string& dtype) {
        ttype = GGML_TYPE_F32;
    } else if (dtype == "F32") {
        ttype = GGML_TYPE_F32;
+    } else if (dtype == "F64") {
+        ttype = GGML_TYPE_F32;
    } else if (dtype == "F8_E4M3") {
        ttype = GGML_TYPE_F16;
    } else if (dtype == "F8_E5M2") {
        ttype = GGML_TYPE_F16;
+    } else if (dtype == "I64") {
+        ttype = GGML_TYPE_I32;
    }
    return ttype;
 }
@ -1071,6 +1121,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
    std::ifstream file(fpath, std::ios::binary);
    if (!file.is_open()) {
        LOG_ERROR("failed to open '%s'", file_path.c_str());
+        file_paths_.pop_back();
        return false;
    }

@ -1082,6 +1133,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
    // read header size
    if (file_size_ <= ST_HEADER_SIZE_LEN) {
        LOG_ERROR("invalid safetensor file '%s'", file_path.c_str());
+        file_paths_.pop_back();
        return false;
    }

@ -1095,6 +1147,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
    size_t header_size_ = read_u64(header_size_buf);
    if (header_size_ >= file_size_) {
        LOG_ERROR("invalid safetensor file '%s'", file_path.c_str());
+        file_paths_.pop_back();
        return false;
    }

@ -1105,6 +1158,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
    file.read(header_buf.data(), header_size_);
    if (!file) {
        LOG_ERROR("read safetensors header failed: '%s'", file_path.c_str());
+        file_paths_.pop_back();
        return false;
    }

@ -1176,6 +1230,14 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
            tensor_storage.is_f8_e5m2 = true;
            // f8 -> f16
            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+        } else if (dtype == "F64") {
+            tensor_storage.is_f64 = true;
+            // f64 -> f32
+            GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
+        } else if (dtype == "I64") {
+            tensor_storage.is_i64 = true;
+            // i64 -> i32
+            GGML_ASSERT(tensor_storage.nbytes() * 2 == tensor_data_size);
        } else {
            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size);
        }
@ -1192,18 +1254,45 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
 /*================================================= DiffusersModelLoader ==================================================*/

 bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const std::string& prefix) {
-    std::string unet_path = path_join(file_path, "unet/diffusion_pytorch_model.safetensors");
-    std::string vae_path  = path_join(file_path, "vae/diffusion_pytorch_model.safetensors");
-    std::string clip_path = path_join(file_path, "text_encoder/model.safetensors");
+    std::string unet_path   = path_join(file_path, "unet/diffusion_pytorch_model.safetensors");
+    std::string vae_path    = path_join(file_path, "vae/diffusion_pytorch_model.safetensors");
+    std::string clip_path   = path_join(file_path, "text_encoder/model.safetensors");
+    std::string clip_g_path = path_join(file_path, "text_encoder_2/model.safetensors");  // second text encoder, loaded under the "te.1." prefix below

    if (!init_from_safetensors_file(unet_path, "unet.")) {
        return false;
    }
+    for (const auto& ts : tensor_storages) {  // bind by reference: only the name is inspected, so copying each TensorStorage is wasted work
+        if (ts.name.find("add_embedding") != std::string::npos || ts.name.find("label_emb") != std::string::npos) {
+            // probably SDXL
+            LOG_DEBUG("Fixing name for SDXL output blocks.2.2");
+            for (auto& tensor_storage : tensor_storages) {
+                int len  = 34;  // length of "unet.up_blocks.0.upsamplers.0.conv"
+                auto pos = tensor_storage.name.find("unet.up_blocks.0.upsamplers.0.conv");
+                if (pos == std::string::npos) {
+                    len = 44;  // length of "model.diffusion_model.output_blocks.2.1.conv"
+                    pos = tensor_storage.name.find("model.diffusion_model.output_blocks.2.1.conv");
+                }
+                if (pos != std::string::npos) {
+                    tensor_storage.name = "model.diffusion_model.output_blocks.2.2.conv" + tensor_storage.name.substr(pos + len);  // keep the suffix after the match, wherever the match starts
+                    LOG_DEBUG("NEW NAME: %s", tensor_storage.name.c_str());
+                    add_preprocess_tensor_storage_types(tensor_storages_types, tensor_storage.name, tensor_storage.type);
+                }
+            }
+            break;  // one rename pass over all tensors is enough; ts is not used after this point
+        }
+    }
+
    if (!init_from_safetensors_file(vae_path, "vae.")) {
-        return false;
+        LOG_WARN("Couldn't find working VAE in %s", file_path.c_str());
+        // best effort: a missing or unreadable VAE is no longer fatal (was: return false)
    }
    if (!init_from_safetensors_file(clip_path, "te.")) {
-        return false;
+        LOG_WARN("Couldn't find working text encoder in %s", file_path.c_str());
+        // best effort: a missing or unreadable text encoder is no longer fatal (was: return false)
+    }
+    if (!init_from_safetensors_file(clip_g_path, "te.1.")) {
+        LOG_DEBUG("Couldn't find working second text encoder in %s", file_path.c_str());
    }
    return true;
 }
@ -1566,6 +1655,15 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
    return true;
 }

+bool ModelLoader::model_is_unet() {  // true if any loaded tensor name contains the "model.diffusion_model.input_blocks." prefix
+    for (auto& tensor_storage : tensor_storages) {
+        if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) {  // NOTE(review): get_sd_version also accepts "unet.down_blocks."; this check does not — confirm that is intended
+            return true;
+        }
+    }
+    return false;
+}
+
 bool ModelLoader::has_diffusion_model_tensors()
 {
    for (auto& tensor_storage : tensor_storages) {
@ -1598,7 +1696,7 @@ SDVersion ModelLoader::get_sd_version() {
            if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) {
                return VERSION_SD3;
            }
-            if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) {
+            if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos || tensor_storage.name.find("unet.down_blocks.") != std::string::npos) {
                is_unet = true;
                if (has_multiple_encoders) {
                    is_xl = true;
@ -1607,7 +1705,7 @@ SDVersion ModelLoader::get_sd_version() {
                    }
                }
            }
-            if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
+            if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos || tensor_storage.name.find("te.1") != std::string::npos) {
                has_multiple_encoders = true;
                if (is_unet) {
                    is_xl = true;
@ -1629,7 +1727,7 @@ SDVersion ModelLoader::get_sd_version() {
            token_embedding_weight = tensor_storage;
            // break;
        }
-        if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight") {
+        if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight" || tensor_storage.name == "unet.conv_in.weight") {
            input_block_weight  = tensor_storage;
            input_block_checked = true;
            if (found_family) {
@ -1638,10 +1736,14 @@ SDVersion ModelLoader::get_sd_version() {
        }
    }
    bool is_inpaint = input_block_weight.ne[2] == 9;
+    bool is_ip2p    = input_block_weight.ne[2] == 8;
    if (is_xl) {
        if (is_inpaint) {
            return VERSION_SDXL_INPAINT;
        }
+        if (is_ip2p) {
+            return VERSION_SDXL_PIX2PIX;
+        }
        return VERSION_SDXL;
    }

@ -1657,6 +1759,9 @@ SDVersion ModelLoader::get_sd_version() {
        if (is_inpaint) {
            return VERSION_SD1_INPAINT;
        }
+        if (is_ip2p) {
+            return VERSION_SD1_PIX2PIX;
+        }
        return VERSION_SD1;
    } else if (token_embedding_weight.ne[0] == 1024) {
        if (is_inpaint) {
@ -1714,7 +1819,7 @@ ggml_type ModelLoader::get_diffusion_model_wtype() {
            continue;
        }

-        if (tensor_storage.name.find("model.diffusion_model.") == std::string::npos) {
+        if (tensor_storage.name.find("model.diffusion_model.") == std::string::npos && tensor_storage.name.find("unet.") == std::string::npos) {
            continue;
        }

@ -1883,6 +1988,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
        };
        int tensor_count = 0;
        int64_t t1       = ggml_time_ms();
+        bool partial     = false;
        for (auto& tensor_storage : processed_tensor_storages) {
            if (tensor_storage.file_index != file_index) {
                ++tensor_count;
@ -1907,7 +2013,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                // for the CPU and Metal backend, we can copy directly into the tensor
                if (tensor_storage.type == dst_tensor->type) {
                    GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
-                    read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
+                    if (tensor_storage.is_f64 || tensor_storage.is_i64) {
+                        read_buffer.resize(tensor_storage.nbytes_to_read());
+                        read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
+                    } else {
+                        read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
+                    }

                    if (tensor_storage.is_bf16) {
                        // inplace op
@ -1918,9 +2029,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                    } else if (tensor_storage.is_f8_e5m2) {
                        // inplace op
                        f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
+                    } else if (tensor_storage.is_f64) {
+                        f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements());
+                    } else if (tensor_storage.is_i64) {
+                        i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements());
                    }
                } else {
-                    read_buffer.resize(tensor_storage.nbytes());
+                    read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
                    read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);

                    if (tensor_storage.is_bf16) {
@ -1932,13 +2047,19 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                    } else if (tensor_storage.is_f8_e5m2) {
                        // inplace op
                        f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                    } else if (tensor_storage.is_f64) {
+                        // inplace op
+                        f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                    } else if (tensor_storage.is_i64) {
+                        // inplace op
+                        i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
                    }

                    convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
                                   dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]);
                }
            } else {
-                read_buffer.resize(tensor_storage.nbytes());
+                read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read()));
                read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);

                if (tensor_storage.is_bf16) {
@ -1950,6 +2071,12 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                } else if (tensor_storage.is_f8_e5m2) {
                    // inplace op
                    f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
+                } else if (tensor_storage.is_f64) {
+                    // inplace op
+                    f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                } else if (tensor_storage.is_i64) {
+                    // inplace op
+                    i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements());
                }

                if (tensor_storage.type == dst_tensor->type) {
@ -1964,20 +2091,26 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                    ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
                }
            }
-            int64_t t2 = ggml_time_ms();
+            size_t tensor_max = processed_tensor_storages.size();
+            int64_t t2        = ggml_time_ms();
+            // kcpp throttle progress printing
            ++tensor_count;
-            if(tensor_count<2 || tensor_count%5==0 || (tensor_count+10) > processed_tensor_storages.size())
+            if(tensor_count<2 || tensor_count%5==0 || (tensor_count+10) > tensor_max)
            {
-                //throttle progress printing
-                pretty_progress(tensor_count, processed_tensor_storages.size(), (t2 - t1) / 1000.0f);
+                pretty_progress(tensor_count, tensor_max, (t2 - t1) / 1000.0f);
            }
-            t1 = t2;
+            t1      = t2;
+            partial = tensor_count != tensor_max;
        }

        if (zip != NULL) {
            zip_close(zip);
        }

+        if (partial) {
+            printf("\n");
+        }
+
        if (!success) {
            break;
        }
@ -2055,6 +2188,41 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
    return true;
 }

+std::vector<std::pair<std::string, ggml_type>> parse_tensor_type_rules(const std::string& tensor_type_rules) {  // parses "pattern=type,pattern=type,..." into (pattern, ggml_type) pairs
+    std::vector<std::pair<std::string, ggml_type>> result;
+    for (const auto& item : splitString(tensor_type_rules, ',')) {
+        if (item.size() == 0)
+            continue;
+        std::string::size_type pos = item.find('=');  // split at the first '=': the pattern is everything before it
+        if (pos == std::string::npos) {
+            LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str());
+            continue;
+        }
+        std::string tensor_pattern = item.substr(0, pos);
+        std::string type_name      = item.substr(pos + 1);
+
+        ggml_type tensor_type = GGML_TYPE_COUNT;  // GGML_TYPE_COUNT doubles as the "no type matched" sentinel
+
+        if (type_name == "f32") {  // NOTE(review): special-cased, presumably because the trait scan below requires to_float — confirm
+            tensor_type = GGML_TYPE_F32;
+        } else {
+            for (size_t i = 0; i < SD_TYPE_COUNT; i++) {
+                auto trait = ggml_get_type_traits((ggml_type)i);
+                if (trait->to_float && trait->type_size && type_name == trait->type_name) {  // only types with a to_float converter and a non-zero type_size are accepted
+                    tensor_type = (ggml_type)i;  // no break: the scan continues, so the last matching type wins
+                }
+            }
+        }
+
+        if (tensor_type != GGML_TYPE_COUNT) {
+            result.emplace_back(tensor_pattern, tensor_type);
+        } else {
+            LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str());  // unknown type name: the whole item is dropped
+        }
+    }
+    return result;
+}
+
 bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
    const std::string& name = tensor_storage.name;
    if (type != GGML_TYPE_COUNT) {
@ -2086,7 +2254,7 @@ bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage
    return false;
 }

-bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
+bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules_str) {
    auto backend    = ggml_backend_cpu_init();
    size_t mem_size = 1 * 1024 * 1024;  // for padding
    mem_size += tensor_storages.size() * ggml_tensor_overhead();
@ -2096,12 +2264,23 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type

    gguf_context* gguf_ctx = gguf_init_empty();

+    auto tensor_type_rules = parse_tensor_type_rules(tensor_type_rules_str);
+
    auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
        const std::string& name = tensor_storage.name;
+        ggml_type tensor_type   = tensor_storage.type;
+        ggml_type dst_type      = type;

-        ggml_type tensor_type = tensor_storage.type;
-        if (tensor_should_be_converted(tensor_storage, type)) {
-            tensor_type = type;
+        for (const auto& tensor_type_rule : tensor_type_rules) {
+            std::regex pattern(tensor_type_rule.first);
+            if (std::regex_search(name, pattern)) {
+                dst_type = tensor_type_rule.second;
+                break;
+            }
+        }
+
+        if (tensor_should_be_converted(tensor_storage, dst_type)) {
+            tensor_type = dst_type;
        }

        ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
@ -2160,7 +2339,7 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
    return mem_size;
 }

-bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) {
+bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type, const char* tensor_type_rules) {
    ModelLoader model_loader;

    if (!model_loader.init_from_file(input_path)) {
@ -2174,6 +2353,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
            return false;
        }
    }
-    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type);
+    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type, tensor_type_rules);
    return success;
-}
+}
--- a/otherarch/sdcpp/model.h
+++ b/otherarch/sdcpp/model.h
@ -12,19 +12,21 @@

 #include "ggml-backend.h"
 #include "ggml.h"
+#include "gguf.h"
 #include <nlohmann/json.hpp>
 #include "zip.h"
-#include "gguf.h"

 #define SD_MAX_DIMS 5

 enum SDVersion {
    VERSION_SD1,
    VERSION_SD1_INPAINT,
+    VERSION_SD1_PIX2PIX,
    VERSION_SD2,
    VERSION_SD2_INPAINT,
    VERSION_SDXL,
    VERSION_SDXL_INPAINT,
+    VERSION_SDXL_PIX2PIX,
    VERSION_SVD,
    VERSION_SD3,
    VERSION_FLUX,
@ -47,7 +49,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
 }

 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
        return true;
    }
    return false;
@ -61,7 +63,7 @@ static inline bool sd_version_is_sd2(SDVersion version) {
 }

 static inline bool sd_version_is_sdxl(SDVersion version) {
-    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) {
+    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) {
        return true;
    }
    return false;
@ -81,6 +83,14 @@ static inline bool sd_version_is_dit(SDVersion version) {
    return false;
 }

+static inline bool sd_version_is_unet_edit(SDVersion version) {  // true only for the two *_PIX2PIX versions (SD1 and SDXL)
+    return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX;
+}
+
+static inline bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {  // inline like the other sd_version_is_* header helpers, so unused copies do not trigger warnings
+    return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version);  // true for any inpaint or *_PIX2PIX version
+}
+
 enum PMVersion {
    PM_VERSION_1,
    PM_VERSION_2,
@ -92,6 +102,8 @@ struct TensorStorage {
    bool is_bf16            = false;
    bool is_f8_e4m3         = false;
    bool is_f8_e5m2         = false;
+    bool is_f64             = false;
+    bool is_i64             = false;
    int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
    int n_dims              = 0;

@ -123,6 +135,8 @@ struct TensorStorage {
    int64_t nbytes_to_read() const {
        if (is_bf16 || is_f8_e4m3 || is_f8_e5m2) {
            return nbytes() / 2;
+        } else if (is_f64 || is_i64) {
+            return nbytes() * 2;
        } else {
            return nbytes();
        }
@ -173,6 +187,10 @@ struct TensorStorage {
            type_name = "f8_e4m3";
        } else if (is_f8_e5m2) {
            type_name = "f8_e5m2";
+        } else if (is_f64) {
+            type_name = "f64";
+        } else if (is_i64) {
+            type_name = "i64";
        }
        ss << name << " | " << type_name << " | ";
        ss << n_dims << " [";
@ -189,6 +207,8 @@ struct TensorStorage {

 typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;

+typedef std::map<std::string, enum ggml_type> String2GGMLType;
+
 class ModelLoader {
 protected:
    std::vector<std::string> file_paths_;
@ -207,10 +227,11 @@ protected:
    bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");

 public:
-    std::map<std::string, enum ggml_type> tensor_storages_types;
+    String2GGMLType tensor_storages_types;

    bool init_from_file(const std::string& file_path, const std::string& prefix = "");
    bool has_diffusion_model_tensors();
+    bool model_is_unet();
    SDVersion get_sd_version();
    ggml_type get_sd_wtype();
    ggml_type get_conditioner_wtype();
@ -222,7 +243,7 @@ public:
                      ggml_backend_t backend,
                      std::set<std::string> ignore_tensors = {});

-    bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+    bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
    bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
    ~ModelLoader() = default;
@ -231,4 +252,4 @@ public:
    static std::string load_t5_tokenizer_json();
 };

-#endif  // __MODEL_H__
+#endif  // __MODEL_H__
--- a/otherarch/sdcpp/pmid.hpp
+++ b/otherarch/sdcpp/pmid.hpp
@ -623,7 +623,12 @@ public:
    std::vector<float> zeros_right;

 public:
-    PhotoMakerIDEncoder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix, SDVersion version = VERSION_SDXL, PMVersion pm_v = PM_VERSION_1, float sty = 20.f)
+    PhotoMakerIDEncoder(ggml_backend_t backend,
+                        const String2GGMLType& tensor_types,
+                        const std::string prefix,
+                        SDVersion version = VERSION_SDXL,
+                        PMVersion pm_v    = PM_VERSION_1,
+                        float sty         = 20.f)
        : GGMLRunner(backend),
          version(version),
          pm_version(pm_v),
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@ -134,6 +134,27 @@ static bool sd_is_quiet = false;
 static std::string sdmodelfilename = "";
 static bool photomaker_enabled = false;

+static void set_sd_vae_tiling(sd_ctx_t* ctx, bool tiling)  // overwrites the vae_tiling flag stored in ctx->sd
+{
+    ctx->sd->vae_tiling = tiling;  // NOTE(review): ctx and ctx->sd are dereferenced unchecked, unlike loaded_model_is_chroma — callers must pass a loaded context
+}
+
+static int get_loaded_sd_version(sd_ctx_t* ctx)  // returns ctx->sd->version converted to int (presumably an SDVersion value — confirm)
+{
+    return ctx->sd->version;  // NOTE(review): ctx and ctx->sd are dereferenced unchecked, unlike loaded_model_is_chroma — callers must pass a loaded context
+}
+
+static bool loaded_model_is_chroma(sd_ctx_t* ctx)  // reports the is_chroma flag of a loaded Flux diffusion model; false in every other case
+{
+    if (ctx != nullptr && ctx->sd != nullptr) {  // tolerate a missing context: fall through to false
+        auto maybe_flux = std::dynamic_pointer_cast<FluxModel>(ctx->sd->diffusion_model);  // null when the diffusion model is not a FluxModel
+        if (maybe_flux != nullptr) {
+            return maybe_flux->flux.flux_params.is_chroma;
+        }
+    }
+    return false;
+}
+
 bool sdtype_load_model(const sd_load_model_inputs inputs) {
    sd_is_quiet = inputs.quiet;
    set_sd_quiet(sd_is_quiet);
@ -160,6 +181,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
    {
        taesdpath = executable_path + "taesd.embd";
        printf("With TAE SD VAE: %s\n",taesdpath.c_str());
+        if (cfg_tiled_vae_threshold < 8192) {
+            printf("  disabling VAE tiling for TAESD\n");
+            cfg_tiled_vae_threshold = 8192;
+        }
    }
    else if(vaefilename!="")
    {
@ -267,31 +292,35 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
        sd_params->control_net_cpu);
    }

-    sd_ctx = new_sd_ctx(sd_params->model_path.c_str(),
-                        sd_params->clip_l_path.c_str(),
-                        sd_params->clip_g_path.c_str(),
-                        sd_params->t5xxl_path.c_str(),
-                        sd_params->diffusion_model_path.c_str(),
-                        sd_params->vae_path.c_str(),
-                        sd_params->taesd_path.c_str(),
-                        sd_params->controlnet_path.c_str(),
-                        sd_params->lora_model_dir.c_str(),
-                        sd_params->embeddings_path.c_str(),
-                        sd_params->stacked_id_embeddings_path.c_str(),
-                        vae_decode_only,
-                        sd_params->vae_tiling,
-                        free_param,
-                        sd_params->n_threads,
-                        sd_params->wtype,
-                        sd_params->rng_type,
-                        sd_params->schedule,
-                        sd_params->clip_on_cpu,
-                        sd_params->control_net_cpu,
-                        sd_params->vae_on_cpu,
-                        sd_params->diffusion_flash_attn,
-                        sd_params->chroma_use_dit_mask,
-                        sd_params->chroma_use_t5_mask,
-                        sd_params->chroma_t5_mask_pad);
+    sd_ctx_params_t params;
+    sd_ctx_params_init(&params);
+    params.model_path = sd_params->model_path.c_str();
+    params.clip_l_path = sd_params->clip_l_path.c_str();
+    params.clip_g_path = sd_params->clip_g_path.c_str();
+    params.t5xxl_path = sd_params->t5xxl_path.c_str();
+    params.diffusion_model_path = sd_params->diffusion_model_path.c_str();
+    params.vae_path = sd_params->vae_path.c_str();
+    params.taesd_path = sd_params->taesd_path.c_str();
+    params.control_net_path = sd_params->controlnet_path.c_str();
+    params.lora_model_dir = sd_params->lora_model_dir.c_str();
+    params.embedding_dir = sd_params->embeddings_path.c_str();
+    params.stacked_id_embed_dir = sd_params->stacked_id_embeddings_path.c_str();
+    params.vae_decode_only = vae_decode_only;
+    params.vae_tiling = sd_params->vae_tiling;
+    params.free_params_immediately = free_param;
+    params.n_threads = sd_params->n_threads;
+    params.wtype = sd_params->wtype;
+    params.rng_type = sd_params->rng_type;
+    params.schedule = sd_params->schedule;
+    params.keep_clip_on_cpu = sd_params->clip_on_cpu;
+    params.keep_control_net_on_cpu = sd_params->control_net_cpu;
+    params.keep_vae_on_cpu = sd_params->vae_on_cpu;
+    params.diffusion_flash_attn = sd_params->diffusion_flash_attn;
+    params.chroma_use_dit_mask = sd_params->chroma_use_dit_mask;
+    params.chroma_use_t5_mask = sd_params->chroma_use_t5_mask;
+    params.chroma_t5_mask_pad = sd_params->chroma_t5_mask_pad;
+
+    sd_ctx = new_sd_ctx(&params);

    if (sd_ctx == NULL) {
        printf("\nError: KCPP SD Failed to create context!\nIf using Flux/SD3.5, make sure you have ALL files required (e.g. VAE, T5, Clip...) or baked in!\n");
@ -305,7 +334,6 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
    if(lorafilename!="" && inputs.lora_multiplier>0)
    {
        printf("\nApply LoRA...\n");
-       // sd_ctx->sd->set_pending_lora(lorafilename,inputs.lora_multiplier);
        sd_ctx->sd->apply_lora_from_file(lorafilename,inputs.lora_multiplier);
    }

@ -482,11 +510,29 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
    auto loadedsdver = get_loaded_sd_version(sd_ctx);
    if (loadedsdver == SDVersion::VERSION_FLUX)
    {
-        if (!sd_loaded_chroma()) {
-            sd_params->cfg_scale = 1;  //non chroma clamp cfg scale
+        if (loaded_model_is_chroma(sd_ctx)) {
+            if (sd_params->diffusion_flash_attn && sd_params->chroma_use_dit_mask) {
+                if (!sd_is_quiet && sddebugmode) {
+                    printf("Chroma: flash attention is on, disabling DiT mask\n");
+                }
+                sd_params->chroma_use_dit_mask = false;
+            }
+        }
+        else {
+            if (sd_params->cfg_scale != 1.0f) {
+                //non chroma clamp cfg scale
+                if (!sd_is_quiet && sddebugmode) {
+                    printf("Flux: clamping CFG Scale to 1\n");
+                }
+                sd_params->cfg_scale = 1.0f;
+            }
        }
        if (sampler == "euler a" || sampler == "k_euler_a" || sampler == "euler_a") {
-            sampler = "euler";  //euler a broken on flux
+            //euler a broken on flux
+            if (!sd_is_quiet && sddebugmode) {
+                printf("Flux: switching Euler A to Euler\n");
+            }
+            sampler = "euler";
        }
    }

@ -521,17 +567,6 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
    bool dotile = (sd_params->width*sd_params->height > cfg_tiled_vae_threshold*cfg_tiled_vae_threshold);
    set_sd_vae_tiling(sd_ctx,dotile); //changes vae tiling, prevents memory related crash/oom

-    if (sd_params->clip_skip <= 0) {
-        // workaround for clip_skip being "stuck" at the previous requested value
-        // 2 is the default for all recent base models (SD2, SDXL, Flux, SD3)
-        if (sd_version_is_sd1((SDVersion)loadedsdver)) {
-            sd_params->clip_skip = 1;
-        }
-        else {
-            sd_params->clip_skip = 2;
-        }
-    }
-
    //for img2img
    sd_image_t input_image = {0,0,0,nullptr};
    std::vector<sd_image_t> extraimage_references;
@ -663,25 +698,25 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
        }
    }

-    std::vector<sd_image_t> kontext_imgs;
-    if(extra_image_data.size()>0 && loadedsdver==SDVersion::VERSION_FLUX && !sd_loaded_chroma())
+    std::vector<sd_image_t> reference_imgs;
+    if(extra_image_data.size()>0 && loadedsdver==SDVersion::VERSION_FLUX && !loaded_model_is_chroma(sd_ctx))
    {
        for(int i=0;i<extra_image_data.size();++i)
        {
-            kontext_imgs.push_back(extraimage_references[i]);
+            reference_imgs.push_back(extraimage_references[i]);
        }
        if(!sd_is_quiet && sddebugmode==1)
        {
-            printf("\nFlux Kontext: Using %d reference images\n",kontext_imgs.size());
+            printf("\nFlux Kontext: Using %d reference images\n",reference_imgs.size());
        }
    }

-    std::vector<sd_image_t*> photomaker_imgs;
+    std::vector<sd_image_t> photomaker_imgs;
    if(photomaker_enabled && extra_image_data.size()>0)
    {
        for(int i=0;i<extra_image_data.size();++i)
        {
-            photomaker_imgs.push_back(&extraimage_references[i]);
+            photomaker_imgs.push_back(extraimage_references[i]);
        }
        if(!sd_is_quiet && sddebugmode==1)
        {
@ -689,6 +724,41 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
        }
    }

+    sd_img_gen_params_t params;
+    sd_img_gen_params_init (&params);
+
+    params.prompt = sd_params->prompt.c_str();
+    params.negative_prompt = sd_params->negative_prompt.c_str();
+    params.clip_skip = sd_params->clip_skip;
+    params.guidance.txt_cfg = sd_params->cfg_scale;
+    params.guidance.img_cfg = sd_params->cfg_scale;
+    params.guidance.distilled_guidance = sd_params->guidance;
+    params.eta = sd_params->eta;
+    params.width = sd_params->width;
+    params.height = sd_params->height;
+    params.sample_method = sd_params->sample_method;
+    params.sample_steps = sd_params->sample_steps;
+    params.seed = sd_params->seed;
+    params.batch_count = sd_params->batch_count;
+    params.control_cond = control_image;
+    params.control_strength = sd_params->control_strength;
+    params.style_strength = sd_params->style_ratio;
+    params.normalize_input = sd_params->normalize_input;
+    params.input_id_images_path = sd_params->input_id_images_path.c_str();
+
+    params.guidance.slg.layers = sd_params->skip_layers.data();
+    params.guidance.slg.layer_count = sd_params->skip_layers.size();
+    params.guidance.slg.layer_start = sd_params->skip_layer_start;
+    params.guidance.slg.layer_end = sd_params->skip_layer_end;
+    params.guidance.slg.scale = sd_params->slg_scale;
+
+    params.ref_images = reference_imgs.data();
+    params.ref_images_count = reference_imgs.size();
+
+    kcpp_img_gen_params_t extra_params = {};
+    extra_params.photomaker_references = photomaker_imgs.data();
+    extra_params.photomaker_reference_count = photomaker_imgs.size();
+
    if (sd_params->mode == TXT2IMG) {

        if(!sd_is_quiet && sddebugmode==1)
@ -708,32 +778,8 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
            sd_params->control_strength);
        }

+        results = generate_image(sd_ctx, &params, &extra_params);

-        results = txt2img(sd_ctx,
-                          sd_params->prompt.c_str(),
-                          sd_params->negative_prompt.c_str(),
-                          sd_params->clip_skip,
-                          sd_params->cfg_scale,
-                          sd_params->guidance,
-                          sd_params->eta,
-                          sd_params->width,
-                          sd_params->height,
-                          sd_params->sample_method,
-                          sd_params->sample_steps,
-                          sd_params->seed,
-                          sd_params->batch_count,
-                          control_image,
-                          sd_params->control_strength,
-                          sd_params->style_ratio,
-                          sd_params->normalize_input,
-                          sd_params->input_id_images_path.c_str(),
-                          kontext_imgs.data(), kontext_imgs.size(),
-                          sd_params->skip_layers.data(),
-                          sd_params->skip_layers.size(),
-                          sd_params->slg_scale,
-                          sd_params->skip_layer_start,
-                          sd_params->skip_layer_end,
-                          photomaker_imgs);
    } else {

        if (sd_params->width <= 0 || sd_params->width % 64 != 0 || sd_params->height <= 0 || sd_params->height % 64 != 0) {
@ -839,34 +885,12 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
            sd_params->strength);
        }

-        results = img2img(sd_ctx,
-                            input_image,
-                            mask_image,
-                            sd_params->prompt.c_str(),
-                            sd_params->negative_prompt.c_str(),
-                            sd_params->clip_skip,
-                            sd_params->cfg_scale,
-                            sd_params->guidance,
-                            sd_params->eta,
-                            sd_params->width,
-                            sd_params->height,
-                            sd_params->sample_method,
-                            sd_params->sample_steps,
-                            sd_params->strength,
-                            sd_params->seed,
-                            sd_params->batch_count,
-                            control_image,
-                            sd_params->control_strength,
-                            sd_params->style_ratio,
-                            sd_params->normalize_input,
-                            sd_params->input_id_images_path.c_str(),
-                            kontext_imgs.data(), kontext_imgs.size(),
-                            sd_params->skip_layers.data(),
-                            sd_params->skip_layers.size(),
-                            sd_params->slg_scale,
-                            sd_params->skip_layer_start,
-                            sd_params->skip_layer_end,
-                            photomaker_imgs);
+        params.strength = sd_params->strength;
+        params.init_image = input_image;
+        params.mask_image = mask_image;
+
+        results = generate_image(sd_ctx, &params, &extra_params);
+
    }

    if (results == NULL) {
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@ -30,7 +30,8 @@ extern "C" {

 enum rng_type_t {
    STD_DEFAULT_RNG,
-    CUDA_RNG
+    CUDA_RNG,
+    RNG_TYPE_COUNT
 };

 enum sample_method_t {
@ -46,7 +47,7 @@ enum sample_method_t {
    LCM,
    DDIM_TRAILING,
    TCD,
-    N_SAMPLE_METHODS
+    SAMPLE_METHOD_COUNT
 };

 enum schedule_t {
@ -56,15 +57,15 @@ enum schedule_t {
    EXPONENTIAL,
    AYS,
    GITS,
-    N_SCHEDULES
+    SCHEDULE_COUNT
 };

 // same as enum ggml_type
 enum sd_type_t {
-    SD_TYPE_F32     = 0,
-    SD_TYPE_F16     = 1,
-    SD_TYPE_Q4_0    = 2,
-    SD_TYPE_Q4_1    = 3,
+    SD_TYPE_F32  = 0,
+    SD_TYPE_F16  = 1,
+    SD_TYPE_Q4_0 = 2,
+    SD_TYPE_Q4_1 = 3,
    // SD_TYPE_Q4_2 = 4, support has been removed
    // SD_TYPE_Q4_3 = 5, support has been removed
    SD_TYPE_Q5_0    = 6,
@ -92,19 +93,17 @@ enum sd_type_t {
    SD_TYPE_F64     = 28,
    SD_TYPE_IQ1_M   = 29,
    SD_TYPE_BF16    = 30,
-    SD_TYPE_Q4_0_4_4 = 31,
-    SD_TYPE_Q4_0_4_8 = 32,
-    SD_TYPE_Q4_0_8_8 = 33,
-    SD_TYPE_TQ1_0   = 34,
-    SD_TYPE_TQ2_0   = 35,
-    SD_TYPE_IQ4_NL_4_4 = 36,
+    // SD_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+    // SD_TYPE_Q4_0_4_8 = 32,
+    // SD_TYPE_Q4_0_8_8 = 33,
+    SD_TYPE_TQ1_0 = 34,
+    SD_TYPE_TQ2_0 = 35,
+    // SD_TYPE_IQ4_NL_4_4 = 36,
    // SD_TYPE_IQ4_NL_4_8 = 37,
    // SD_TYPE_IQ4_NL_8_8 = 38,
    SD_TYPE_COUNT   = 40,
 };

-SD_API const char* sd_type_name(enum sd_type_t type);
-
 enum sd_log_level_t {
    SD_LOG_DEBUG,
    SD_LOG_INFO,
@ -112,6 +111,105 @@ enum sd_log_level_t {
    SD_LOG_ERROR
 };

+// Options for creating a context; passed by pointer to new_sd_ctx().
+// sd_ctx_params_init() takes the same struct (presumably to fill in defaults
+// before the caller overrides individual fields - confirm).
+// NOTE(review): the kcpp adapter stores std::string::c_str() pointers in the
+// path fields, so those strings must stay alive at least until new_sd_ctx()
+// returns - confirm that no pointer is retained afterwards.
+typedef struct {
+    // Model / component locations, as C strings.
+    const char* model_path;
+    const char* clip_l_path;
+    const char* clip_g_path;
+    const char* t5xxl_path;
+    const char* diffusion_model_path;
+    const char* vae_path;
+    const char* taesd_path;
+    const char* control_net_path;
+    const char* lora_model_dir;
+    const char* embedding_dir;
+    const char* stacked_id_embed_dir;
+    // VAE and parameter-lifetime switches.
+    bool vae_decode_only;
+    bool vae_tiling;
+    bool free_params_immediately;
+    int n_threads;
+    enum sd_type_t wtype;
+    enum rng_type_t rng_type;
+    enum schedule_t schedule;
+    // Per-component "keep on CPU" switches.
+    bool keep_clip_on_cpu;
+    bool keep_control_net_on_cpu;
+    bool keep_vae_on_cpu;
+    bool diffusion_flash_attn;
+    // The two *_conv_direct flags are not assigned by sdtype_load_model, so
+    // they keep whatever sd_ctx_params_init() leaves in them.
+    bool diffusion_conv_direct;
+    bool vae_conv_direct;
+    // Chroma-specific masking options.
+    bool chroma_use_dit_mask;
+    bool chroma_use_t5_mask;
+    int chroma_t5_mask_pad;
+} sd_ctx_params_t;
+
+// Plain image descriptor used across the C API (taken and returned by value
+// by upscale(), returned by pointer from generate_image()).
+// NOTE(review): data presumably points at width * height * channel bytes -
+// confirm; the type itself does not express who owns the buffer.
+typedef struct {
+    uint32_t width;
+    uint32_t height;
+    uint32_t channel;
+    uint8_t* data;
+} sd_image_t;
+
+// "slg" settings embedded in sd_guidance_params_t. The kcpp adapter fills them
+// from its skip_layers, skip_layer_start, skip_layer_end and slg_scale values.
+typedef struct {
+    // Array of layer indices and its length; the adapter passes
+    // skip_layers.data() / skip_layers.size(), so the array is caller-owned.
+    int* layers;
+    size_t layer_count;
+    float layer_start;
+    float layer_end;
+    float scale;
+} sd_slg_params_t;
+
+// Guidance settings shared by sd_img_gen_params_t and sd_vid_gen_params_t.
+// The kcpp adapter writes the same cfg_scale into both txt_cfg and img_cfg and
+// its "guidance" value into distilled_guidance; it does not assign min_cfg.
+typedef struct {
+    float txt_cfg;
+    float img_cfg;
+    float min_cfg;
+    float distilled_guidance;
+    sd_slg_params_t slg;
+} sd_guidance_params_t;
+
+// Per-request parameters for generate_image(); sd_img_gen_params_init() takes
+// the same struct (presumably to set defaults first - confirm).
+// The pointer members (prompt strings, ref_images, control_cond) refer to
+// caller-provided data; the struct does not own them.
+typedef struct {
+    const char* prompt;
+    const char* negative_prompt;
+    int clip_skip;
+    sd_guidance_params_t guidance;
+    // init_image, mask_image and strength are only assigned by the kcpp
+    // adapter on its img2img path.
+    sd_image_t init_image;
+    // Array of reference images and its length.
+    sd_image_t* ref_images;
+    int ref_images_count;
+    sd_image_t mask_image;
+    int width;
+    int height;
+    enum sample_method_t sample_method;
+    int sample_steps;
+    float eta;
+    float strength;
+    int64_t seed;
+    int batch_count;
+    const sd_image_t* control_cond;
+    float control_strength;
+    float style_strength;
+    bool normalize_input;
+    const char* input_id_images_path;
+} sd_img_gen_params_t;
+
+// kcpp-specific extras handed to generate_image() next to sd_img_gen_params_t:
+// an array of PhotoMaker reference images and the number of entries in it.
+// The adapter points it at a std::vector's data(), so the array is caller-owned.
+typedef struct {
+    sd_image_t* photomaker_references;
+    int photomaker_reference_count;
+} kcpp_img_gen_params_t;
+
+// Parameters for generate_video(); sd_vid_gen_params_init() takes the same struct.
+// NOTE(review): the generate_video() declaration is annotated "// broken", so
+// treat this struct as not yet usable - confirm before relying on it.
+typedef struct {
+    sd_image_t init_image;
+    int width;
+    int height;
+    sd_guidance_params_t guidance;
+    enum sample_method_t sample_method;
+    int sample_steps;
+    float strength;
+    int64_t seed;
+    int video_frames;
+    int motion_bucket_id;
+    int fps;
+    float augmentation_level;
+} sd_vid_gen_params_t;
+
+typedef struct sd_ctx_t sd_ctx_t;
+
 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);

@ -120,154 +218,42 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
 SD_API int32_t sd_get_num_physical_cores();
 SD_API const char* sd_get_system_info();

-typedef struct {
-    uint32_t width;
-    uint32_t height;
-    uint32_t channel;
-    uint8_t* data;
-} sd_image_t;
+SD_API const char* sd_type_name(enum sd_type_t type);
+SD_API enum sd_type_t str_to_sd_type(const char* str);
+SD_API const char* sd_rng_type_name(enum rng_type_t rng_type);
+SD_API enum rng_type_t str_to_rng_type(const char* str);
+SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
+SD_API enum sample_method_t str_to_sample_method(const char* str);
+SD_API const char* sd_schedule_name(enum schedule_t schedule);
+SD_API enum schedule_t str_to_schedule(const char* str);

-typedef struct sd_ctx_t sd_ctx_t;
-
-SD_API void set_sd_vae_tiling(sd_ctx_t* ctx, bool tiling);
-SD_API int get_loaded_sd_version(sd_ctx_t* ctx);
-SD_API bool sd_loaded_chroma();
-
-SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
-                            const char* clip_l_path,
-                            const char* clip_g_path,
-                            const char* t5xxl_path,
-                            const char* diffusion_model_path,
-                            const char* vae_path,
-                            const char* taesd_path,
-                            const char* control_net_path_c_str,
-                            const char* lora_model_dir,
-                            const char* embed_dir_c_str,
-                            const char* stacked_id_embed_dir_c_str,
-                            bool vae_decode_only,
-                            bool vae_tiling,
-                            bool free_params_immediately,
-                            int n_threads,
-                            enum sd_type_t wtype,
-                            enum rng_type_t rng_type,
-                            enum schedule_t s,
-                            bool keep_clip_on_cpu,
-                            bool keep_control_net_cpu,
-                            bool keep_vae_on_cpu,
-                            bool diffusion_flash_attn,
-                            bool chroma_use_dit_mask,
-                            bool chroma_use_t5_mask,
-                            int chroma_t5_mask_pad);
+SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
+SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);

+SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);

-SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
-                           const char* prompt,
-                           const char* negative_prompt,
-                           int clip_skip,
-                           float cfg_scale,
-                           float guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_strength,
-                           bool normalize_input,
-                           const char* input_id_images_path,
-                           sd_image_t* kontext_imgs,
-                           int kontext_img_count,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end,
-                           const std::vector<sd_image_t*> photomaker_references);
+SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
+SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
+SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, const kcpp_img_gen_params_t* kcpp_img_gen_params);

-SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           sd_image_t mask_image,
-                           const char* prompt,
-                           const char* negative_prompt,
-                           int clip_skip,
-                           float cfg_scale,
-                           float guidance,
-                           float eta,
-                           int width,
-                           int height,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed,
-                           int batch_count,
-                           const sd_image_t* control_cond,
-                           float control_strength,
-                           float style_strength,
-                           bool normalize_input,
-                           const char* input_id_images_path,
-                           sd_image_t* kontext_imgs,
-                           int kontext_img_count,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end,
-                           const std::vector<sd_image_t*> photomaker_references);
-
-SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
-                           sd_image_t init_image,
-                           int width,
-                           int height,
-                           int video_frames,
-                           int motion_bucket_id,
-                           int fps,
-                           float augmentation_level,
-                           float min_cfg,
-                           float cfg_scale,
-                           enum sample_method_t sample_method,
-                           int sample_steps,
-                           float strength,
-                           int64_t seed);
-
-SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
-                        sd_image_t* ref_images,
-                        int ref_images_count,
-                        const char* prompt,
-                        const char* negative_prompt,
-                        int clip_skip,
-                        float cfg_scale,
-                        float guidance,
-                        float eta,
-                        int width,
-                        int height,
-                        enum sample_method_t sample_method,
-                        int sample_steps,
-                        float strength,
-                        int64_t seed,
-                        int batch_count,
-                        const sd_image_t* control_cond,
-                        float control_strength,
-                        float style_strength,
-                        bool normalize_input,
-                        int* skip_layers,
-                        size_t skip_layers_count,
-                        float slg_scale,
-                        float skip_layer_start,
-                        float skip_layer_end);
+SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params);  // broken

 typedef struct upscaler_ctx_t upscaler_ctx_t;

 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
-                                        int n_threads);
+                                        int n_threads,
+                                        bool direct);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);

 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);

-SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type);
+SD_API bool convert(const char* input_path,
+                    const char* vae_path,
+                    const char* output_path,
+                    enum sd_type_t output_type,
+                    const char* tensor_type_rules);

 SD_API uint8_t* preprocess_canny(uint8_t* img,
                                 int width,
@ -282,4 +268,4 @@ SD_API uint8_t* preprocess_canny(uint8_t* img,
 }
 #endif

-#endif  // __STABLE_DIFFUSION_H__
+#endif  // __STABLE_DIFFUSION_H__
--- a/otherarch/sdcpp/t5.hpp
+++ b/otherarch/sdcpp/t5.hpp
@ -457,8 +457,8 @@ protected:
    int64_t hidden_size;
    float eps;

-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type wtype = GGML_TYPE_F32;
        params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
    }

@ -735,7 +735,7 @@ struct T5Runner : public GGMLRunner {
    std::vector<int> relative_position_bucket_vec;

    T5Runner(ggml_backend_t backend,
-             std::map<std::string, enum ggml_type>& tensor_types,
+             const String2GGMLType& tensor_types,
             const std::string prefix,
             int64_t num_layers = 24,
             int64_t model_dim  = 4096,
@ -876,16 +876,14 @@ struct T5Embedder {
    T5UniGramTokenizer tokenizer;
    T5Runner model;

-    static std::map<std::string, enum ggml_type> empty_tensor_types;
-
    T5Embedder(ggml_backend_t backend,
-               std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
-               const std::string prefix                            = "",
-               int64_t num_layers                                  = 24,
-               int64_t model_dim                                   = 4096,
-               int64_t ff_dim                                      = 10240,
-               int64_t num_heads                                   = 64,
-               int64_t vocab_size                                  = 32128)
+               const String2GGMLType& tensor_types = {},
+               const std::string prefix            = "",
+               int64_t num_layers                  = 24,
+               int64_t model_dim                   = 4096,
+               int64_t ff_dim                      = 10240,
+               int64_t num_heads                   = 64,
+               int64_t vocab_size                  = 32128)
        : model(backend, tensor_types, prefix, num_layers, model_dim, ff_dim, num_heads, vocab_size) {
    }

--- a/otherarch/sdcpp/tae.hpp
+++ b/otherarch/sdcpp/tae.hpp
@ -149,7 +149,7 @@ public:
                if (i == 1) {
                    h = ggml_relu_inplace(ctx, h);
                } else {
-                    h = ggml_upscale(ctx, h, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
+                    h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
                }
                continue;
            }
@ -196,7 +196,7 @@ struct TinyAutoEncoder : public GGMLRunner {
    bool decode_only = false;

    TinyAutoEncoder(ggml_backend_t backend,
-                    std::map<std::string, enum ggml_type>& tensor_types,
+                    const String2GGMLType& tensor_types,
                    const std::string prefix,
                    bool decoder_only = true,
                    SDVersion version = VERSION_SD1)
@ -206,6 +206,17 @@ struct TinyAutoEncoder : public GGMLRunner {
        taesd.init(params_ctx, tensor_types, prefix);
    }

+    // Call enable_direct() on every block of the taesd model whose
+    // description string is exactly "Conv2d".
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> all_blocks;
+        taesd.get_all_blocks(all_blocks);
+        for (GGMLBlock* candidate : all_blocks) {
+            // The description string is the only type tag consulted here.
+            if (candidate->get_desc() != "Conv2d") {
+                continue;
+            }
+            static_cast<Conv2d*>(candidate)->enable_direct();
+        }
+    }
+
    std::string get_desc() {
        return "taesd";
    }
--- a/otherarch/sdcpp/unet.hpp
+++ b/otherarch/sdcpp/unet.hpp
@ -166,7 +166,6 @@ public:
 // ldm.modules.diffusionmodules.openaimodel.UNetModel
 class UnetModelBlock : public GGMLBlock {
 protected:
-    static std::map<std::string, enum ggml_type> empty_tensor_types;
    SDVersion version = VERSION_SD1;
    // network hparams
    int in_channels                        = 4;
@ -184,7 +183,7 @@ public:
    int model_channels  = 320;
    int adm_in_channels = 2816;  // only for VERSION_SDXL/SVD

-    UnetModelBlock(SDVersion version = VERSION_SD1, std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types, bool flash_attn = false)
+    UnetModelBlock(SDVersion version = VERSION_SD1, const String2GGMLType& tensor_types = {}, bool flash_attn = false)
        : version(version) {
        if (sd_version_is_sd2(version)) {
            context_dim       = 1024;
@ -207,6 +206,8 @@ public:
        }
        if (sd_version_is_inpaint(version)) {
            in_channels = 9;
+        } else if (sd_version_is_unet_edit(version)) {
+            in_channels = 8;
        }

        // dims is always 2
@ -537,7 +538,7 @@ struct UNetModelRunner : public GGMLRunner {
    UnetModelBlock unet;

    UNetModelRunner(ggml_backend_t backend,
-                    std::map<std::string, enum ggml_type>& tensor_types,
+                    const String2GGMLType& tensor_types,
                    const std::string prefix,
                    SDVersion version = VERSION_SD1,
                    bool flash_attn   = false)
@ -545,6 +546,18 @@ struct UNetModelRunner : public GGMLRunner {
        unet.init(params_ctx, tensor_types, prefix);
    }

+    // Call enable_direct() on every block of the unet whose description
+    // string is exactly "Conv2d", emitting a debug log line for each one.
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> all_blocks;
+        unet.get_all_blocks(all_blocks);
+        for (GGMLBlock* candidate : all_blocks) {
+            // The description string is the only type tag consulted here.
+            if (candidate->get_desc() != "Conv2d") {
+                continue;
+            }
+            LOG_DEBUG("block %s", candidate->get_desc().c_str());
+            static_cast<Conv2d*>(candidate)->enable_direct();
+        }
+    }
+
    std::string get_desc() {
        return "unet";
    }
@ -657,4 +670,4 @@ struct UNetModelRunner : public GGMLRunner {
    }
 };

-#endif  // __UNET_HPP__
+#endif  // __UNET_HPP__
--- a/otherarch/sdcpp/upscaler.cpp
+++ b/otherarch/sdcpp/upscaler.cpp
@ -9,9 +9,12 @@ struct UpscalerGGML {
    std::shared_ptr<ESRGAN> esrgan_upscaler;
    std::string esrgan_path;
    int n_threads;
+    bool direct = false;

-    UpscalerGGML(int n_threads)
-        : n_threads(n_threads) {
+    UpscalerGGML(int n_threads,
+                 bool direct = false)
+        : n_threads(n_threads),
+          direct(direct) {
    }

    bool load_from_file(const std::string& esrgan_path) {
@ -21,12 +24,17 @@ struct UpscalerGGML {
 #endif
 #ifdef SD_USE_METAL
        LOG_DEBUG("Using Metal backend");
+        ggml_log_set(ggml_log_callback_default, nullptr);
        backend = ggml_backend_metal_init();
 #endif
 #ifdef SD_USE_VULKAN
        LOG_DEBUG("Using Vulkan backend");
        backend = ggml_backend_vk_init(0);
 #endif
+#ifdef SD_USE_OPENCL
+        LOG_DEBUG("Using OpenCL backend");
+        backend = ggml_backend_opencl_init();
+#endif
 #ifdef SD_USE_SYCL
        LOG_DEBUG("Using SYCL backend");
        backend = ggml_backend_sycl_init(0);
@ -42,6 +50,9 @@ struct UpscalerGGML {
        }
        LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
        esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_loader.tensor_storages_types);
+        if (direct) {
+            esrgan_upscaler->enable_conv2d_direct();
+        }
        if (!esrgan_upscaler->load_from_file(esrgan_path)) {
            return false;
        }
@ -99,14 +110,15 @@ struct upscaler_ctx_t {
 };

 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
-                                 int n_threads) {
+                                 int n_threads,
+                                 bool direct = false) {
    upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
    if (upscaler_ctx == NULL) {
        return NULL;
    }
    std::string esrgan_path(esrgan_path_c_str);

-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
    if (upscaler_ctx->upscaler == NULL) {
        return NULL;
    }
--- a/otherarch/sdcpp/util.cpp
+++ b/otherarch/sdcpp/util.cpp
@ -102,19 +102,32 @@ std::vector<std::string> get_files_from_dir(const std::string& dir) {
    sprintf(directoryPath, "%s\\%s\\*", currentDirectory, dir.c_str());

    // Find the first file in the directory
-    hFind = FindFirstFile(directoryPath, &findFileData);
-
+    hFind               = FindFirstFile(directoryPath, &findFileData);
+    bool isAbsolutePath = false;
    // Check if the directory was found
    if (hFind == INVALID_HANDLE_VALUE) {
-        printf("Unable to find directory.\n");
-        return files;
+        printf("Unable to find directory. Try with original path \n");
+
+        char directoryPathAbsolute[MAX_PATH];  // fallback search pattern: treat `dir` itself as the directory to scan
+        snprintf(directoryPathAbsolute, sizeof(directoryPathAbsolute), "%s\\*", dir.c_str());  // bounded write; "\\*" lists the entries inside `dir`, consistent with the dir + "\\" + name paths built in the loop below
+
+        hFind          = FindFirstFile(directoryPathAbsolute, &findFileData);
+        isAbsolutePath = true;  // results are then reported under `dir` rather than under the current directory
+        if (hFind == INVALID_HANDLE_VALUE) {
+            printf("Absolute path was also wrong.\n");
+            return files;  // neither interpretation of `dir` could be searched; return whatever `files` holds
+        }
    }

    // Loop through all files in the directory
    do {
        // Check if the found file is a regular file (not a directory)
        if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
-            files.push_back(std::string(currentDirectory) + "\\" + dir + "\\" + std::string(findFileData.cFileName));
+            if (isAbsolutePath) {
+                files.push_back(dir + "\\" + std::string(findFileData.cFileName));
+            } else {
+                files.push_back(std::string(currentDirectory) + "\\" + dir + "\\" + std::string(findFileData.cFileName));
+            }
        }
    } while (FindNextFile(hFind, &findFileData) != 0);

@ -447,10 +460,6 @@ const char* sd_get_system_info() {
    return buffer;
 }

-const char* sd_type_name(enum sd_type_t type) {
-    return ggml_type_name((ggml_type)type);
-}
-
 sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
    sd_image_f32_t converted_image;
    converted_image.width   = image.width;
--- a/otherarch/sdcpp/util.h
+++ b/otherarch/sdcpp/util.h
@ -7,6 +7,9 @@

 #include "stable-diffusion.h"

+#define SAFE_STR(s) ((s) ? (s) : "")  // yields "" when `s` is null/false, otherwise `s`; note `s` is evaluated twice
+#define BOOL_STR(b) ((b) ? "true" : "false")  // maps a truthy/falsy expression to the literal "true"/"false"
+
 bool ends_with(const std::string& str, const std::string& ending);
 bool starts_with(const std::string& str, const std::string& start);
 bool contains(const std::string& str, const std::string& substr);
--- a/otherarch/sdcpp/vae.hpp
+++ b/otherarch/sdcpp/vae.hpp
@ -163,8 +163,8 @@ public:

 class VideoResnetBlock : public ResnetBlock {
 protected:
-    void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = (tensor_types.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
+    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+        enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_types, GGML_TYPE_F32);
        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
    }

@ -525,7 +525,7 @@ struct AutoEncoderKL : public GGMLRunner {
    AutoencodingEngine ae;

    AutoEncoderKL(ggml_backend_t backend,
-                  std::map<std::string, enum ggml_type>& tensor_types,
+                  const String2GGMLType& tensor_types,
                  const std::string prefix,
                  bool decode_only       = false,
                  bool use_video_decoder = false,
@ -534,6 +534,17 @@ struct AutoEncoderKL : public GGMLRunner {
        ae.init(params_ctx, tensor_types, prefix);
    }

+    void enable_conv2d_direct() {  // Calls enable_direct() on every block of `ae` whose descriptor is "Conv2d".
+        std::vector<GGMLBlock*> blocks;
+        ae.get_all_blocks(blocks);  // presumably fills `blocks` with the autoencoder's sub-blocks — confirm against GGMLBlock
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {  // blocks are selected by descriptor string, not by dynamic type
+                auto conv_block = (Conv2d*)block;  // NOTE(review): unchecked downcast; assumes only Conv2d reports "Conv2d" — confirm
+                conv_block->enable_direct();
+            }
+        }
+    }
+
    std::string get_desc() {
        return "vae";
    }