updated sdcpp to prepare for inpaint

fixed img2img (+1 squashed commit)

Squashed commits:

[42c48f14] try updating sdcpp, feels kind of buggy
Concedo 2025-04-08 23:47:12 +08:00
parent ebf924c5d1
commit fea3b2bd4a
18 changed files with 1850 additions and 271 deletions

@ -546,7 +546,7 @@ protected:
int64_t num_positions;
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
enum ggml_type token_wtype = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
enum ggml_type token_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@ -949,4 +949,4 @@ struct CLIPTextModelRunner : public GGMLRunner {
}
};
#endif // __CLIP_HPP__
#endif // __CLIP_HPP__

@ -51,7 +51,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::string trigger_word = "img"; // should be user settable
std::string embd_dir;
int32_t num_custom_embeddings = 0;
int32_t num_custom_embeddings = 0;
int32_t num_custom_embeddings_2 = 0;
std::vector<uint8_t> token_embed_custom;
std::vector<std::string> readed_embeddings;
@ -61,18 +62,18 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
SDVersion version = VERSION_SD1,
PMVersion pv = PM_VERSION_1,
int clip_skip = -1)
: version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir) {
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
if (clip_skip <= 0) {
clip_skip = 1;
if (version == VERSION_SD2 || version == VERSION_SDXL) {
if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
clip_skip = 2;
}
}
if (version == VERSION_SD1) {
if (sd_version_is_sd1(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
} else if (version == VERSION_SD2) {
} else if (sd_version_is_sd2(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
} else if (version == VERSION_SDXL) {
} else if (sd_version_is_sdxl(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
}
@ -80,35 +81,35 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
void set_clip_skip(int clip_skip) {
text_model->set_clip_skip(clip_skip);
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
text_model2->set_clip_skip(clip_skip);
}
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
}
}
void alloc_params_buffer() {
text_model->alloc_params_buffer();
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
text_model2->alloc_params_buffer();
}
}
void free_params_buffer() {
text_model->free_params_buffer();
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
text_model2->free_params_buffer();
}
}
size_t get_params_buffer_size() {
size_t buffer_size = text_model->get_params_buffer_size();
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
buffer_size += text_model2->get_params_buffer_size();
}
return buffer_size;
@ -131,28 +132,55 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
params.no_alloc = false;
struct ggml_context* embd_ctx = ggml_init(params);
struct ggml_tensor* embd = NULL;
int64_t hidden_size = text_model->model.hidden_size;
struct ggml_tensor* embd2 = NULL;
auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
if (tensor_storage.ne[0] != hidden_size) {
LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
return false;
if (tensor_storage.ne[0] != text_model->model.hidden_size) {
if (text_model2) {
if (tensor_storage.ne[0] == text_model2->model.hidden_size) {
embd2 = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, text_model2->model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
*dst_tensor = embd2;
} else {
LOG_DEBUG("embedding wrong hidden size, got %i, expected %i or %i", tensor_storage.ne[0], text_model->model.hidden_size, text_model2->model.hidden_size);
return false;
}
} else {
LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], text_model->model.hidden_size);
return false;
}
} else {
embd = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, text_model->model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
*dst_tensor = embd;
}
embd = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
*dst_tensor = embd;
return true;
};
model_loader.load_tensors(on_load, NULL);
readed_embeddings.push_back(embd_name);
token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
embd->data,
ggml_nbytes(embd));
for (int i = 0; i < embd->ne[1]; i++) {
bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
// LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
num_custom_embeddings++;
if (embd) {
int64_t hidden_size = text_model->model.hidden_size;
token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
embd->data,
ggml_nbytes(embd));
for (int i = 0; i < embd->ne[1]; i++) {
bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
// LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
num_custom_embeddings++;
}
LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
}
if (embd2) {
int64_t hidden_size = text_model2->model.hidden_size;
token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd2));
memcpy((void*)(token_embed_custom.data() + num_custom_embeddings_2 * hidden_size * ggml_type_size(embd2->type)),
embd2->data,
ggml_nbytes(embd2));
for (int i = 0; i < embd2->ne[1]; i++) {
bpe_tokens.push_back(text_model2->model.vocab_size + num_custom_embeddings_2);
// LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
num_custom_embeddings_2++;
}
LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2);
}
LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
return true;
}
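For SDXL-style checkpoints there are two text encoders with different hidden sizes (768 for CLIP-L, 1280 for OpenCLIP bigG), which is what the loader above keys on to decide whether an embedding tensor goes to text_model or text_model2. A minimal standalone sketch of that routing, using plain float buffers and hypothetical sizes instead of ggml tensors:

#include <cstdint>
#include <vector>

// Hypothetical routing helper mirroring the logic above: pick the target
// embedding table by comparing the tensor's first dimension to each text
// model's hidden size (e.g. 768 for CLIP-L, 1280 for OpenCLIP bigG in SDXL).
struct EmbeddingStore {
    int64_t hidden_size_1 = 768;
    int64_t hidden_size_2 = 1280;
    std::vector<float> table_1;  // custom embeddings for text_model
    std::vector<float> table_2;  // custom embeddings for text_model2
    int num_custom_1 = 0;
    int num_custom_2 = 0;

    bool add(const float* data, int64_t dim, int64_t n_tokens) {
        if (dim == hidden_size_1) {
            table_1.insert(table_1.end(), data, data + dim * n_tokens);
            num_custom_1 += (int)n_tokens;
            return true;
        }
        if (dim == hidden_size_2) {
            table_2.insert(table_2.end(), data, data + dim * n_tokens);
            num_custom_2 += (int)n_tokens;
            return true;
        }
        return false;  // wrong hidden size, rejected like the loader above
    }
};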
@ -402,7 +430,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
struct ggml_tensor* input_ids2 = NULL;
size_t max_token_idx = 0;
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
if (it != chunk_tokens.end()) {
std::fill(std::next(it), chunk_tokens.end(), 0);
@ -427,7 +455,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
false,
&chunk_hidden_states1,
work_ctx);
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
text_model2->compute(n_threads,
input_ids2,
0,
@ -486,7 +514,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
ggml_tensor* vec = NULL;
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
int out_dim = 256;
vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
// [0:1280]

@ -34,11 +34,11 @@ public:
ControlNetBlock(SDVersion version = VERSION_SD1)
: version(version) {
if (version == VERSION_SD2) {
if (sd_version_is_sd2(version)) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_SDXL) {
} else if (sd_version_is_sdxl(version)) {
context_dim = 2048;
attention_resolutions = {4, 2};
channel_mult = {1, 2, 4};
@ -58,7 +58,7 @@ public:
// time_embed_1 is nn.SiLU()
blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
if (version == VERSION_SDXL || version == VERSION_SVD) {
if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
// label_emb_1 is nn.SiLU()
blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

@ -474,7 +474,8 @@ static void sample_k_diffusion(sample_method_t method,
ggml_context* work_ctx,
ggml_tensor* x,
std::vector<float> sigmas,
std::shared_ptr<RNG> rng) {
std::shared_ptr<RNG> rng,
float eta) {
size_t steps = sigmas.size() - 1;
// sample_euler_ancestral
switch (method) {
@ -1005,6 +1006,374 @@ static void sample_k_diffusion(sample_method_t method,
}
}
} break;
case DDIM_TRAILING: // Denoising Diffusion Implicit Models
// with the "trailing" timestep spacing
{
// See J. Song et al., "Denoising Diffusion Implicit
// Models", arXiv:2010.02502 [cs.LG]
//
// DDIM itself needs alphas_cumprod (DDPM, J. Ho et al.,
// arXiv:2006.11239 [cs.LG] with k-diffusion's start and
// end beta) (which unfortunately k-diffusion's data
// structure hides from the denoiser), and the sigmas are
// also needed to invert the behavior of CompVisDenoiser
// (k-diffusion's LMSDiscreteScheduler)
float beta_start = 0.00085f;
float beta_end = 0.0120f;
std::vector<double> alphas_cumprod;
std::vector<double> compvis_sigmas;
alphas_cumprod.resize(TIMESTEPS);  // resize (not just reserve) so the index assignments below are valid
compvis_sigmas.resize(TIMESTEPS);
for (int i = 0; i < TIMESTEPS; i++) {
alphas_cumprod[i] =
(i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
(1.0f -
std::pow(sqrtf(beta_start) +
(sqrtf(beta_end) - sqrtf(beta_start)) *
((float)i / (TIMESTEPS - 1)), 2));
compvis_sigmas[i] =
std::sqrt((1 - alphas_cumprod[i]) /
alphas_cumprod[i]);
}
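// Equivalently: beta_i = (sqrt(beta_start) + (sqrt(beta_end) - sqrt(beta_start)) * i / (T - 1))^2
// (the "scaled linear" schedule), alphas_cumprod[t] = prod_{i <= t} (1 - beta_i),
// and compvis_sigmas[t] = sqrt((1 - alphas_cumprod[t]) / alphas_cumprod[t]).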
struct ggml_tensor* pred_original_sample =
ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* variance_noise =
ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) {
// The "trailing" DDIM timestep, see S. Lin et al.,
// "Common Diffusion Noise Schedules and Sample Steps
// are Flawed", arXiv:2305.08891 [cs], p. 4, Table
// 2. Most variables below follow Diffusers naming
//
// Diffusers naming vs. Song et al. (2020), p. 5, (12)
// and p. 16, (16) (<variable name> -> <name in
// paper>):
//
// - pred_noise_t -> epsilon_theta^(t)(x_t)
// - pred_original_sample -> f_theta^(t)(x_t) or x_0
// - std_dev_t -> sigma_t (not the LMS sigma)
// - eta -> eta (set to 0 at the moment)
// - pred_sample_direction -> "direction pointing to
// x_t"
// - pred_prev_sample -> "x_t-1"
int timestep =
roundf(TIMESTEPS -
i * ((float)TIMESTEPS / steps)) - 1;
// 1. get previous step value (=t-1)
int prev_timestep = timestep - TIMESTEPS / steps;
// The sigma here is chosen to cause the
// CompVisDenoiser to produce t = timestep
float sigma = compvis_sigmas[timestep];
if (i == 0) {
// The function add_noise initializes x to
// Diffusers' latents * sigma (as in Diffusers'
// pipeline) or sample * sigma (Diffusers'
// scheduler), where this sigma = init_noise_sigma
// in Diffusers. For DDPM and DDIM however,
// init_noise_sigma = 1. But the k-diffusion
// model() also evaluates F_theta(c_in(sigma) x;
// ...) instead of the bare U-net F_theta, with
// c_in = 1 / sqrt(sigma^2 + 1), as defined in
// T. Karras et al., "Elucidating the Design Space
// of Diffusion-Based Generative Models",
// arXiv:2206.00364 [cs.CV], p. 3, Table 1. Hence
// the first call has to be prescaled as x <- x /
// (c_in * sigma) with the k-diffusion pipeline
// and CompVisDenoiser.
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] *= std::sqrt(sigma * sigma + 1) /
sigma;
}
}
else {
// For the subsequent steps after the first one,
// at this point x = latents or x = sample, and
// needs to be prescaled with x <- sample / c_in
// to compensate for model() applying the scale
// c_in before the U-net F_theta
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] *= std::sqrt(sigma * sigma + 1);
}
}
// Note (also noise_pred in Diffusers' pipeline)
// model_output = model() is the D(x, sigma) as
// defined in Karras et al. (2022), p. 3, Table 1 and
// p. 8 (7), compare also p. 38 (226) therein.
struct ggml_tensor* model_output =
model(x, sigma, i + 1);
// Here model_output is still the k-diffusion denoiser
// output, not the U-net output F_theta(c_in(sigma) x;
// ...) in Karras et al. (2022), whereas Diffusers'
// model_output is F_theta(...). Recover the actual
// model_output, which is also referred to as the
// "Karras ODE derivative" d or d_cur in several
// samplers above.
{
float* vec_x = (float*)x->data;
float* vec_model_output =
(float*)model_output->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_model_output[j] =
(vec_x[j] - vec_model_output[j]) *
(1 / sigma);
}
}
// 2. compute alphas, betas
float alpha_prod_t = alphas_cumprod[timestep];
// Note final_alpha_cumprod = alphas_cumprod[0] due to
// trailing timestep spacing
float alpha_prod_t_prev = prev_timestep >= 0 ?
alphas_cumprod[prev_timestep] : alphas_cumprod[0];
float beta_prod_t = 1 - alpha_prod_t;
// 3. compute predicted original sample from predicted
// noise also called "predicted x_0" of formula (12)
// from https://arxiv.org/pdf/2010.02502.pdf
{
float* vec_x = (float*)x->data;
float* vec_model_output =
(float*)model_output->data;
float* vec_pred_original_sample =
(float*)pred_original_sample->data;
// Note the substitution of latents or sample = x
// * c_in = x / sqrt(sigma^2 + 1)
for (int j = 0; j < ggml_nelements(x); j++) {
vec_pred_original_sample[j] =
(vec_x[j] / std::sqrt(sigma * sigma + 1) -
std::sqrt(beta_prod_t) *
vec_model_output[j]) *
(1 / std::sqrt(alpha_prod_t));
}
}
// Assuming the "epsilon" prediction type, where below
// pred_epsilon = model_output is inserted, and is not
// defined/copied explicitly.
//
// 5. compute variance: "sigma_t(eta)" -> see formula
// (16)
//
// sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) *
// sqrt(1 - alpha_t/alpha_t-1)
float beta_prod_t_prev = 1 - alpha_prod_t_prev;
float variance = (beta_prod_t_prev / beta_prod_t) *
(1 - alpha_prod_t / alpha_prod_t_prev);
float std_dev_t = eta * std::sqrt(variance);
// 6. compute "direction pointing to x_t" of formula
// (12) from https://arxiv.org/pdf/2010.02502.pdf
// 7. compute x_t without "random noise" of formula
// (12) from https://arxiv.org/pdf/2010.02502.pdf
{
float* vec_model_output = (float*)model_output->data;
float* vec_pred_original_sample =
(float*)pred_original_sample->data;
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
// Two step inner loop without an explicit
// tensor
float pred_sample_direction =
std::sqrt(1 - alpha_prod_t_prev -
std::pow(std_dev_t, 2)) *
vec_model_output[j];
vec_x[j] = std::sqrt(alpha_prod_t_prev) *
vec_pred_original_sample[j] +
pred_sample_direction;
}
}
if (eta > 0) {
ggml_tensor_set_f32_randn(variance_noise, rng);
float* vec_variance_noise =
(float*)variance_noise->data;
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] += std_dev_t * vec_variance_noise[j];
}
}
// See the note above: x = latents or sample here, and
// is not scaled by the c_in. For the final output
// this is correct, but for subsequent iterations, x
// needs to be prescaled again, since k-diffusion's
// model() differs from the bare U-net F_theta by the
// factor c_in.
}
} break;
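To make the trailing spacing above concrete, here is a standalone, arithmetic-only sketch (no ggml) that prints the (t, t_prev) pairs the loop visits; with TIMESTEPS = 1000 and steps = 10 it yields 999/899, 899/799, ..., 99/-1, and the final negative prev_timestep is why alpha_prod_t_prev falls back to alphas_cumprod[0]:

#include <cmath>
#include <cstdio>

int main() {
    const int TIMESTEPS = 1000;  // same constant as in denoiser.hpp
    const int steps = 10;        // example step count
    for (int i = 0; i < steps; i++) {
        // "trailing" spacing, identical to the expression in the sampler above
        int timestep = (int)std::roundf(TIMESTEPS - i * ((float)TIMESTEPS / steps)) - 1;
        int prev_timestep = timestep - TIMESTEPS / steps;
        printf("step %d: t = %d, t_prev = %d\n", i, timestep, prev_timestep);
    }
    return 0;
}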
case TCD: // Strategic Stochastic Sampling (Algorithm 4) in
// Trajectory Consistency Distillation
{
// See J. Zheng et al., "Trajectory Consistency
// Distillation: Improved Latent Consistency Distillation
// by Semi-Linear Consistency Function with Trajectory
// Mapping", arXiv:2402.19159 [cs.CV]
float beta_start = 0.00085f;
float beta_end = 0.0120f;
std::vector<double> alphas_cumprod;
std::vector<double> compvis_sigmas;
alphas_cumprod.resize(TIMESTEPS);  // resize (not just reserve) so the index assignments below are valid
compvis_sigmas.resize(TIMESTEPS);
for (int i = 0; i < TIMESTEPS; i++) {
alphas_cumprod[i] =
(i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
(1.0f -
std::pow(sqrtf(beta_start) +
(sqrtf(beta_end) - sqrtf(beta_start)) *
((float)i / (TIMESTEPS - 1)), 2));
compvis_sigmas[i] =
std::sqrt((1 - alphas_cumprod[i]) /
alphas_cumprod[i]);
}
int original_steps = 50;
struct ggml_tensor* pred_original_sample =
ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* noise =
ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) {
// Analytic form for TCD timesteps
int timestep = TIMESTEPS - 1 -
(TIMESTEPS / original_steps) *
(int)floor(i * ((float)original_steps / steps));
// 1. get previous step value
int prev_timestep = i >= steps - 1 ? 0 :
TIMESTEPS - 1 - (TIMESTEPS / original_steps) *
(int)floor((i + 1) *
((float)original_steps / steps));
// Here timestep_s is tau_n' in Algorithm 4. The _s
// notation appears to be that from C. Lu,
// "DPM-Solver: A Fast ODE Solver for Diffusion
// Probabilistic Model Sampling in Around 10 Steps",
// arXiv:2206.00927 [cs.LG], but this notation is not
// continued in Algorithm 4, where _n' is used.
int timestep_s =
(int)floor((1 - eta) * prev_timestep);
// Begin k-diffusion specific workaround for
// evaluating F_theta(x; ...) from D(x, sigma), same
// as in DDIM (and see there for detailed comments)
float sigma = compvis_sigmas[timestep];
if (i == 0) {
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] *= std::sqrt(sigma * sigma + 1) /
sigma;
}
}
else {
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] *= std::sqrt(sigma * sigma + 1);
}
}
struct ggml_tensor* model_output =
model(x, sigma, i + 1);
{
float* vec_x = (float*)x->data;
float* vec_model_output =
(float*)model_output->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_model_output[j] =
(vec_x[j] - vec_model_output[j]) *
(1 / sigma);
}
}
// 2. compute alphas, betas
//
// When comparing TCD with DDPM/DDIM note that Zheng
// et al. (2024) follows the DPM-Solver notation for
// alpha. One can find the following comment in the
// original DPM-Solver code
// (https://github.com/LuChengTHU/dpm-solver/):
// "**Important**: Please pay special attention for
// the args for `alphas_cumprod`: The `alphas_cumprod`
// is the \hat{alpha_n} arrays in the notations of
// DDPM. [...] Therefore, the notation \hat{alpha_n}
// is different from the notation alpha_t in
// DPM-Solver. In fact, we have alpha_{t_n} =
// \sqrt{\hat{alpha_n}}, [...]"
float alpha_prod_t = alphas_cumprod[timestep];
float beta_prod_t = 1 - alpha_prod_t;
// Note final_alpha_cumprod = alphas_cumprod[0] since
// TCD is always "trailing"
float alpha_prod_t_prev = prev_timestep >= 0 ?
alphas_cumprod[prev_timestep] : alphas_cumprod[0];
// The subscript _s are the only portion in this
// section (2) unique to TCD
float alpha_prod_s = alphas_cumprod[timestep_s];
float beta_prod_s = 1 - alpha_prod_s;
// 3. Compute the predicted noised sample x_s based on
// the model parameterization
//
// This section is also exactly the same as DDIM
{
float* vec_x = (float*)x->data;
float* vec_model_output =
(float*)model_output->data;
float* vec_pred_original_sample =
(float*)pred_original_sample->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_pred_original_sample[j] =
(vec_x[j] / std::sqrt(sigma * sigma + 1) -
std::sqrt(beta_prod_t) *
vec_model_output[j]) *
(1 / std::sqrt(alpha_prod_t));
}
}
// This consistency function step can be difficult to
// decipher from Algorithm 4, as it is simply stated
// using a consistency function. This step is the
// modified DDIM, i.e. p. 8 (32) in Zheng et
// al. (2024), with eta set to 0 (see the paragraph
// immediately thereafter that states this somewhat
// obliquely).
{
float* vec_pred_original_sample =
(float*)pred_original_sample->data;
float* vec_model_output =
(float*)model_output->data;
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
// Substituting x = pred_noised_sample and
// pred_epsilon = model_output
vec_x[j] =
std::sqrt(alpha_prod_s) *
vec_pred_original_sample[j] +
std::sqrt(beta_prod_s) *
vec_model_output[j];
}
}
// 4. Sample and inject noise z ~ N(0, I) for
// MultiStep Inference Noise is not used on the final
// timestep of the timestep schedule. This also means
// that noise is not used for one-step sampling. Eta
// (referred to as "gamma" in the paper) was
// introduced to control the stochasticity in every
// step. When eta = 0, it represents deterministic
// sampling, whereas eta = 1 indicates full stochastic
// sampling.
if (eta > 0 && i != steps - 1) {
// In this case, x is still pred_noised_sample,
// continue in-place
ggml_tensor_set_f32_randn(noise, rng);
float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data;
for (int j = 0; j < ggml_nelements(x); j++) {
// Corresponding to (35) in Zheng et
// al. (2024), substituting x =
// pred_noised_sample
vec_x[j] =
std::sqrt(alpha_prod_t_prev /
alpha_prod_s) *
vec_x[j] +
std::sqrt(1 - alpha_prod_t_prev /
alpha_prod_s) *
vec_noise[j];
}
}
}
} break;
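The TCD schedule can be checked the same way: a standalone sketch (arithmetic only, example values) printing t, t_prev, and the intermediate t_s = floor((1 - eta) * t_prev) used by the consistency step above:

#include <cmath>
#include <cstdio>

int main() {
    const int TIMESTEPS = 1000;     // as in denoiser.hpp
    const int original_steps = 50;  // TCD's assumed training schedule length
    const int steps = 4;            // example inference step count
    const float eta = 0.3f;         // "gamma" in the TCD paper
    for (int i = 0; i < steps; i++) {
        int timestep = TIMESTEPS - 1 -
                       (TIMESTEPS / original_steps) *
                           (int)std::floor(i * ((float)original_steps / steps));
        int prev_timestep = i >= steps - 1 ? 0 :
            TIMESTEPS - 1 - (TIMESTEPS / original_steps) *
                (int)std::floor((i + 1) * ((float)original_steps / steps));
        int timestep_s = (int)std::floor((1 - eta) * prev_timestep);
        printf("step %d: t = %d, t_prev = %d, t_s = %d\n", i, timestep, prev_timestep, timestep_s);
    }
    return 0;
}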
default:
LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);
@ -1012,4 +1381,4 @@ static void sample_k_diffusion(sample_method_t method,
}
}
#endif // __DENOISER_HPP__
#endif // __DENOISER_HPP__

@ -133,8 +133,9 @@ struct FluxModel : public DiffusionModel {
FluxModel(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
bool flash_attn = false)
: flux(backend, tensor_types, "model.diffusion_model", flash_attn) {
SDVersion version = VERSION_FLUX,
bool flash_attn = false)
: flux(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
}
void alloc_params_buffer() {
@ -174,8 +175,8 @@ struct FluxModel : public DiffusionModel {
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
return flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx, skip_layers);
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, output, output_ctx, skip_layers);
}
};
#endif
#endif

@ -490,6 +490,7 @@ namespace Flux {
struct FluxParams {
int64_t in_channels = 64;
int64_t out_channels = 64;
int64_t vec_in_dim = 768;
int64_t context_in_dim = 4096;
int64_t hidden_size = 3072;
@ -642,8 +643,7 @@ namespace Flux {
Flux() {}
Flux(FluxParams params)
: params(params) {
int64_t out_channels = params.in_channels;
int64_t pe_dim = params.hidden_size / params.num_heads;
int64_t pe_dim = params.hidden_size / params.num_heads;
blocks["img_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
blocks["time_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
@ -669,7 +669,7 @@ namespace Flux {
params.flash_attn));
}
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new LastLayer(params.hidden_size, 1, out_channels));
blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new LastLayer(params.hidden_size, 1, params.out_channels));
}
struct ggml_tensor* patchify(struct ggml_context* ctx,
@ -789,6 +789,7 @@ namespace Flux {
struct ggml_tensor* x,
struct ggml_tensor* timestep,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
struct ggml_tensor* pe,
@ -797,6 +798,7 @@ namespace Flux {
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
// timestep: (N,) tensor of diffusion timesteps
// context: (N, L, D)
// c_concat: NULL, or (N, C+M, H, W) for Fill
// y: (N, adm_in_channels) tensor of class labels
// guidance: (N,)
// pe: (L, d_head/2, 2, 2)
@ -806,6 +808,7 @@ namespace Flux {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
int64_t C = x->ne[2];
int64_t patch_size = 2;
int pad_h = (patch_size - H % patch_size) % patch_size;
int pad_w = (patch_size - W % patch_size) % patch_size;
@ -814,6 +817,19 @@ namespace Flux {
// img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size]
if (c_concat != NULL) {
ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0);
mask = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0);
masked = patchify(ctx, masked, patch_size);
mask = patchify(ctx, mask, patch_size);
img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
}
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, skip_layers); // [N, h*w, C * patch_size * patch_size]
// rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
@ -834,12 +850,16 @@ namespace Flux {
FluxRunner(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
const std::string prefix = "",
SDVersion version = VERSION_FLUX,
bool flash_attn = false)
: GGMLRunner(backend) {
flux_params.flash_attn = flash_attn;
flux_params.guidance_embed = false;
flux_params.depth = 0;
flux_params.depth_single_blocks = 0;
if (version == VERSION_FLUX_FILL) {
flux_params.in_channels = 384;
}
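// 384 appears to decompose as 64 (noisy latent: 16 channels x 2x2 patch)
// + 64 (masked-image latent) + 256 (the 8*8 = 64 mask channels after 2x2
// patchify), matching the c_concat concatenation built in Flux::forward above.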
for (auto pair : tensor_types) {
std::string tensor_name = pair.first;
if (tensor_name.find("model.diffusion_model.") == std::string::npos)
@ -886,14 +906,18 @@ namespace Flux {
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
std::vector<int> skip_layers = std::vector<int>()) {
GGML_ASSERT(x->ne[3] == 1);
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
x = to_backend(x);
context = to_backend(context);
x = to_backend(x);
context = to_backend(context);
if (c_concat != NULL) {
c_concat = to_backend(c_concat);
}
y = to_backend(y);
timesteps = to_backend(timesteps);
if (flux_params.guidance_embed) {
@ -913,6 +937,7 @@ namespace Flux {
x,
timesteps,
context,
c_concat,
y,
guidance,
pe,
@ -927,6 +952,7 @@ namespace Flux {
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
struct ggml_tensor* guidance,
struct ggml_tensor** output = NULL,
@ -938,7 +964,7 @@ namespace Flux {
// y: [N, adm_in_channels] or [1, adm_in_channels]
// guidance: [N, ]
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, timesteps, context, y, guidance, skip_layers);
return build_graph(x, timesteps, context, c_concat, y, guidance, skip_layers);
};
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
@ -978,7 +1004,7 @@ namespace Flux {
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
compute(8, x, timesteps, context, y, guidance, &out, work_ctx);
compute(8, x, timesteps, context, NULL, y, guidance, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);
@ -1019,4 +1045,4 @@ namespace Flux {
} // namespace Flux
#endif // __FLUX_HPP__
#endif // __FLUX_HPP__

@ -52,6 +52,71 @@
#define __STATIC_INLINE__ static inline
#endif
// n-mode tensor-matrix product
// example: 2-mode product
// A: [ne03, k, ne01, ne00]
// B: k rows, m columns => [k, m]
// result is [ne03, m, ne01, ne00]
__STATIC_INLINE__ struct ggml_tensor* ggml_mul_n_mode(struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b, int mode = 0) {
// reshape A
// swap 0th and nth axis
a = ggml_cont(ctx, ggml_permute(ctx, a, mode, mode != 1 ? 1 : 0, mode != 2 ? 2 : 0, mode != 3 ? 3 : 0));
int ne1 = a->ne[1];
int ne2 = a->ne[2];
int ne3 = a->ne[3];
// make 2D
a = ggml_cont(ctx, ggml_reshape_2d(ctx, a, a->ne[0], (ne3 * ne2 * ne1)));
struct ggml_tensor* result = ggml_cont(ctx, ggml_transpose(ctx, ggml_mul_mat(ctx, a, b)));
// reshape output (same shape as a after permutation except first dim)
result = ggml_reshape_4d(ctx, result, result->ne[0], ne1, ne2, ne3);
// swap back 0th and nth axis
result = ggml_permute(ctx, result, mode, mode != 1 ? 1 : 0, mode != 2 ? 2 : 0, mode != 3 ? 3 : 0);
return result;
}
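For reference, the mode-n product above written out with plain loops (independent of ggml; shapes are arbitrary example values):

#include <cstdio>
#include <vector>

// Plain-loop illustration of a mode-1 tensor-matrix product, the math behind
// ggml_mul_n_mode above. T has shape (I, J, K), M has shape (J, R), and the
// result has shape (I, R, K): out[i][r][k] = sum_j T[i][j][k] * M[j][r]
int main() {
    const int I = 2, J = 3, K = 2, R = 4;
    std::vector<float> T(I * J * K, 1.0f);  // dummy data
    std::vector<float> M(J * R, 0.5f);      // dummy data
    std::vector<float> out(I * R * K, 0.0f);
    for (int i = 0; i < I; i++)
        for (int r = 0; r < R; r++)
            for (int k = 0; k < K; k++) {
                float acc = 0.0f;
                for (int j = 0; j < J; j++)
                    acc += T[(i * J + j) * K + k] * M[j * R + r];
                out[(i * R + r) * K + k] = acc;
            }
    printf("out[0] = %f\n", out[0]);  // 3 terms of 1.0 * 0.5 = 1.5
    return 0;
}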
__STATIC_INLINE__ struct ggml_tensor* ggml_merge_lora(ggml_context* ctx, struct ggml_tensor* lora_down, struct ggml_tensor* lora_up, struct ggml_tensor* lora_mid = NULL) {
struct ggml_tensor* updown;
// flatten lora tensors so they can be multiplied
int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1];
lora_up = ggml_reshape_2d(ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
auto lora_down_n_dims = ggml_n_dims(lora_down);
// assume n_dims should always be a multiple of 2 (otherwise rank 1 doesn't work)
lora_down_n_dims = (lora_down_n_dims + lora_down_n_dims % 2);
int64_t lora_down_rows = lora_down->ne[lora_down_n_dims - 1];
lora_down = ggml_reshape_2d(ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
// ggml_mul_mat requires tensor b transposed
lora_down = ggml_cont(ctx, ggml_transpose(ctx, lora_down));
if (lora_mid == NULL) {
updown = ggml_mul_mat(ctx, lora_up, lora_down);
updown = ggml_cont(ctx, ggml_transpose(ctx, updown));
} else {
// undoing tucker decomposition for conv layers.
// lora_mid has shape (3, 3, Rank, Rank)
// lora_down has shape (Rank, In, 1, 1)
// lora_up has shape (Rank, Out, 1, 1)
// conv layer shape is (3, 3, Out, In)
updown = ggml_mul_n_mode(ctx, ggml_mul_n_mode(ctx, lora_mid, lora_down, 3), lora_up, 2);
updown = ggml_cont(ctx, updown);
}
return updown;
}
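Stripped of the Tucker/conv path, the merge above computes the usual low-rank update delta_W = up * down, which the LoRA code later scales by alpha / rank (see the scale_value computation in lora.hpp below). A standalone sketch with made-up shapes:

#include <cstdio>
#include <vector>

// Standalone sketch of the basic LoRA merge math (no lora_mid / Tucker case):
//   delta_W[o][i] = (alpha / rank) * sum_r up[o][r] * down[r][i]
int main() {
    const int out_dim = 4, in_dim = 6, rank = 2;
    const float alpha = 2.0f;                      // example alpha
    std::vector<float> up(out_dim * rank, 0.1f);   // dummy data
    std::vector<float> down(rank * in_dim, 0.2f);  // dummy data
    std::vector<float> delta(out_dim * in_dim, 0.0f);
    float scale = alpha / rank;  // same alpha / rank scaling as in lora.hpp
    for (int o = 0; o < out_dim; o++)
        for (int i = 0; i < in_dim; i++) {
            float acc = 0.0f;
            for (int r = 0; r < rank; r++)
                acc += up[o * rank + r] * down[r * in_dim + i];
            delta[o * in_dim + i] = scale * acc;
        }
    printf("delta[0] = %f\n", delta[0]);  // 1.0 * (2 * 0.1 * 0.2) = 0.04
    return 0;
}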
// Kronecker product
// [ne03,ne02,ne01,ne00] x [ne13,ne12,ne11,ne10] => [ne03*ne13,ne02*ne12,ne01*ne11,ne00*ne10]
__STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b) {
return ggml_mul(ctx,
ggml_upscale_ext(ctx,
a,
a->ne[0] * b->ne[0],
a->ne[1] * b->ne[1],
a->ne[2] * b->ne[2],
a->ne[3] * b->ne[3]),
b);
}
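And the Kronecker product written out directly; ggml_kronecker above composes the same operation from ggml_upscale_ext and ggml_mul. A standalone 2x2 example:

#include <cstdio>
#include <vector>

// kron(A, B)[i*P + p][j*Q + q] = A[i][j] * B[p][q]
int main() {
    const int N = 2, M = 2;  // A is N x M
    const int P = 2, Q = 2;  // B is P x Q
    float A[N][M] = {{1, 2}, {3, 4}};
    float B[P][Q] = {{0, 5}, {6, 7}};
    std::vector<float> K((N * P) * (M * Q), 0.0f);
    for (int i = 0; i < N; i++)
        for (int j = 0; j < M; j++)
            for (int p = 0; p < P; p++)
                for (int q = 0; q < Q; q++)
                    K[(i * P + p) * (M * Q) + (j * Q + q)] = A[i][j] * B[p][q];
    printf("K[0][1] = %f\n", K[1]);  // A[0][0] * B[0][1] = 5
    return 0;
}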
__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) {
(void)level;
(void)user_data;
@ -290,6 +355,44 @@ __STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data,
}
}
__STATIC_INLINE__ void sd_mask_to_tensor(const uint8_t* image_data,
struct ggml_tensor* output,
bool scale = true) {
int64_t width = output->ne[0];
int64_t height = output->ne[1];
int64_t channels = output->ne[2];
GGML_ASSERT(channels == 1 && output->type == GGML_TYPE_F32);
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
float value = *(image_data + iy * width * channels + ix);
if (scale) {
value /= 255.f;
}
ggml_tensor_set_f32(output, value, ix, iy);
}
}
}
__STATIC_INLINE__ void sd_apply_mask(struct ggml_tensor* image_data,
struct ggml_tensor* mask,
struct ggml_tensor* output) {
int64_t width = output->ne[0];
int64_t height = output->ne[1];
int64_t channels = output->ne[2];
GGML_ASSERT(output->type == GGML_TYPE_F32);
for (int ix = 0; ix < width; ix++) {
for (int iy = 0; iy < height; iy++) {
float m = ggml_tensor_get_f32(mask, ix, iy);
m = round(m); // inpaint models need binary masks
ggml_tensor_set_f32(mask, m, ix, iy);
for (int k = 0; k < channels; k++) {
float value = (1 - m) * (ggml_tensor_get_f32(image_data, ix, iy, k) - .5) + .5;
ggml_tensor_set_f32(output, value, ix, iy, k);
}
}
}
}
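The masking rule in sd_apply_mask keeps pixels where the binarized mask is 0 and pushes masked pixels to 0.5, presumably the mid-gray that maps to zero once the image is later shifted into [-1, 1]. A standalone sketch over flat buffers:

#include <cmath>
#include <cstdio>
#include <vector>

// out = (1 - round(m)) * (pixel - 0.5) + 0.5
// unmasked pixels pass through, masked pixels become 0.5
int main() {
    std::vector<float> image = {0.1f, 0.9f, 0.4f, 0.7f};  // values in [0, 1]
    std::vector<float> mask  = {0.0f, 0.2f, 0.8f, 1.0f};  // soft mask
    for (size_t i = 0; i < image.size(); i++) {
        float m = std::round(mask[i]);  // binarize, as sd_apply_mask does
        float out = (1.0f - m) * (image[i] - 0.5f) + 0.5f;
        printf("pixel %zu: %.2f -> %.2f (mask %.0f)\n", i, image[i], out, m);
    }
    return 0;
}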
__STATIC_INLINE__ void sd_mul_images_to_tensor(const uint8_t* image_data,
struct ggml_tensor* output,
int idx,
@ -951,8 +1054,8 @@ __STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) {
}
/* SDXL with LoRA requires more space */
#define MAX_PARAMS_TENSOR_NUM 15360
#define MAX_GRAPH_SIZE 15360
#define MAX_PARAMS_TENSOR_NUM 32768
#define MAX_GRAPH_SIZE 32768
struct GGMLRunner {
protected:
@ -1533,4 +1636,4 @@ public:
}
};
#endif // __GGML_EXTEND__HPP__
#endif // __GGML_EXTEND__HPP__

@ -329,21 +329,21 @@ const std::vector<std::vector<float>> GITS_NOISE_1_50 = {
};
const std::vector<const std::vector<std::vector<float>>*> GITS_NOISE = {
{ &GITS_NOISE_0_80 },
{ &GITS_NOISE_0_85 },
{ &GITS_NOISE_0_90 },
{ &GITS_NOISE_0_95 },
{ &GITS_NOISE_1_00 },
{ &GITS_NOISE_1_05 },
{ &GITS_NOISE_1_10 },
{ &GITS_NOISE_1_15 },
{ &GITS_NOISE_1_20 },
{ &GITS_NOISE_1_25 },
{ &GITS_NOISE_1_30 },
{ &GITS_NOISE_1_35 },
{ &GITS_NOISE_1_40 },
{ &GITS_NOISE_1_45 },
{ &GITS_NOISE_1_50 }
&GITS_NOISE_0_80,
&GITS_NOISE_0_85,
&GITS_NOISE_0_90,
&GITS_NOISE_0_95,
&GITS_NOISE_1_00,
&GITS_NOISE_1_05,
&GITS_NOISE_1_10,
&GITS_NOISE_1_15,
&GITS_NOISE_1_20,
&GITS_NOISE_1_25,
&GITS_NOISE_1_30,
&GITS_NOISE_1_35,
&GITS_NOISE_1_40,
&GITS_NOISE_1_45,
&GITS_NOISE_1_50
};
#endif // GITS_NOISE_INL
#endif // GITS_NOISE_INL
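With the stray braces removed, GITS_NOISE is a flat array of 15 table pointers covering coefficients 0.80 through 1.50 in steps of 0.05. A hypothetical lookup, illustrative only and not the scheduler's actual indexing code:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Hypothetical index computation for a 15-entry table spanning 0.80 .. 1.50
// in 0.05 increments (matching the GITS_NOISE_* names above).
int gits_table_index(float coeff) {
    int idx = (int)std::lround((coeff - 0.80f) / 0.05f);
    return std::min(std::max(idx, 0), 14);  // clamp into the table
}

int main() {
    printf("%d %d %d\n", gits_table_index(0.80f), gits_table_index(1.20f), gits_table_index(1.50f));  // 0 8 14
    return 0;
}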

@ -6,6 +6,90 @@
#define LORA_GRAPH_SIZE 10240
struct LoraModel : public GGMLRunner {
enum lora_t {
REGULAR = 0,
DIFFUSERS = 1,
DIFFUSERS_2 = 2,
DIFFUSERS_3 = 3,
TRANSFORMERS = 4,
LORA_TYPE_COUNT
};
const std::string lora_ups[LORA_TYPE_COUNT] = {
".lora_up",
"_lora.up",
".lora_B",
".lora.up",
".lora_linear_layer.up",
};
const std::string lora_downs[LORA_TYPE_COUNT] = {
".lora_down",
"_lora.down",
".lora_A",
".lora.down",
".lora_linear_layer.down",
};
const std::string lora_pre[LORA_TYPE_COUNT] = {
"lora.",
"",
"",
"",
"",
};
const std::map<std::string, std::string> alt_names = {
// mmdit
{"final_layer.adaLN_modulation.1", "norm_out.linear"},
{"pos_embed", "pos_embed.proj"},
{"final_layer.linear", "proj_out"},
{"y_embedder.mlp.0", "time_text_embed.text_embedder.linear_1"},
{"y_embedder.mlp.2", "time_text_embed.text_embedder.linear_2"},
{"t_embedder.mlp.0", "time_text_embed.timestep_embedder.linear_1"},
{"t_embedder.mlp.2", "time_text_embed.timestep_embedder.linear_2"},
{"x_block.mlp.fc1", "ff.net.0.proj"},
{"x_block.mlp.fc2", "ff.net.2"},
{"context_block.mlp.fc1", "ff_context.net.0.proj"},
{"context_block.mlp.fc2", "ff_context.net.2"},
{"x_block.adaLN_modulation.1", "norm1.linear"},
{"context_block.adaLN_modulation.1", "norm1_context.linear"},
{"context_block.attn.proj", "attn.to_add_out"},
{"x_block.attn.proj", "attn.to_out.0"},
{"x_block.attn2.proj", "attn2.to_out.0"},
// flux
// singlestream
{"linear2", "proj_out"},
{"modulation.lin", "norm.linear"},
// doublestream
{"txt_attn.proj", "attn.to_add_out"},
{"img_attn.proj", "attn.to_out.0"},
{"txt_mlp.0", "ff_context.net.0.proj"},
{"txt_mlp.2", "ff_context.net.2"},
{"img_mlp.0", "ff.net.0.proj"},
{"img_mlp.2", "ff.net.2"},
{"txt_mod.lin", "norm1_context.linear"},
{"img_mod.lin", "norm1.linear"},
};
const std::map<std::string, std::string> qkv_prefixes = {
// mmdit
{"context_block.attn.qkv", "attn.add_"}, // suffix "_proj"
{"x_block.attn.qkv", "attn.to_"},
{"x_block.attn2.qkv", "attn2.to_"},
// flux
// doublestream
{"txt_attn.qkv", "attn.add_"}, // suffix "_proj"
{"img_attn.qkv", "attn.to_"},
};
const std::map<std::string, std::string> qkvm_prefixes = {
// flux
// singlestream
{"linear1", ""},
};
const std::string* type_fingerprints = lora_ups;
float multiplier = 1.0f;
std::map<std::string, struct ggml_tensor*> lora_tensors;
std::string file_path;
@ -14,6 +98,7 @@ struct LoraModel : public GGMLRunner {
bool applied = false;
std::vector<int> zero_index_vec = {0};
ggml_tensor* zero_index = NULL;
enum lora_t type = REGULAR;
LoraModel(ggml_backend_t backend,
const std::string& file_path = "",
@ -44,6 +129,13 @@ struct LoraModel : public GGMLRunner {
// LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
return true;
}
// LOG_INFO("%s", name.c_str());
for (int i = 0; i < LORA_TYPE_COUNT; i++) {
if (name.find(type_fingerprints[i]) != std::string::npos) {
type = (lora_t)i;
break;
}
}
if (dry_run) {
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
@ -61,10 +153,12 @@ struct LoraModel : public GGMLRunner {
model_loader.load_tensors(on_new_tensor_cb, backend);
alloc_params_buffer();
// exit(0);
dry_run = false;
model_loader.load_tensors(on_new_tensor_cb, backend);
LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());
LOG_DEBUG("finished loaded lora");
return true;
}
@ -76,7 +170,74 @@ struct LoraModel : public GGMLRunner {
return out;
}
struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
std::vector<std::string> to_lora_keys(std::string blk_name, SDVersion version) {
std::vector<std::string> keys;
// if (!sd_version_is_sd3(version) || blk_name != "model.diffusion_model.pos_embed") {
size_t k_pos = blk_name.find(".weight");
if (k_pos == std::string::npos) {
return keys;
}
blk_name = blk_name.substr(0, k_pos);
// }
keys.push_back(blk_name);
keys.push_back("lora." + blk_name);
if (sd_version_is_dit(version)) {
if (blk_name.find("model.diffusion_model") != std::string::npos) {
blk_name.replace(blk_name.find("model.diffusion_model"), sizeof("model.diffusion_model") - 1, "transformer");
}
if (blk_name.find(".single_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".single_blocks"), sizeof(".single_blocks") - 1, ".single_transformer_blocks");
}
if (blk_name.find(".double_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".double_blocks"), sizeof(".double_blocks") - 1, ".transformer_blocks");
}
if (blk_name.find(".joint_blocks") != std::string::npos) {
blk_name.replace(blk_name.find(".joint_blocks"), sizeof(".joint_blocks") - 1, ".transformer_blocks");
}
if (blk_name.find("text_encoders.clip_l") != std::string::npos) {
blk_name.replace(blk_name.find("text_encoders.clip_l"), sizeof("text_encoders.clip_l") - 1, "cond_stage_model");
}
for (const auto& item : alt_names) {
size_t match = blk_name.find(item.first);
if (match != std::string::npos) {
blk_name = blk_name.substr(0, match) + item.second;
}
}
for (const auto& prefix : qkv_prefixes) {
size_t match = blk_name.find(prefix.first);
if (match != std::string::npos) {
std::string split_blk = "SPLIT|" + blk_name.substr(0, match) + prefix.second;
keys.push_back(split_blk);
}
}
for (const auto& prefix : qkvm_prefixes) {
size_t match = blk_name.find(prefix.first);
if (match != std::string::npos) {
std::string split_blk = "SPLIT_L|" + blk_name.substr(0, match) + prefix.second;
keys.push_back(split_blk);
}
}
keys.push_back(blk_name);
}
std::vector<std::string> ret;
for (std::string& key : keys) {
ret.push_back(key);
replace_all_chars(key, '.', '_');
// fix for some sdxl lora, like lcm-lora-xl
if (key == "model_diffusion_model_output_blocks_2_2_conv") {
ret.push_back("model_diffusion_model_output_blocks_2_1_conv");
}
ret.push_back(key);
}
return ret;
}
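To illustrate the kind of renaming to_lora_keys performs, a minimal string-remap sketch; the key and the three renames below are examples taken from the alt_names / prefix handling above, not an exhaustive copy:

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main() {
    std::string key = "model.diffusion_model.single_blocks.7.linear2.weight";
    const std::vector<std::pair<std::string, std::string>> renames = {
        {"model.diffusion_model", "transformer"},
        {".single_blocks", ".single_transformer_blocks"},
        {"linear2", "proj_out"},
    };
    key = key.substr(0, key.find(".weight"));  // strip the ".weight" suffix
    for (const auto& r : renames) {
        size_t pos = key.find(r.first);
        if (pos != std::string::npos)
            key.replace(pos, r.first.size(), r.second);
    }
    printf("%s\n", key.c_str());  // transformer.single_transformer_blocks.7.proj_out
    return 0;
}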
struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);
zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
@ -88,93 +249,574 @@ struct LoraModel : public GGMLRunner {
std::string k_tensor = it.first;
struct ggml_tensor* weight = model_tensors[it.first];
size_t k_pos = k_tensor.find(".weight");
if (k_pos == std::string::npos) {
std::vector<std::string> keys = to_lora_keys(k_tensor, version);
if (keys.size() == 0)
continue;
}
k_tensor = k_tensor.substr(0, k_pos);
replace_all_chars(k_tensor, '.', '_');
// LOG_DEBUG("k_tensor %s", k_tensor.c_str());
std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
if (lora_tensors.find(lora_up_name) == lora_tensors.end()) {
if (k_tensor == "model_diffusion_model_output_blocks_2_2_conv") {
// fix for some sdxl lora, like lcm-lora-xl
k_tensor = "model_diffusion_model_output_blocks_2_1_conv";
lora_up_name = "lora." + k_tensor + ".lora_up.weight";
for (auto& key : keys) {
bool is_qkv_split = starts_with(key, "SPLIT|");
if (is_qkv_split) {
key = key.substr(sizeof("SPLIT|") - 1);
}
bool is_qkvm_split = starts_with(key, "SPLIT_L|");
if (is_qkvm_split) {
key = key.substr(sizeof("SPLIT_L|") - 1);
}
struct ggml_tensor* updown = NULL;
float scale_value = 1.0f;
std::string fk = lora_pre[type] + key;
if (lora_tensors.find(fk + ".hada_w1_a") != lora_tensors.end()) {
// LoHa mode
// TODO: split qkv convention for LoHas (is it ever used?)
if (is_qkv_split || is_qkvm_split) {
LOG_ERROR("Split qkv isn't supported for LoHa models.");
break;
}
std::string alpha_name = "";
ggml_tensor* hada_1_mid = NULL; // tau for tucker decomposition
ggml_tensor* hada_1_up = NULL;
ggml_tensor* hada_1_down = NULL;
ggml_tensor* hada_2_mid = NULL; // tau for tucker decomposition
ggml_tensor* hada_2_up = NULL;
ggml_tensor* hada_2_down = NULL;
std::string hada_1_mid_name = "";
std::string hada_1_down_name = "";
std::string hada_1_up_name = "";
std::string hada_2_mid_name = "";
std::string hada_2_down_name = "";
std::string hada_2_up_name = "";
hada_1_down_name = fk + ".hada_w1_b";
hada_1_up_name = fk + ".hada_w1_a";
hada_1_mid_name = fk + ".hada_t1";
if (lora_tensors.find(hada_1_down_name) != lora_tensors.end()) {
hada_1_down = to_f32(compute_ctx, lora_tensors[hada_1_down_name]);
}
if (lora_tensors.find(hada_1_up_name) != lora_tensors.end()) {
hada_1_up = to_f32(compute_ctx, lora_tensors[hada_1_up_name]);
}
if (lora_tensors.find(hada_1_mid_name) != lora_tensors.end()) {
hada_1_mid = to_f32(compute_ctx, lora_tensors[hada_1_mid_name]);
applied_lora_tensors.insert(hada_1_mid_name);
hada_1_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_1_up));
}
hada_2_down_name = fk + ".hada_w2_b";
hada_2_up_name = fk + ".hada_w2_a";
hada_2_mid_name = fk + ".hada_t2";
if (lora_tensors.find(hada_2_down_name) != lora_tensors.end()) {
hada_2_down = to_f32(compute_ctx, lora_tensors[hada_2_down_name]);
}
if (lora_tensors.find(hada_2_up_name) != lora_tensors.end()) {
hada_2_up = to_f32(compute_ctx, lora_tensors[hada_2_up_name]);
}
if (lora_tensors.find(hada_2_mid_name) != lora_tensors.end()) {
hada_2_mid = to_f32(compute_ctx, lora_tensors[hada_2_mid_name]);
applied_lora_tensors.insert(hada_2_mid_name);
hada_2_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_2_up));
}
alpha_name = fk + ".alpha";
applied_lora_tensors.insert(hada_1_down_name);
applied_lora_tensors.insert(hada_1_up_name);
applied_lora_tensors.insert(hada_2_down_name);
applied_lora_tensors.insert(hada_2_up_name);
applied_lora_tensors.insert(alpha_name);
if (hada_1_up == NULL || hada_1_down == NULL || hada_2_up == NULL || hada_2_down == NULL) {
continue;
}
struct ggml_tensor* updown_1 = ggml_merge_lora(compute_ctx, hada_1_down, hada_1_up, hada_1_mid);
struct ggml_tensor* updown_2 = ggml_merge_lora(compute_ctx, hada_2_down, hada_2_up, hada_2_mid);
updown = ggml_mul_inplace(compute_ctx, updown_1, updown_2);
// calc_scale
// TODO: .dora_scale?
int64_t rank = hada_1_down->ne[ggml_n_dims(hada_1_down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
} else if (lora_tensors.find(fk + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(fk + ".lokr_w1_a") != lora_tensors.end()) {
// LoKr mode
// TODO: split qkv convention for LoKrs (is it ever used?)
if (is_qkv_split || is_qkvm_split) {
LOG_ERROR("Split qkv isn't supported for LoKr models.");
break;
}
std::string alpha_name = fk + ".alpha";
ggml_tensor* lokr_w1 = NULL;
ggml_tensor* lokr_w2 = NULL;
std::string lokr_w1_name = "";
std::string lokr_w2_name = "";
lokr_w1_name = fk + ".lokr_w1";
lokr_w2_name = fk + ".lokr_w2";
if (lora_tensors.find(lokr_w1_name) != lora_tensors.end()) {
lokr_w1 = to_f32(compute_ctx, lora_tensors[lokr_w1_name]);
applied_lora_tensors.insert(lokr_w1_name);
} else {
ggml_tensor* down = NULL;
ggml_tensor* up = NULL;
std::string down_name = lokr_w1_name + "_b";
std::string up_name = lokr_w1_name + "_a";
if (lora_tensors.find(down_name) != lora_tensors.end()) {
// w1 should not be low rank normally, sometimes w1 and w2 are swapped
down = to_f32(compute_ctx, lora_tensors[down_name]);
applied_lora_tensors.insert(down_name);
int64_t rank = down->ne[ggml_n_dims(down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
}
if (lora_tensors.find(up_name) != lora_tensors.end()) {
up = to_f32(compute_ctx, lora_tensors[up_name]);
applied_lora_tensors.insert(up_name);
}
lokr_w1 = ggml_merge_lora(compute_ctx, down, up);
}
if (lora_tensors.find(lokr_w2_name) != lora_tensors.end()) {
lokr_w2 = to_f32(compute_ctx, lora_tensors[lokr_w2_name]);
applied_lora_tensors.insert(lokr_w2_name);
} else {
ggml_tensor* down = NULL;
ggml_tensor* up = NULL;
std::string down_name = lokr_w2_name + "_b";
std::string up_name = lokr_w2_name + "_a";
if (lora_tensors.find(down_name) != lora_tensors.end()) {
down = to_f32(compute_ctx, lora_tensors[down_name]);
applied_lora_tensors.insert(down_name);
int64_t rank = down->ne[ggml_n_dims(down) - 1];
if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
}
if (lora_tensors.find(up_name) != lora_tensors.end()) {
up = to_f32(compute_ctx, lora_tensors[up_name]);
applied_lora_tensors.insert(up_name);
}
lokr_w2 = ggml_merge_lora(compute_ctx, down, up);
}
// Technically it might be unused, but I believe it's the expected behavior
applied_lora_tensors.insert(alpha_name);
updown = ggml_kronecker(compute_ctx, lokr_w1, lokr_w2);
} else {
// LoRA mode
ggml_tensor* lora_mid = NULL; // tau for tucker decomposition
ggml_tensor* lora_up = NULL;
ggml_tensor* lora_down = NULL;
std::string alpha_name = "";
std::string scale_name = "";
std::string split_q_scale_name = "";
std::string lora_mid_name = "";
std::string lora_down_name = "";
std::string lora_up_name = "";
if (is_qkv_split) {
std::string suffix = "";
auto split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) == lora_tensors.end()) {
suffix = "_proj";
split_q_d_name = fk + "q" + suffix + lora_downs[type] + ".weight";
}
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = fk + "k" + suffix + lora_downs[type] + ".weight";
auto split_v_d_name = fk + "v" + suffix + lora_downs[type] + ".weight";
auto split_q_u_name = fk + "q" + suffix + lora_ups[type] + ".weight";
auto split_k_u_name = fk + "k" + suffix + lora_ups[type] + ".weight";
auto split_v_u_name = fk + "v" + suffix + lora_ups[type] + ".weight";
auto split_q_scale_name = fk + "q" + suffix + ".scale";
auto split_k_scale_name = fk + "k" + suffix + ".scale";
auto split_v_scale_name = fk + "v" + suffix + ".scale";
auto split_q_alpha_name = fk + "q" + suffix + ".alpha";
auto split_k_alpha_name = fk + "k" + suffix + ".alpha";
auto split_v_alpha_name = fk + "v" + suffix + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
ggml_tensor* lora_k_down = NULL;
ggml_tensor* lora_k_up = NULL;
ggml_tensor* lora_v_down = NULL;
ggml_tensor* lora_v_up = NULL;
lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]);
if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) {
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
}
if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) {
lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]);
}
if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) {
lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]);
}
if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) {
lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]);
}
if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) {
lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]);
}
float q_rank = lora_q_up->ne[0];
float k_rank = lora_k_up->ne[0];
float v_rank = lora_v_up->ne[0];
float lora_q_scale = 1;
float lora_k_scale = 1;
float lora_v_scale = 1;
if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) {
lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]);
applied_lora_tensors.insert(split_q_scale_name);
}
if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) {
lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]);
applied_lora_tensors.insert(split_k_scale_name);
}
if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) {
lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]);
applied_lora_tensors.insert(split_v_scale_name);
}
if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) {
float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]);
applied_lora_tensors.insert(split_q_alpha_name);
lora_q_scale = lora_q_alpha / q_rank;
}
if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) {
float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]);
applied_lora_tensors.insert(split_k_alpha_name);
lora_k_scale = lora_k_alpha / k_rank;
}
if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) {
float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]);
applied_lora_tensors.insert(split_v_alpha_name);
lora_v_scale = lora_v_alpha / v_rank;
}
ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale);
ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale);
ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale);
// print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1]
// these need to be stitched together this way:
// |q_up,0 ,0 |
// |0 ,k_up,0 |
// |0 ,0 ,v_up|
// (q_down,k_down,v_down) . (q ,k ,v)
// up_concat will be [9216, R*3, 1, 1]
// down_concat will be [R*3, 3072, 1, 1]
ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), lora_v_down, 1);
ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up);
ggml_scale(compute_ctx, z, 0);
ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1);
ggml_tensor* q_up = ggml_concat(compute_ctx, lora_q_up, zz, 1);
ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), z, 1);
ggml_tensor* v_up = ggml_concat(compute_ctx, zz, lora_v_up, 1);
// print_ggml_tensor(q_up, true); //[R, 9216, 1, 1]
// print_ggml_tensor(k_up, true); //[R, 9216, 1, 1]
// print_ggml_tensor(v_up, true); //[R, 9216, 1, 1]
ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), v_up, 0);
// print_ggml_tensor(lora_up_concat, true); //[R*3, 9216, 1, 1]
lora_down = ggml_cont(compute_ctx, lora_down_concat);
lora_up = ggml_cont(compute_ctx, lora_up_concat);
applied_lora_tensors.insert(split_q_u_name);
applied_lora_tensors.insert(split_k_u_name);
applied_lora_tensors.insert(split_v_u_name);
applied_lora_tensors.insert(split_q_d_name);
applied_lora_tensors.insert(split_k_d_name);
applied_lora_tensors.insert(split_v_d_name);
}
} else if (is_qkvm_split) {
auto split_q_d_name = fk + "attn.to_q" + lora_downs[type] + ".weight";
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
// print_ggml_tensor(it.second, true); //[3072, 21504, 1, 1]
// find qkv and mlp up parts in LoRA model
auto split_k_d_name = fk + "attn.to_k" + lora_downs[type] + ".weight";
auto split_v_d_name = fk + "attn.to_v" + lora_downs[type] + ".weight";
auto split_q_u_name = fk + "attn.to_q" + lora_ups[type] + ".weight";
auto split_k_u_name = fk + "attn.to_k" + lora_ups[type] + ".weight";
auto split_v_u_name = fk + "attn.to_v" + lora_ups[type] + ".weight";
auto split_m_d_name = fk + "proj_mlp" + lora_downs[type] + ".weight";
auto split_m_u_name = fk + "proj_mlp" + lora_ups[type] + ".weight";
auto split_q_scale_name = fk + "attn.to_q" + ".scale";
auto split_k_scale_name = fk + "attn.to_k" + ".scale";
auto split_v_scale_name = fk + "attn.to_v" + ".scale";
auto split_m_scale_name = fk + "proj_mlp" + ".scale";
auto split_q_alpha_name = fk + "attn.to_q" + ".alpha";
auto split_k_alpha_name = fk + "attn.to_k" + ".alpha";
auto split_v_alpha_name = fk + "attn.to_v" + ".alpha";
auto split_m_alpha_name = fk + "proj_mlp" + ".alpha";
ggml_tensor* lora_q_down = NULL;
ggml_tensor* lora_q_up = NULL;
ggml_tensor* lora_k_down = NULL;
ggml_tensor* lora_k_up = NULL;
ggml_tensor* lora_v_down = NULL;
ggml_tensor* lora_v_up = NULL;
ggml_tensor* lora_m_down = NULL;
ggml_tensor* lora_m_up = NULL;
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
if (lora_tensors.find(split_q_d_name) != lora_tensors.end()) {
lora_q_down = to_f32(compute_ctx, lora_tensors[split_q_d_name]);
}
if (lora_tensors.find(split_q_u_name) != lora_tensors.end()) {
lora_q_up = to_f32(compute_ctx, lora_tensors[split_q_u_name]);
}
if (lora_tensors.find(split_k_d_name) != lora_tensors.end()) {
lora_k_down = to_f32(compute_ctx, lora_tensors[split_k_d_name]);
}
if (lora_tensors.find(split_k_u_name) != lora_tensors.end()) {
lora_k_up = to_f32(compute_ctx, lora_tensors[split_k_u_name]);
}
if (lora_tensors.find(split_v_d_name) != lora_tensors.end()) {
lora_v_down = to_f32(compute_ctx, lora_tensors[split_v_d_name]);
}
if (lora_tensors.find(split_v_u_name) != lora_tensors.end()) {
lora_v_up = to_f32(compute_ctx, lora_tensors[split_v_u_name]);
}
if (lora_tensors.find(split_m_d_name) != lora_tensors.end()) {
lora_m_down = to_f32(compute_ctx, lora_tensors[split_m_d_name]);
}
if (lora_tensors.find(split_m_u_name) != lora_tensors.end()) {
lora_m_up = to_f32(compute_ctx, lora_tensors[split_m_u_name]);
}
float q_rank = lora_q_up->ne[0];
float k_rank = lora_k_up->ne[0];
float v_rank = lora_v_up->ne[0];
float m_rank = lora_v_up->ne[0];
float lora_q_scale = 1;
float lora_k_scale = 1;
float lora_v_scale = 1;
float lora_m_scale = 1;
if (lora_tensors.find(split_q_scale_name) != lora_tensors.end()) {
lora_q_scale = ggml_backend_tensor_get_f32(lora_tensors[split_q_scale_name]);
applied_lora_tensors.insert(split_q_scale_name);
}
if (lora_tensors.find(split_k_scale_name) != lora_tensors.end()) {
lora_k_scale = ggml_backend_tensor_get_f32(lora_tensors[split_k_scale_name]);
applied_lora_tensors.insert(split_k_scale_name);
}
if (lora_tensors.find(split_v_scale_name) != lora_tensors.end()) {
lora_v_scale = ggml_backend_tensor_get_f32(lora_tensors[split_v_scale_name]);
applied_lora_tensors.insert(split_v_scale_name);
}
if (lora_tensors.find(split_m_scale_name) != lora_tensors.end()) {
lora_m_scale = ggml_backend_tensor_get_f32(lora_tensors[split_m_scale_name]);
applied_lora_tensors.insert(split_m_scale_name);
}
if (lora_tensors.find(split_q_alpha_name) != lora_tensors.end()) {
float lora_q_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_q_alpha_name]);
applied_lora_tensors.insert(split_q_alpha_name);
lora_q_scale = lora_q_alpha / q_rank;
}
if (lora_tensors.find(split_k_alpha_name) != lora_tensors.end()) {
float lora_k_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_k_alpha_name]);
applied_lora_tensors.insert(split_k_alpha_name);
lora_k_scale = lora_k_alpha / k_rank;
}
if (lora_tensors.find(split_v_alpha_name) != lora_tensors.end()) {
float lora_v_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_v_alpha_name]);
applied_lora_tensors.insert(split_v_alpha_name);
lora_v_scale = lora_v_alpha / v_rank;
}
if (lora_tensors.find(split_m_alpha_name) != lora_tensors.end()) {
float lora_m_alpha = ggml_backend_tensor_get_f32(lora_tensors[split_m_alpha_name]);
applied_lora_tensors.insert(split_m_alpha_name);
lora_m_scale = lora_m_alpha / m_rank;
}
ggml_scale_inplace(compute_ctx, lora_q_down, lora_q_scale);
ggml_scale_inplace(compute_ctx, lora_k_down, lora_k_scale);
ggml_scale_inplace(compute_ctx, lora_v_down, lora_v_scale);
ggml_scale_inplace(compute_ctx, lora_m_down, lora_m_scale);
// print_ggml_tensor(lora_q_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_k_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_v_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_m_down, true); //[3072, R, 1, 1]
// print_ggml_tensor(lora_q_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_k_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_v_up, true); //[R, 3072, 1, 1]
// print_ggml_tensor(lora_m_up, true); //[R, 12288, 1, 1]
// these need to be stitched together this way:
// |q_up,0 ,0 ,0 |
// |0 ,k_up,0 ,0 |
// |0 ,0 ,v_up,0 |
// |0 ,0 ,0 ,m_up|
// (q_down,k_down,v_down,m_down) . (q ,k ,v ,m)
// up_concat will be [21504, R*4, 1, 1]
// down_concat will be [R*4, 3072, 1, 1]
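// roughly (a sketch, not exact ggml shapes): delta_W = blockdiag(q_up, k_up, v_up, m_up) . stack(q_down, k_down, v_down, m_down),
// which matches applying the four LoRAs to their slices of the fused qkv+mlp weight in one matmul;
// the z / zz / mlp_z tensors below provide the zero off-diagonal blocks.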
ggml_tensor* lora_down_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_down, lora_k_down, 1), ggml_concat(compute_ctx, lora_v_down, lora_m_down, 1), 1);
// print_ggml_tensor(lora_down_concat, true); //[3072, R*4, 1, 1]
// this also means that if rank is bigger than 672, it is less memory efficient to do it this way (should be fine)
// print_ggml_tensor(lora_q_up, true); //[3072, R, 1, 1]
ggml_tensor* z = ggml_dup_tensor(compute_ctx, lora_q_up);
ggml_tensor* mlp_z = ggml_dup_tensor(compute_ctx, lora_m_up);
ggml_scale(compute_ctx, z, 0);
ggml_scale(compute_ctx, mlp_z, 0);
ggml_tensor* zz = ggml_concat(compute_ctx, z, z, 1);
ggml_tensor* q_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, lora_q_up, zz, 1), mlp_z, 1);
ggml_tensor* k_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, z, lora_k_up, 1), ggml_concat(compute_ctx, z, mlp_z, 1), 1);
ggml_tensor* v_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, lora_v_up, 1), mlp_z, 1);
ggml_tensor* m_up = ggml_concat(compute_ctx, ggml_concat(compute_ctx, zz, z, 1), lora_m_up, 1);
// print_ggml_tensor(q_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(k_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(v_up, true); //[R, 21504, 1, 1]
// print_ggml_tensor(m_up, true); //[R, 21504, 1, 1]
ggml_tensor* lora_up_concat = ggml_concat(compute_ctx, ggml_concat(compute_ctx, q_up, k_up, 0), ggml_concat(compute_ctx, v_up, m_up, 0), 0);
// print_ggml_tensor(lora_up_concat, true); //[R*4, 21504, 1, 1]
lora_down = ggml_cont(compute_ctx, lora_down_concat);
lora_up = ggml_cont(compute_ctx, lora_up_concat);
applied_lora_tensors.insert(split_q_u_name);
applied_lora_tensors.insert(split_k_u_name);
applied_lora_tensors.insert(split_v_u_name);
applied_lora_tensors.insert(split_m_u_name);
applied_lora_tensors.insert(split_q_d_name);
applied_lora_tensors.insert(split_k_d_name);
applied_lora_tensors.insert(split_v_d_name);
applied_lora_tensors.insert(split_m_d_name);
}
} else {
lora_up_name = fk + lora_ups[type] + ".weight";
lora_down_name = fk + lora_downs[type] + ".weight";
lora_mid_name = fk + ".lora_mid.weight";
alpha_name = fk + ".alpha";
scale_name = fk + ".scale";
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = to_f32(compute_ctx, lora_tensors[lora_up_name]);
}
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
lora_down = to_f32(compute_ctx, lora_tensors[lora_down_name]);
}
if (lora_tensors.find(lora_mid_name) != lora_tensors.end()) {
lora_mid = to_f32(compute_ctx, lora_tensors[lora_mid_name]);
applied_lora_tensors.insert(lora_mid_name);
}
applied_lora_tensors.insert(lora_up_name);
applied_lora_tensors.insert(lora_down_name);
applied_lora_tensors.insert(alpha_name);
applied_lora_tensors.insert(scale_name);
}
if (lora_up == NULL || lora_down == NULL) {
continue;
}
// calc_scale
// TODO: .dora_scale?
int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1];
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / rank;
}
updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
}
scale_value *= multiplier;
updown = ggml_reshape(compute_ctx, updown, weight);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
ggml_tensor* final_weight;
if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
// final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
// final_weight = ggml_cpy(compute_ctx, weight, final_weight);
final_weight = to_f32(compute_ctx, weight);
final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
final_weight = ggml_cpy(compute_ctx, final_weight, weight);
} else {
final_weight = ggml_add_inplace(compute_ctx, weight, updown);
}
// final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
ggml_build_forward_expand(gf, final_weight);
break;
}
std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
std::string alpha_name = "lora." + k_tensor + ".alpha";
std::string scale_name = "lora." + k_tensor + ".scale";
ggml_tensor* lora_up = NULL;
ggml_tensor* lora_down = NULL;
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = lora_tensors[lora_up_name];
}
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
lora_down = lora_tensors[lora_down_name];
}
if (lora_up == NULL || lora_down == NULL) {
continue;
}
applied_lora_tensors.insert(lora_up_name);
applied_lora_tensors.insert(lora_down_name);
applied_lora_tensors.insert(alpha_name);
applied_lora_tensors.insert(scale_name);
// calc_scale
int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1];
float scale_value = 1.0f;
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / dim;
}
scale_value *= multiplier;
// flatten lora tensors so they can be multiplied
int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1];
lora_up = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
auto lora_down_n_dims = ggml_n_dims(lora_down);
// round n_dims up to a multiple of 2 (otherwise rank-1 LoRA tensors do not work)
lora_down_n_dims = (lora_down_n_dims + lora_down_n_dims % 2);
int64_t lora_down_rows = lora_down->ne[lora_down_n_dims - 1];
// ggml_mul_mat requires tensor b transposed
lora_down = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down));
struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down);
updown = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown));
updown = ggml_reshape(compute_ctx, updown, weight);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
ggml_tensor* final_weight;
if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
// final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
// final_weight = ggml_cpy(compute_ctx, weight, final_weight);
final_weight = to_f32(compute_ctx, weight);
final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
final_weight = ggml_cpy(compute_ctx, final_weight, weight);
} else {
final_weight = ggml_add_inplace(compute_ctx, weight, updown);
}
// final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
ggml_build_forward_expand(gf, final_weight);
}
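// Net effect of either path above (loosely): W' = W + multiplier * scale * (lora_up x lora_down),
// with scale defaulting to 1 and otherwise taken from the ".scale" tensor or computed as alpha / rank.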
size_t total_lora_tensors_count = 0;
size_t applied_lora_tensors_count = 0;
for (auto& kv : lora_tensors) {
total_lora_tensors_count++;
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
LOG_WARN("unused lora tensor %s", kv.first.c_str());
LOG_WARN("unused lora tensor |%s|", kv.first.c_str());
print_ggml_tensor(kv.second, true);
// exit(0);
} else {
applied_lora_tensors_count++;
}
@ -193,12 +835,12 @@ struct LoraModel : public GGMLRunner {
return gf;
}
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version, int n_threads) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_lora_graph(model_tensors);
return build_lora_graph(model_tensors, version);
};
GGMLRunner::compute(get_graph, n_threads, true);
}
};
#endif // __LORA_HPP__
#endif // __LORA_HPP__

View file

@ -11,6 +11,7 @@
#include "stable-diffusion.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_STATIC
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
@ -18,6 +19,7 @@
#include "stb_image_write.h"
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#define STB_IMAGE_RESIZE_STATIC
#include "stb_image_resize.h"
const char* rng_type_to_str[] = {
@ -37,6 +39,8 @@ const char* sample_method_str[] = {
"ipndm",
"ipndm_v",
"lcm",
"ddim_trailing",
"tcd",
};
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@ -83,6 +87,7 @@ struct SDParams {
std::string lora_model_dir;
std::string output_path = "output.png";
std::string input_path;
std::string mask_path;
std::string control_image_path;
std::string prompt;
@ -90,6 +95,7 @@ struct SDParams {
float min_cfg = 1.0f;
float cfg_scale = 7.0f;
float guidance = 3.5f;
float eta = 0.f;
float style_ratio = 20.f;
int clip_skip = -1; // <= 0 represents unspecified
int width = 512;
@ -120,9 +126,9 @@ struct SDParams {
int upscale_repeats = 1;
std::vector<int> skip_layers = {7, 8, 9};
float slg_scale = 0.;
float skip_layer_start = 0.01;
float skip_layer_end = 0.2;
float slg_scale = 0.f;
float skip_layer_start = 0.01f;
float skip_layer_end = 0.2f;
};
void print_params(SDParams params) {
@ -146,6 +152,7 @@ void print_params(SDParams params) {
printf(" normalize input image : %s\n", params.normalize_input ? "true" : "false");
printf(" output_path: %s\n", params.output_path.c_str());
printf(" init_img: %s\n", params.input_path.c_str());
printf(" mask_img: %s\n", params.mask_path.c_str());
printf(" control_image: %s\n", params.control_image_path.c_str());
printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false");
printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
@ -158,6 +165,7 @@ void print_params(SDParams params) {
printf(" cfg_scale: %.2f\n", params.cfg_scale);
printf(" slg_scale: %.2f\n", params.slg_scale);
printf(" guidance: %.2f\n", params.guidance);
printf(" eta: %.2f\n", params.eta);
printf(" clip_skip: %d\n", params.clip_skip);
printf(" width: %d\n", params.width);
printf(" height: %d\n", params.height);
@ -198,16 +206,19 @@ void print_usage(int argc, const char* argv[]) {
printf(" If not specified, the default is the type of the weight file\n");
printf(" --lora-model-dir [DIR] lora model directory\n");
printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
printf(" --mask [MASK] path to the mask image, required by img2img with mask\n");
printf(" --control-image [IMAGE] path to image condition, control net\n");
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
printf(" --guidance SCALE guidance scale for img2img (default: 3.5)\n");
printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
printf(" --skip_layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n");
printf(" --skip_layer_start START SLG enabling point: (default: 0.01)\n");
printf(" --skip_layer_end END SLG disabling point: (default: 0.2)\n");
printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n");
printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n");
printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n");
printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n");
printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%%)\n");
@ -215,7 +226,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" 1.0 corresponds to full destruction of information in init image\n");
printf(" -H, --height H image height, in pixel space (default: 512)\n");
printf(" -W, --width W image width, in pixel space (default: 512)\n");
printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm}\n");
printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
printf(" sampling method (default: \"euler_a\")\n");
printf(" --steps STEPS number of sample steps (default: 20)\n");
printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
@ -382,6 +393,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.input_path = argv[i];
} else if (arg == "--mask") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.mask_path = argv[i];
} else if (arg == "--control-image") {
if (++i >= argc) {
invalid_arg = true;
@ -428,6 +445,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.guidance = std::stof(argv[i]);
} else if (arg == "--eta") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.eta = std::stof(argv[i]);
} else if (arg == "--strength") {
if (++i >= argc) {
invalid_arg = true;
@ -707,6 +730,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
parameter_string += "Skip layer end: " + std::to_string(params.skip_layer_end) + ", ";
}
parameter_string += "Guidance: " + std::to_string(params.guidance) + ", ";
parameter_string += "Eta: " + std::to_string(params.eta) + ", ";
parameter_string += "Seed: " + std::to_string(seed) + ", ";
parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
@ -801,6 +825,8 @@ int main(int argc, const char* argv[]) {
bool vae_decode_only = true;
uint8_t* input_image_buffer = NULL;
uint8_t* control_image_buffer = NULL;
uint8_t* mask_image_buffer = NULL;
if (params.mode == IMG2IMG || params.mode == IMG2VID) {
vae_decode_only = false;
@ -905,6 +931,18 @@ int main(int argc, const char* argv[]) {
}
}
std::vector<uint8_t> default_mask_image_vec(params.width * params.height, 255);
if (params.mask_path != "") {
int c = 0;
mask_image_buffer = stbi_load(params.mask_path.c_str(), &params.width, &params.height, &c, 1);
} else {
mask_image_buffer = default_mask_image_vec.data();
}
sd_image_t mask_image = {(uint32_t)params.width,
(uint32_t)params.height,
1,
mask_image_buffer};
sd_image_t* results;
if (params.mode == TXT2IMG) {
results = txt2img(sd_ctx,
@ -913,6 +951,7 @@ int main(int argc, const char* argv[]) {
params.clip_skip,
params.cfg_scale,
params.guidance,
params.eta,
params.width,
params.height,
params.sample_method,
@ -974,11 +1013,13 @@ int main(int argc, const char* argv[]) {
} else {
results = img2img(sd_ctx,
input_image,
mask_image,
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.guidance,
params.eta,
params.width,
params.height,
params.sample_method,
@ -1032,16 +1073,41 @@ int main(int argc, const char* argv[]) {
}
}
size_t last = params.output_path.find_last_of(".");
std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
std::string dummy_name, ext, lc_ext;
bool is_jpg;
size_t last = params.output_path.find_last_of(".");
size_t last_path = std::min(params.output_path.find_last_of("/"),
params.output_path.find_last_of("\\"));
if (last != std::string::npos // filename has extension
&& (last_path == std::string::npos || last > last_path)) {
dummy_name = params.output_path.substr(0, last);
ext = lc_ext = params.output_path.substr(last);
std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower);
is_jpg = lc_ext == ".jpg" || lc_ext == ".jpeg" || lc_ext == ".jpe";
} else {
dummy_name = params.output_path;
ext = lc_ext = "";
is_jpg = false;
}
// append ".png" when the extension is absent or unknown
if (!is_jpg && lc_ext != ".png") {
dummy_name += ext;
ext = ".png";
}
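// For example (assuming this logic): "out.JPG" is written as JPEG to "out.JPG", "out.webp" becomes PNG at
// "out.webp.png", and a bare "out" becomes "out.png"; batch item i > 0 gets an "_<i+1>" suffix before the extension.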
for (int i = 0; i < params.batch_count; i++) {
if (results[i].data == NULL) {
continue;
}
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 0, get_image_params(params, params.seed + i).c_str());
printf("save result image to '%s'\n", final_image_path.c_str());
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
if(is_jpg) {
stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 90, get_image_params(params, params.seed + i).c_str());
printf("save result JPEG image to '%s'\n", final_image_path.c_str());
} else {
stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 0, get_image_params(params, params.seed + i).c_str());
printf("save result PNG image to '%s'\n", final_image_path.c_str());
}
free(results[i].data);
results[i].data = NULL;
}

View file

@ -572,6 +572,26 @@ std::string convert_tensor_name(std::string name) {
return new_name;
}
void add_preprocess_tensor_storage_types(std::map<std::string, enum ggml_type>& tensor_storages_types, std::string name, enum ggml_type type) {
std::string new_name = convert_tensor_name(name);
if (new_name.find("cond_stage_model") != std::string::npos && ends_with(new_name, "attn.in_proj_weight")) {
size_t prefix_size = new_name.find("attn.in_proj_weight");
std::string prefix = new_name.substr(0, prefix_size);
tensor_storages_types[prefix + "self_attn.q_proj.weight"] = type;
tensor_storages_types[prefix + "self_attn.k_proj.weight"] = type;
tensor_storages_types[prefix + "self_attn.v_proj.weight"] = type;
} else if (new_name.find("cond_stage_model") != std::string::npos && ends_with(new_name, "attn.in_proj_bias")) {
size_t prefix_size = new_name.find("attn.in_proj_bias");
std::string prefix = new_name.substr(0, prefix_size);
tensor_storages_types[prefix + "self_attn.q_proj.bias"] = type;
tensor_storages_types[prefix + "self_attn.k_proj.bias"] = type;
tensor_storages_types[prefix + "self_attn.v_proj.bias"] = type;
} else {
tensor_storages_types[new_name] = type;
}
}
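// e.g. a CLIP tensor name ending in "attn.in_proj_weight" fans out to the three
// "self_attn.{q,k,v}_proj.weight" entries, so the split projections inherit the fused tensor's type.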
void preprocess_tensor(TensorStorage tensor_storage,
std::vector<TensorStorage>& processed_tensor_storages) {
std::vector<TensorStorage> result;
@ -892,7 +912,7 @@ bool is_safetensors_file(const std::string& file_path) {
}
bool ModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) {
if (is_directory(file_path)) {
if (is_directory(file_path)) {
LOG_INFO("load %s using diffusers format", file_path.c_str());
return init_from_diffusers_file(file_path, prefix);
} else if (is_gguf_file(file_path)) {
@ -942,7 +962,7 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
GGML_ASSERT(ggml_nbytes(dummy) == tensor_storage.nbytes());
tensor_storages.push_back(tensor_storage);
tensor_storages_types[tensor_storage.name] = tensor_storage.type;
add_preprocess_tensor_storage_types(tensor_storages_types, tensor_storage.name, tensor_storage.type);
}
gguf_free(ctx_gguf_);
@ -1087,7 +1107,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
}
tensor_storages.push_back(tensor_storage);
tensor_storages_types[tensor_storage.name] = tensor_storage.type;
add_preprocess_tensor_storage_types(tensor_storages_types, tensor_storage.name, tensor_storage.type);
// LOG_DEBUG("%s %s", tensor_storage.to_string().c_str(), dtype.c_str());
}
@ -1418,7 +1438,7 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer,
// printf(" ZIP got tensor %s \n ", reader.tensor_storage.name.c_str());
reader.tensor_storage.name = prefix + reader.tensor_storage.name;
tensor_storages.push_back(reader.tensor_storage);
tensor_storages_types[reader.tensor_storage.name] = reader.tensor_storage.type;
add_preprocess_tensor_storage_types(tensor_storages_types, reader.tensor_storage.name, reader.tensor_storage.type);
// LOG_DEBUG("%s", reader.tensor_storage.name.c_str());
// reset
@ -1483,24 +1503,49 @@ bool ModelLoader::has_diffusion_model_tensors()
}
SDVersion ModelLoader::get_sd_version() {
TensorStorage token_embedding_weight;
for (auto& tensor_storage : tensor_storages) {
if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
return VERSION_FLUX;
}
if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) {
return VERSION_SD3;
}
if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) {
return VERSION_SDXL;
}
if (tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
return VERSION_SDXL;
}
if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
return VERSION_SVD;
}
TensorStorage token_embedding_weight, input_block_weight;
bool input_block_checked = false;
bool has_multiple_encoders = false;
bool is_unet = false;
bool is_xl = false;
bool is_flux = false;
#define found_family (is_xl || is_flux)
for (auto& tensor_storage : tensor_storages) {
if (!found_family) {
if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
is_flux = true;
if (input_block_checked) {
break;
}
}
if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) {
return VERSION_SD3;
}
if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) {
is_unet = true;
if (has_multiple_encoders) {
is_xl = true;
if (input_block_checked) {
break;
}
}
}
if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
has_multiple_encoders = true;
if (is_unet) {
is_xl = true;
if (input_block_checked) {
break;
}
}
}
if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
return VERSION_SVD;
}
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@ -1510,11 +1555,39 @@ SDVersion ModelLoader::get_sd_version() {
token_embedding_weight = tensor_storage;
// break;
}
if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight") {
input_block_weight = tensor_storage;
input_block_checked = true;
if (found_family) {
break;
}
}
}
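// Heuristics, roughly: classic UNet inpaint checkpoints carry 9 input channels in
// model.diffusion_model.input_blocks.0.0.weight (4 latent + 4 masked-image latent + 1 mask),
// while FLUX Fill is recognized by a wider img_in.weight (384 input features instead of 64).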
bool is_inpaint = input_block_weight.ne[2] == 9;
if (is_xl) {
if (is_inpaint) {
return VERSION_SDXL_INPAINT;
}
return VERSION_SDXL;
}
if (is_flux) {
is_inpaint = input_block_weight.ne[0] == 384;
if (is_inpaint) {
return VERSION_FLUX_FILL;
}
return VERSION_FLUX;
}
if (token_embedding_weight.ne[0] == 768) {
if (is_inpaint) {
return VERSION_SD1_INPAINT;
}
return VERSION_SD1;
} else if (token_embedding_weight.ne[0] == 1024) {
if (is_inpaint) {
return VERSION_SD2_INPAINT;
}
return VERSION_SD2;
}
return VERSION_COUNT;
@ -1607,11 +1680,20 @@ ggml_type ModelLoader::get_vae_wtype() {
void ModelLoader::set_wtype_override(ggml_type wtype, std::string prefix) {
for (auto& pair : tensor_storages_types) {
if (prefix.size() < 1 || pair.first.substr(0, prefix.size()) == prefix) {
bool found = false;
for (auto& tensor_storage : tensor_storages) {
if (tensor_storage.name == pair.first) {
if (tensor_should_be_converted(tensor_storage, wtype)) {
pair.second = wtype;
std::map<std::string, ggml_type> temp;
add_preprocess_tensor_storage_types(temp, tensor_storage.name, tensor_storage.type);
for (auto& preprocessed_name : temp) {
if (preprocessed_name.first == pair.first) {
if (tensor_should_be_converted(tensor_storage, wtype)) {
pair.second = wtype;
}
found = true;
break;
}
}
if (found) {
break;
}
}
@ -1720,9 +1802,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
}
return true;
};
int tensor_count = 0;
int64_t t1 = ggml_time_ms();
for (auto& tensor_storage : processed_tensor_storages) {
if (tensor_storage.file_index != file_index) {
++tensor_count;
continue;
}
ggml_tensor* dst_tensor = NULL;
@ -1734,6 +1818,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
}
if (dst_tensor == NULL) {
++tensor_count;
continue;
}
@ -1800,6 +1885,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
}
}
int64_t t2 = ggml_time_ms();
pretty_progress(++tensor_count, processed_tensor_storages.size(), (t2 - t1) / 1000.0f);
t1 = t2;
}
if (zip != NULL) {
@ -1866,9 +1954,6 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) {
continue;
}
if (pair.first.find("alphas_cumprod") != std::string::npos) {
continue;
}
if (pair.first.find("alphas_cumprod") != std::string::npos) {
continue;
@ -2007,4 +2092,4 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
}
bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type);
return success;
}
}

View file

@ -14,21 +14,26 @@
#include "ggml.h"
#include "json.hpp"
#include "zip.h"
#include "gguf.h"
#define SD_MAX_DIMS 5
enum SDVersion {
VERSION_SD1,
VERSION_SD1_INPAINT,
VERSION_SD2,
VERSION_SD2_INPAINT,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SVD,
VERSION_SD3,
VERSION_FLUX,
VERSION_FLUX_FILL,
VERSION_COUNT,
};
static inline bool sd_version_is_flux(SDVersion version) {
if (version == VERSION_FLUX) {
if (version == VERSION_FLUX || version == VERSION_FLUX_FILL) {
return true;
}
return false;
@ -41,6 +46,34 @@ static inline bool sd_version_is_sd3(SDVersion version) {
return false;
}
static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
return true;
}
return false;
}
static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) {
return true;
}
return false;
}
static inline bool sd_version_is_sdxl(SDVersion version) {
if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) {
return true;
}
return false;
}
static inline bool sd_version_is_inpaint(SDVersion version) {
if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL) {
return true;
}
return false;
}
static inline bool sd_version_is_dit(SDVersion version) {
if (sd_version_is_flux(version) || sd_version_is_sd3(version)) {
return true;
@ -198,4 +231,4 @@ public:
static std::string load_t5_tokenizer_json();
};
#endif // __MODEL_H__
#endif // __MODEL_H__

View file

@ -62,6 +62,7 @@ struct SDParams {
std::string lora_model_dir;
std::string output_path = "output.png";
std::string input_path;
std::string mask_path;
std::string control_image_path;
std::string prompt;
@ -69,6 +70,7 @@ struct SDParams {
float min_cfg = 1.0f;
float cfg_scale = 7.0f;
float guidance = 3.5f;
float eta = 0.f;
float style_ratio = 20.f;
int clip_skip = -1; // <= 0 represents unspecified
int width = 512;
@ -99,9 +101,9 @@ struct SDParams {
int upscale_repeats = 1;
std::vector<int> skip_layers = {7, 8, 9};
float slg_scale = 0.;
float skip_layer_start = 0.01;
float skip_layer_end = 0.2;
float slg_scale = 0.f;
float skip_layer_start = 0.01f;
float skip_layer_end = 0.2f;
};
//shared
@ -113,6 +115,7 @@ static sd_ctx_t * sd_ctx = nullptr;
static int sddebugmode = 0;
static std::string recent_data = "";
static uint8_t * input_image_buffer = NULL;
static uint8_t * input_mask_buffer = NULL;
static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv;
static bool notiling = false;
@ -317,6 +320,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
std::string cleanprompt = clean_input_prompt(inputs.prompt);
std::string cleannegprompt = clean_input_prompt(inputs.negative_prompt);
std::string img2img_data = std::string(inputs.init_images);
std::string img2img_mask = "";
std::string sampler = inputs.sample_method;
sd_params->prompt = cleanprompt;
@ -351,6 +355,10 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
newheight = newheight - (newheight%64);
sd_params->width = newwidth;
sd_params->height = newheight;
if(!sd_is_quiet && sddebugmode==1)
{
printf("\nDownscale to %dx%d as %d > %d\n",newwidth,newheight,biggestdim,reslimit);
}
}
bool dotile = (sd_params->width>768 || sd_params->height>768) && !notiling;
set_sd_vae_tiling(sd_ctx,dotile); //changes vae tiling, prevents memory related crash/oom
@ -358,11 +366,14 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
//for img2img
sd_image_t input_image = {0,0,0,nullptr};
std::vector<uint8_t> image_buffer;
std::vector<uint8_t> image_mask_buffer;
int nx, ny, nc;
int nx2, ny2, nc2;
int img2imgW = sd_params->width; //for img2img input
int img2imgH = sd_params->height;
int img2imgC = 3; // Assuming RGB image
std::vector<uint8_t> resized_image_buf(img2imgW * img2imgH * img2imgC);
std::vector<uint8_t> resized_mask_buf(img2imgW * img2imgH * img2imgC);
std::string ts = get_timestamp_str();
if(!sd_is_quiet)
@ -429,6 +440,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
sd_params->clip_skip,
sd_params->cfg_scale,
sd_params->guidance,
sd_params->eta,
sd_params->width,
sd_params->height,
sd_params->sample_method,
@ -461,6 +473,11 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
stbi_image_free(input_image_buffer);
input_image_buffer = nullptr;
}
if(input_mask_buffer!=nullptr) //just in time free old buffer
{
stbi_image_free(input_mask_buffer);
input_mask_buffer = nullptr;
}
input_image_buffer = stbi_load_from_memory(image_buffer.data(), image_buffer.size(), &nx, &ny, &nc, 3);
@ -486,11 +503,34 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
return output;
}
if(img2img_mask!="")
{
image_mask_buffer = kcpp_base64_decode(img2img_mask);
input_mask_buffer = stbi_load_from_memory(image_mask_buffer.data(), image_mask_buffer.size(), &nx2, &ny2, &nc2, 3);
// Resize the image
int resok = stbir_resize_uint8(input_mask_buffer, nx2, ny2, 0, resized_mask_buf.data(), img2imgW, img2imgH, 0, img2imgC);
if (!resok) {
printf("\nKCPP SD: resize mask image failed!\n");
output.data = "";
output.status = 0;
return output;
}
}
input_image.width = img2imgW;
input_image.height = img2imgH;
input_image.channel = img2imgC;
input_image.data = resized_image_buf.data();
uint8_t* mask_image_buffer = NULL;
std::vector<uint8_t> default_mask_image_vec(img2imgW * img2imgH * img2imgC, 255);
if (img2img_mask != "") {
mask_image_buffer = resized_mask_buf.data();
} else {
mask_image_buffer = default_mask_image_vec.data();
}
sd_image_t mask_image = { (uint32_t) img2imgW, (uint32_t) img2imgH, 1, mask_image_buffer };
if(!sd_is_quiet && sddebugmode==1)
{
printf("\nIMG2IMG PROMPT:%s\nNPROMPT:%s\nCLPSKP:%d\nCFGSCLE:%f\nW:%d\nH:%d\nSM:%d\nSTEP:%d\nSEED:%d\nBATCH:%d\nCIMG:%p\nSTR:%f\n\n",
@ -510,11 +550,13 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
results = img2img(sd_ctx,
input_image,
mask_image,
sd_params->prompt.c_str(),
sd_params->negative_prompt.c_str(),
sd_params->clip_skip,
sd_params->cfg_scale,
sd_params->guidance,
sd_params->eta,
sd_params->width,
sd_params->height,
sd_params->sample_method,

View file

@ -25,11 +25,15 @@ static float pending_apply_lora_power = 1.0f;
const char* model_version_to_str[] = {
"SD 1.x",
"SD 1.x Inpaint",
"SD 2.x",
"SD 2.x Inpaint",
"SDXL",
"SDXL Inpaint",
"SVD",
"SD3.x",
"Flux"};
"Flux",
"Flux Fill"};
const char* sampling_methods_str[] = {
"Euler A",
@ -42,6 +46,8 @@ const char* sampling_methods_str[] = {
"iPNDM",
"iPNDM_v",
"LCM",
"DDIM \"trailing\"",
"TCD"
};
/*================================================== Helper Functions ================================================*/
@ -302,7 +308,7 @@ public:
model_loader.set_wtype_override(wtype);
}
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
vae_wtype = GGML_TYPE_F32;
model_loader.set_wtype_override(GGML_TYPE_F32, "vae.");
}
@ -314,7 +320,7 @@ public:
LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
if (version == VERSION_SDXL) {
if (sd_version_is_sdxl(version)) {
scale_factor = 0.13025f;
if (vae_path.size() == 0 && taesd_path_fixed.size() == 0) {
LOG_WARN(
@ -368,7 +374,7 @@ public:
diffusion_model = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
} else if (sd_version_is_flux(version)) {
cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, diffusion_flash_attn);
diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
} else {
if (id_embeddings_path.find("v2") != std::string::npos) {
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
@ -556,8 +562,12 @@ public:
// check is_using_v_parameterization_for_sd2
bool is_using_v_parameterization = false;
if (version == VERSION_SD2) {
if (is_using_v_parameterization_for_sd2(ctx)) {
if (sd_version_is_sd2(version)) {
if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
is_using_v_parameterization = true;
}
} else if (sd_version_is_sdxl(version)) {
if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
is_using_v_parameterization = true;
}
} else if (version == VERSION_SVD) {
@ -631,7 +641,7 @@ public:
return true;
}
bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) {
bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) {
struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
ggml_set_f32(x_t, 0.5);
struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
@ -639,9 +649,15 @@ public:
struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1);
ggml_set_f32(timesteps, 999);
struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL;
if (concat != NULL) {
ggml_set_f32(concat, 0);
}
int64_t t0 = ggml_time_ms();
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
diffusion_model->compute(n_threads, x_t, timesteps, c, NULL, NULL, NULL, -1, {}, 0.f, &out);
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out);
diffusion_model->free_compute_buffer();
double result = 0.f;
@ -683,7 +699,7 @@ public:
}
lora.multiplier = multiplier;
lora.apply(tensors, n_threads);
lora.apply(tensors, version, n_threads);
lora.free_params_buffer();
int64_t t1 = ggml_time_ms();
@ -713,7 +729,8 @@ public:
}
lora.multiplier = multiplier;
lora.apply(tensors, n_threads);
// TODO: send version?
lora.apply(tensors, version, n_threads);
lora.free_params_buffer();
int64_t t1 = ggml_time_ms();
@ -729,19 +746,20 @@ public:
for (auto& kv : lora_state) {
const std::string& lora_name = kv.first;
float multiplier = kv.second;
if (curr_lora_state.find(lora_name) != curr_lora_state.end()) {
float curr_multiplier = curr_lora_state[lora_name];
float multiplier_diff = multiplier - curr_multiplier;
if (multiplier_diff != 0.f) {
lora_state_diff[lora_name] = multiplier_diff;
}
} else {
lora_state_diff[lora_name] = multiplier;
}
lora_state_diff[lora_name] += multiplier;
}
for (auto& kv : curr_lora_state) {
const std::string& lora_name = kv.first;
float curr_multiplier = kv.second;
lora_state_diff[lora_name] -= curr_multiplier;
}
LOG_INFO("Attempting to apply %lu LoRAs", lora_state.size());
size_t rm = lora_state_diff.size() - lora_state.size();
if (rm != 0) {
LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm);
} else {
LOG_INFO("Attempting to apply %lu LoRAs", lora_state.size());
}
for (auto& kv : lora_state_diff) {
apply_lora(kv.first, kv.second);
@ -848,6 +866,7 @@ public:
float min_cfg,
float cfg_scale,
float guidance,
float eta,
sample_method_t method,
const std::vector<float>& sigmas,
int start_merge_step,
@ -855,7 +874,20 @@ public:
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
float skip_layer_end = 0.2,
ggml_tensor* noise_mask = nullptr) {
LOG_DEBUG("Sample");
struct ggml_init_params params;
size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
for (int i = 1; i < 4; i++) {
data_size *= init_latent->ne[i];
}
data_size += 1024;
params.mem_size = data_size * 3;
params.mem_buffer = NULL;
params.no_alloc = false;
ggml_context* tmp_ctx = ggml_init(params);
size_t steps = sigmas.size() - 1;
// noise = load_tensor_from_file(work_ctx, "./rand0.bin");
// print_ggml_tensor(noise);
@ -1014,10 +1046,23 @@ public:
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
// LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
}
if (noise_mask != nullptr) {
for (int64_t x = 0; x < denoised->ne[0]; x++) {
for (int64_t y = 0; y < denoised->ne[1]; y++) {
float mask = ggml_tensor_get_f32(noise_mask, x, y);
for (int64_t k = 0; k < denoised->ne[2]; k++) {
float init = ggml_tensor_get_f32(init_latent, x, y, k);
float den = ggml_tensor_get_f32(denoised, x, y, k);
ggml_tensor_set_f32(denoised, init + mask * (den - init), x, y, k);
}
}
}
}
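// i.e. denoised = mask * denoised + (1 - mask) * init_latent: regions where the mask is 0 are pinned
// to the original latent, while regions where the mask is 1 keep the sampler's output.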
return denoised;
};
sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng);
sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta);
x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x);
@ -1234,6 +1279,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
int clip_skip,
float cfg_scale,
float guidance,
float eta,
int width,
int height,
enum sample_method_t sample_method,
@ -1248,7 +1294,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
std::vector<int> skip_layers = {},
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2) {
float skip_layer_end = 0.2,
ggml_tensor* masked_image = NULL) {
if (seed < 0) {
// Generally, when using the provided command line, the seed is always >0.
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@ -1294,7 +1341,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
if (sd_ctx->sd->stacked_id) {
if (!sd_ctx->sd->pmid_lora->applied) {
t0 = ggml_time_ms();
sd_ctx->sd->pmid_lora->apply(sd_ctx->sd->tensors, sd_ctx->sd->n_threads);
sd_ctx->sd->pmid_lora->apply(sd_ctx->sd->tensors, sd_ctx->sd->version, sd_ctx->sd->n_threads);
t1 = ggml_time_ms();
sd_ctx->sd->pmid_lora->applied = true;
LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
@ -1404,7 +1451,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
SDCondition uncond;
if (cfg_scale != 1.0) {
bool force_zero_embeddings = false;
if (sd_ctx->sd->version == VERSION_SDXL && negative_prompt.size() == 0) {
if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0) {
force_zero_embeddings = true;
}
uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
@ -1441,6 +1488,39 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
int W = width / 8;
int H = height / 8;
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
ggml_tensor* noise_mask = nullptr;
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
if (masked_image == NULL) {
int64_t mask_channels = 1;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
mask_channels = 8 * 8; // flatten the whole mask
}
// no mask, set the whole image as masked
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
for (int64_t x = 0; x < masked_image->ne[0]; x++) {
for (int64_t y = 0; y < masked_image->ne[1]; y++) {
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
// TODO: this might be wrong
for (int64_t c = 0; c < init_latent->ne[2]; c++) {
ggml_tensor_set_f32(masked_image, 0, x, y, c);
}
for (int64_t c = init_latent->ne[2]; c < masked_image->ne[2]; c++) {
ggml_tensor_set_f32(masked_image, 1, x, y, c);
}
} else {
ggml_tensor_set_f32(masked_image, 1, x, y, 0);
for (int64_t c = 1; c < masked_image->ne[2]; c++) {
ggml_tensor_set_f32(masked_image, 0, x, y, c);
}
}
}
}
}
cond.c_concat = masked_image;
uncond.c_concat = masked_image;
} else {
noise_mask = masked_image;
}
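// Summary of the two branches: true inpaint models receive the mask plus masked latent through
// cond.c_concat / uncond.c_concat, while ordinary models fall back to blending latents via noise_mask.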
for (int b = 0; b < batch_count; b++) {
int64_t sampling_start = ggml_time_ms();
int64_t cur_seed = seed + b;
@ -1469,6 +1549,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
cfg_scale,
cfg_scale,
guidance,
eta,
sample_method,
sigmas,
start_merge_step,
@ -1476,7 +1557,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
skip_layers,
slg_scale,
skip_layer_start,
skip_layer_end);
skip_layer_end,
noise_mask);
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
int64_t sampling_end = ggml_time_ms();
@ -1532,6 +1615,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
int clip_skip,
float cfg_scale,
float guidance,
float eta,
int width,
int height,
enum sample_method_t sample_method,
@ -1598,6 +1682,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
ggml_set_f32(init_latent, 0.f);
}
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
}
sd_image_t* result_images = generate_image(sd_ctx,
work_ctx,
init_latent,
@ -1606,6 +1694,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
clip_skip,
cfg_scale,
guidance,
eta,
width,
height,
sample_method,
@ -1631,11 +1720,13 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
sd_image_t mask,
const char* prompt_c_str,
const char* negative_prompt_c_str,
int clip_skip,
float cfg_scale,
float guidance,
float eta,
int width,
int height,
sample_method_t sample_method,
@ -1670,7 +1761,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB
}
params.mem_size += width * height * 3 * sizeof(float) * 2;
params.mem_size += width * height * 3 * sizeof(float) * 3;
params.mem_size *= batch_count;
params.mem_buffer = NULL;
params.no_alloc = false;
@ -1691,7 +1782,70 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_ctx->sd->rng->manual_seed(seed);
ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
sd_mask_to_tensor(mask.data, mask_img);
sd_image_to_tensor(init_image.data, init_img);
ggml_tensor* masked_image;
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
int64_t mask_channels = 1;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
mask_channels = 8 * 8; // flatten the whole mask
}
ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
sd_apply_mask(init_img, mask_img, masked_img);
ggml_tensor* masked_image_0 = NULL;
if (!sd_ctx->sd->use_tiny_autoencoder) {
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
masked_image_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
} else {
masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
}
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], mask_channels + masked_image_0->ne[2], 1);
for (int ix = 0; ix < masked_image_0->ne[0]; ix++) {
for (int iy = 0; iy < masked_image_0->ne[1]; iy++) {
int mx = ix * 8;
int my = iy * 8;
if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
for (int k = 0; k < masked_image_0->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
ggml_tensor_set_f32(masked_image, v, ix, iy, k);
}
// "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
for (int x = 0; x < 8; x++) {
for (int y = 0; y < 8; y++) {
float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
// TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
// python code was using "b (h 8) (w 8) -> b (8 8) h w"
ggml_tensor_set_f32(masked_image, m, ix, iy, masked_image_0->ne[2] + x * 8 + y);
}
}
} else {
float m = ggml_tensor_get_f32(mask_img, mx, my);
ggml_tensor_set_f32(masked_image, m, ix, iy, 0);
for (int k = 0; k < masked_image_0->ne[2]; k++) {
float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
ggml_tensor_set_f32(masked_image, v, ix, iy, k + mask_channels);
}
}
}
}
} else {
// LOG_WARN("Inpainting with a base model is not great");
masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
for (int ix = 0; ix < masked_image->ne[0]; ix++) {
for (int iy = 0; iy < masked_image->ne[1]; iy++) {
int mx = ix * 8;
int my = iy * 8;
float m = ggml_tensor_get_f32(mask_img, mx, my);
ggml_tensor_set_f32(masked_image, m, ix, iy);
}
}
}
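// For non-inpaint models the mask is just downsampled to latent resolution (sampled at the top-left
// of each 8x8 pixel block) and later used as noise_mask to blend the denoised latent with the original.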
ggml_tensor* init_latent = NULL;
if (!sd_ctx->sd->use_tiny_autoencoder) {
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
@ -1705,6 +1859,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
size_t t_enc = static_cast<size_t>(sample_steps * strength);
if (t_enc == sample_steps)
t_enc--;
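// t_enc = floor(sample_steps * strength); at strength 1.0 it is clamped to sample_steps - 1 so the
// sigma schedule below still starts at a valid index.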
LOG_INFO("target t_enc is %zu steps", t_enc);
std::vector<float> sigma_sched;
sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
@ -1717,6 +1873,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
clip_skip,
cfg_scale,
guidance,
eta,
width,
height,
sample_method,
@ -1731,11 +1888,12 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
skip_layers_vec,
slg_scale,
skip_layer_start,
skip_layer_end);
skip_layer_end,
masked_image);
size_t t2 = ggml_time_ms();
LOG_INFO("img2img completed in %.2fs", (t1 - t0) * 1.0f / 1000);
LOG_INFO("img2img completed in %.2fs", (t2 - t0) * 1.0f / 1000);
return result_images;
}
@ -1829,6 +1987,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
min_cfg,
cfg_scale,
0.f,
0.f,
sample_method,
sigmas,
-1,
@ -1870,4 +2029,4 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
LOG_INFO("img2vid completed in %.2fs", (t3 - t0) * 1.0f / 1000);
return result_images;
}
}

View file

@ -44,6 +44,8 @@ enum sample_method_t {
IPNDM,
IPNDM_V,
LCM,
DDIM_TRAILING,
TCD,
N_SAMPLE_METHODS
};
@ -59,37 +61,37 @@ enum schedule_t {
// same as enum ggml_type
enum sd_type_t {
SD_TYPE_F32 = 0,
SD_TYPE_F16 = 1,
SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3,
SD_TYPE_F32 = 0,
SD_TYPE_F16 = 1,
SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3,
// SD_TYPE_Q4_2 = 4, support has been removed
// SD_TYPE_Q4_3 = 5, support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
SD_TYPE_Q5_K = 13,
SD_TYPE_Q6_K = 14,
SD_TYPE_Q8_K = 15,
SD_TYPE_IQ2_XXS = 16,
SD_TYPE_IQ2_XS = 17,
SD_TYPE_IQ3_XXS = 18,
SD_TYPE_IQ1_S = 19,
SD_TYPE_IQ4_NL = 20,
SD_TYPE_IQ3_S = 21,
SD_TYPE_IQ2_S = 22,
SD_TYPE_IQ4_XS = 23,
SD_TYPE_I8 = 24,
SD_TYPE_I16 = 25,
SD_TYPE_I32 = 26,
SD_TYPE_I64 = 27,
SD_TYPE_F64 = 28,
SD_TYPE_IQ1_M = 29,
SD_TYPE_BF16 = 30,
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
SD_TYPE_Q5_K = 13,
SD_TYPE_Q6_K = 14,
SD_TYPE_Q8_K = 15,
SD_TYPE_IQ2_XXS = 16,
SD_TYPE_IQ2_XS = 17,
SD_TYPE_IQ3_XXS = 18,
SD_TYPE_IQ1_S = 19,
SD_TYPE_IQ4_NL = 20,
SD_TYPE_IQ3_S = 21,
SD_TYPE_IQ2_S = 22,
SD_TYPE_IQ4_XS = 23,
SD_TYPE_I8 = 24,
SD_TYPE_I16 = 25,
SD_TYPE_I32 = 26,
SD_TYPE_I64 = 27,
SD_TYPE_F64 = 28,
SD_TYPE_IQ1_M = 29,
SD_TYPE_BF16 = 30,
SD_TYPE_Q4_0_4_4 = 31,
SD_TYPE_Q4_0_4_8 = 32,
SD_TYPE_Q4_0_8_8 = 33,
@ -98,7 +100,7 @@ enum sd_type_t {
SD_TYPE_IQ4_NL_4_4 = 36,
// SD_TYPE_IQ4_NL_4_8 = 37,
// SD_TYPE_IQ4_NL_8_8 = 38,
SD_TYPE_COUNT = 39,
SD_TYPE_COUNT = 39,
};
SD_API const char* sd_type_name(enum sd_type_t type);
@ -161,6 +163,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
int clip_skip,
float cfg_scale,
float guidance,
float eta,
int width,
int height,
enum sample_method_t sample_method,
@ -180,11 +183,13 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
sd_image_t mask_image,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
float guidance,
float eta,
int width,
int height,
enum sample_method_t sample_method,
@ -241,4 +246,4 @@ SD_API uint8_t* preprocess_canny(uint8_t* img,
}
#endif
#endif // __STABLE_DIFFUSION_H__
#endif // __STABLE_DIFFUSION_H__

View file

@ -201,7 +201,7 @@ struct TinyAutoEncoder : public GGMLRunner {
bool decoder_only = true,
SDVersion version = VERSION_SD1)
: decode_only(decoder_only),
taesd(decode_only, version),
taesd(decoder_only, version),
GGMLRunner(backend) {
taesd.init(params_ctx, tensor_types, prefix);
}

View file

@ -177,7 +177,7 @@ STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const
STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality);
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality, const char* parameters = NULL);
#ifdef STBIW_WINDOWS_UTF8
STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
@ -1412,7 +1412,7 @@ static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt
return DU[0];
}
static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality, const char* parameters) {
// Constants that don't pollute global namespace
static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
@ -1521,6 +1521,20 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, in
s->func(s->context, (void*)YTable, sizeof(YTable));
stbiw__putc(s, 1);
s->func(s->context, UVTable, sizeof(UVTable));
// comment block with parameters of generation
if(parameters != NULL) {
stbiw__putc(s, 0xFF /* marker prefix */ );
stbiw__putc(s, 0xFE /* COM (comment) marker */ );
size_t param_length = std::min(2 + strlen("parameters") + 1 + strlen(parameters) + 1, (size_t) 0xFFFF);
stbiw__putc(s, param_length >> 8); // no need to mask, length < 65536
stbiw__putc(s, param_length & 0xFF);
s->func(s->context, (void*)"parameters", strlen("parameters") + 1); // std::string is zero-terminated
s->func(s->context, (void*)parameters, std::min(param_length, (size_t) 65534) - 2 - strlen("parameters") - 1);
if(param_length > 65534) stbiw__putc(s, 0); // always zero-terminate for safety
if(param_length & 1) stbiw__putc(s, 0xFF); // pad to even length
}
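// Layout note: this emits a standard JPEG COM segment (0xFF 0xFE followed by a 2-byte big-endian
// length that counts the length field itself) containing "parameters\0<text>", analogous to the
// parameters string already passed to stbi_write_png.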
s->func(s->context, (void*)head1, sizeof(head1));
s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
@ -1625,16 +1639,16 @@ STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
{
stbi__write_context s = { 0 };
stbi__start_write_callbacks(&s, func, context);
return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality);
return stbi_write_jpg_core(&s, x, y, comp, (void *) data, quality, NULL);
}
#ifndef STBI_WRITE_NO_STDIO
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality, const char* parameters)
{
stbi__write_context s = { 0 };
if (stbi__start_write_file(&s,filename)) {
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
int r = stbi_write_jpg_core(&s, x, y, comp, data, quality, parameters);
stbi__end_write_file(&s);
return r;
} else
@ -1738,4 +1752,4 @@ AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
------------------------------------------------------------------------------
*/
*/

View file

@ -166,6 +166,7 @@ public:
// ldm.modules.diffusionmodules.openaimodel.UNetModel
class UnetModelBlock : public GGMLBlock {
protected:
static std::map<std::string, enum ggml_type> empty_tensor_types;
SDVersion version = VERSION_SD1;
// network hparams
int in_channels = 4;
@ -183,13 +184,13 @@ public:
int model_channels = 320;
int adm_in_channels = 2816; // only for VERSION_SDXL/SVD
UnetModelBlock(SDVersion version = VERSION_SD1, bool flash_attn = false)
UnetModelBlock(SDVersion version = VERSION_SD1, std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types, bool flash_attn = false)
: version(version) {
if (version == VERSION_SD2) {
if (sd_version_is_sd2(version)) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_SDXL) {
} else if (sd_version_is_sdxl(version)) {
context_dim = 2048;
attention_resolutions = {4, 2};
channel_mult = {1, 2, 4};
@ -204,6 +205,10 @@ public:
num_head_channels = 64;
num_heads = -1;
}
if (sd_version_is_inpaint(version)) {
in_channels = 9;
}
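// 9 = 4 latent channels + 4 masked-image latent channels + 1 mask channel, matching how c_concat
// is assembled on the conditioning side.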
// dims is always 2
// use_temporal_attention is always True for SVD
@ -211,7 +216,7 @@ public:
// time_embed_1 is nn.SiLU()
blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
if (version == VERSION_SDXL || version == VERSION_SVD) {
if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
// label_emb_1 is nn.SiLU()
blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
@ -536,7 +541,7 @@ struct UNetModelRunner : public GGMLRunner {
const std::string prefix,
SDVersion version = VERSION_SD1,
bool flash_attn = false)
: GGMLRunner(backend), unet(version, flash_attn) {
: GGMLRunner(backend), unet(version, tensor_types, flash_attn) {
unet.init(params_ctx, tensor_types, prefix);
}
@ -566,6 +571,7 @@ struct UNetModelRunner : public GGMLRunner {
context = to_backend(context);
y = to_backend(y);
timesteps = to_backend(timesteps);
c_concat = to_backend(c_concat);
for (int i = 0; i < controls.size(); i++) {
controls[i] = to_backend(controls[i]);
@ -651,4 +657,4 @@ struct UNetModelRunner : public GGMLRunner {
}
};
#endif // __UNET_HPP__
#endif // __UNET_HPP__