diff --git a/otherarch/sdcpp/conditioner.hpp b/otherarch/sdcpp/conditioner.hpp
index 59538f683..4005fadf7 100644
--- a/otherarch/sdcpp/conditioner.hpp
+++ b/otherarch/sdcpp/conditioner.hpp
@@ -597,7 +597,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
         tokens.erase(it);
         return decode(tokens);
-        //return prompt; //kcpp we don't care about photomaker trigger words
     }
 
     SDCondition get_learned_condition(ggml_context* work_ctx,
@@ -903,6 +902,7 @@ struct SD3CLIPEmbedder : public Conditioner {
 
                 t5->compute(n_threads,
                             input_ids,
+                            NULL,
                             &chunk_hidden_states_t5,
                             work_ctx);
                 {
@@ -1148,6 +1148,7 @@ struct FluxCLIPEmbedder : public Conditioner {
 
             t5->compute(n_threads,
                         input_ids,
+                        NULL,
                         &chunk_hidden_states,
                         work_ctx);
             {
@@ -1223,10 +1224,15 @@ struct PixArtCLIPEmbedder : public Conditioner {
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<T5Runner> t5;
     size_t chunk_len = 512;
+    bool use_mask    = false;
+    int mask_pad     = 1;
 
     PixArtCLIPEmbedder(ggml_backend_t backend,
                        std::map<std::string, enum ggml_type>& tensor_types,
-                       int clip_skip = -1) {
+                       int clip_skip = -1,
+                       bool use_mask = false,
+                       int mask_pad  = 1)
+        : use_mask(use_mask), mask_pad(mask_pad) {
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }
 
@@ -1323,16 +1329,6 @@ struct PixArtCLIPEmbedder : public Conditioner {
 
         size_t chunk_count = t5_tokens.size() / chunk_len;
 
-        bool use_mask = false;
-        const char* SD_CHROMA_USE_T5_MASK = getenv("SD_CHROMA_USE_T5_MASK");
-        if (SD_CHROMA_USE_T5_MASK != nullptr) {
-            std::string sd_chroma_use_t5_mask_str = SD_CHROMA_USE_T5_MASK;
-            if (sd_chroma_use_t5_mask_str == "ON" || sd_chroma_use_t5_mask_str == "TRUE") {
-                use_mask = true;
-            } else if (sd_chroma_use_t5_mask_str != "OFF" && sd_chroma_use_t5_mask_str != "FALSE") {
-                LOG_WARN("SD_CHROMA_USE_T5_MASK environment variable has unexpected value. Assuming default (\"OFF\"). (Expected \"OFF\"/\"FALSE\" or\"ON\"/\"TRUE\", got \"%s\")", SD_CHROMA_USE_T5_MASK);
-            }
-        }
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
             // t5
             std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
@@ -1347,9 +1343,9 @@ struct PixArtCLIPEmbedder : public Conditioner {
 
             t5->compute(n_threads,
                         input_ids,
+                        t5_attn_mask_chunk,
                         &chunk_hidden_states,
-                        work_ctx,
-                        t5_attn_mask_chunk);
+                        work_ctx);
             {
                 auto tensor = chunk_hidden_states;
                 float original_mean = ggml_tensor_mean(tensor);
@@ -1391,18 +1387,6 @@ struct PixArtCLIPEmbedder : public Conditioner {
             ggml_set_f32(hidden_states, 0.f);
         }
 
-        int mask_pad = 1;
-        const char* SD_CHROMA_MASK_PAD_OVERRIDE = getenv("SD_CHROMA_MASK_PAD_OVERRIDE");
-        if (SD_CHROMA_MASK_PAD_OVERRIDE != nullptr) {
-            std::string mask_pad_str = SD_CHROMA_MASK_PAD_OVERRIDE;
-            try {
-                mask_pad = std::stoi(mask_pad_str);
-            } catch (const std::invalid_argument&) {
-                LOG_WARN("SD_CHROMA_MASK_PAD_OVERRIDE environment variable is not a valid integer (%s). Falling back to default (%d)", SD_CHROMA_MASK_PAD_OVERRIDE, mask_pad);
-            } catch (const std::out_of_range&) {
-                LOG_WARN("SD_CHROMA_MASK_PAD_OVERRIDE environment variable value is out of range for `int` type (%s). Falling back to default (%d)", SD_CHROMA_MASK_PAD_OVERRIDE, mask_pad);
-            }
-        }
         modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad);
 
         return SDCondition(hidden_states, t5_attn_mask, NULL);
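For context on the `mask_pad` value that replaces `SD_CHROMA_MASK_PAD_OVERRIDE`: `modify_mask_to_attend_padding` (defined elsewhere in the tree, not part of this diff) re-enables attention to the first few padding positions of the T5 mask. A minimal sketch of that behavior, assuming the usual convention of `0.0f` for visible positions and `-INFINITY` for masked ones; the real helper may differ:

```cpp
#include <cmath>
#include "ggml.h"

// Sketch only: make the first `mask_pad` padding positions attendable again.
static void modify_mask_to_attend_padding(struct ggml_tensor* mask, int64_t n, int mask_pad) {
    float* data  = (float*)mask->data;
    int unmasked = 0;
    for (int64_t i = 0; i < n && unmasked < mask_pad; i++) {
        if (std::isinf(data[i]) && data[i] < 0.0f) {  // a padding position
            data[i] = 0.0f;                           // make it visible again
            unmasked++;
        }
    }
}
```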
diff --git a/otherarch/sdcpp/diffusion_model.hpp b/otherarch/sdcpp/diffusion_model.hpp
index 48522b25f..65680b8d9 100644
--- a/otherarch/sdcpp/diffusion_model.hpp
+++ b/otherarch/sdcpp/diffusion_model.hpp
@@ -137,8 +137,9 @@ struct FluxModel : public DiffusionModel {
     FluxModel(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types,
               SDVersion version = VERSION_FLUX,
-              bool flash_attn   = false)
-        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+              bool flash_attn   = false,
+              bool use_mask     = false)
+        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
     }
 
     void alloc_params_buffer() {
diff --git a/otherarch/sdcpp/flux.hpp b/otherarch/sdcpp/flux.hpp
index e6d941af9..a16125102 100644
--- a/otherarch/sdcpp/flux.hpp
+++ b/otherarch/sdcpp/flux.hpp
@@ -744,10 +744,10 @@ namespace Flux {
         return ids;
     }
 
+    // Generate positional embeddings
     std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents, int theta, const std::vector<int>& axes_dim) {
         std::vector<std::vector<float>> ids = gen_ids(h, w, patch_size, bs, context_len, ref_latents);
-        std::vector<std::vector<float>> trans_ids = transpose(ids);
         size_t pos_len = ids.size();
         int num_axes   = axes_dim.size();
 
@@ -872,7 +872,7 @@ namespace Flux {
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
                                         struct ggml_tensor* pe,
-                                        struct ggml_tensor* arange = NULL,
+                                        struct ggml_tensor* mod_index_arange = NULL,
                                         std::vector<int> skip_layers = {}) {
             auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
             auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
@@ -887,9 +887,10 @@ namespace Flux {
             auto distill_timestep = ggml_nn_timestep_embedding(ctx, timesteps, 16, 10000, 1000.f);
             auto distill_guidance = ggml_nn_timestep_embedding(ctx, guidance, 16, 10000, 1000.f);
 
-            // auto arange = ggml_arange(ctx, 0, (float)mod_index_length, 1); // Not working on a lot of backends, precomputing it on CPU instead
-            GGML_ASSERT(arange != NULL);
-            auto modulation_index = ggml_nn_timestep_embedding(ctx, arange, 32, 10000, 1000.f); // [1, 344, 32]
+            // auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1);
+            // ggml_arange not working on a lot of backends, precomputing it on CPU instead
+            GGML_ASSERT(mod_index_arange != NULL);
+            auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f); // [1, 344, 32]
 
             // Batch broadcast (will it ever be useful)
             modulation_index = ggml_repeat(ctx, modulation_index, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, modulation_index->ne[0], modulation_index->ne[1], img->ne[2])); // [N, 344, 32]
@@ -982,7 +983,7 @@ namespace Flux {
                                     struct ggml_tensor* y,
                                     struct ggml_tensor* guidance,
                                     struct ggml_tensor* pe,
-                                    struct ggml_tensor* arange = NULL,
+                                    struct ggml_tensor* mod_index_arange = NULL,
                                     std::vector<ggml_tensor*> ref_latents = {},
                                     std::vector<int> skip_layers = {}) {
             // Forward pass of DiT.
@@ -1024,7 +1025,7 @@ namespace Flux {
                 }
             }
 
-            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
+            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
             if (out->ne[1] > img_tokens) {
                 out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
                 out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
@@ -1044,15 +1045,18 @@ namespace Flux {
     public:
         FluxParams flux_params;
         Flux flux;
-        std::vector<float> pe_vec, range; // for cache
+        std::vector<float> pe_vec;
+        std::vector<float> mod_index_arange_vec; // for cache
         SDVersion version;
+        bool use_mask = false;
 
         FluxRunner(ggml_backend_t backend,
                    std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
                    const std::string prefix = "",
                    SDVersion version        = VERSION_FLUX,
-                   bool flash_attn          = false)
-            : GGMLRunner(backend) {
+                   bool flash_attn          = false,
+                   bool use_mask            = false)
+            : GGMLRunner(backend), use_mask(use_mask) {
             flux_params.flash_attn     = flash_attn;
             flux_params.guidance_embed = false;
             flux_params.depth          = 0;
@@ -1116,51 +1120,28 @@ namespace Flux {
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
                                         std::vector<ggml_tensor*> ref_latents = {},
-                                        std::vector<int> skip_layers = std::vector<int>()) {
+                                        std::vector<int> skip_layers = {}) {
             GGML_ASSERT(x->ne[3] == 1);
             struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
 
-            struct ggml_tensor* precompute_arange = NULL;
+            struct ggml_tensor* mod_index_arange = NULL;
 
             x       = to_backend(x);
             context = to_backend(context);
             if (c_concat != NULL) {
                 c_concat = to_backend(c_concat);
             }
             if (flux_params.is_chroma) {
-                const char* SD_CHROMA_ENABLE_GUIDANCE = getenv("SD_CHROMA_ENABLE_GUIDANCE");
-                bool disable_guidance = true;
-                if (SD_CHROMA_ENABLE_GUIDANCE != NULL) {
-                    std::string enable_guidance_str = SD_CHROMA_ENABLE_GUIDANCE;
-                    if (enable_guidance_str == "ON" || enable_guidance_str == "TRUE") {
-                        LOG_WARN("Chroma guidance has been enabled. Image might be broken. (SD_CHROMA_ENABLE_GUIDANCE env variable to \"OFF\" to disable)", SD_CHROMA_ENABLE_GUIDANCE);
-                        disable_guidance = false;
-                    } else if (enable_guidance_str != "OFF" && enable_guidance_str != "FALSE") {
-                        LOG_WARN("SD_CHROMA_ENABLE_GUIDANCE environment variable has unexpected value. Assuming default (\"OFF\"). (Expected \"ON\"/\"TRUE\" or\"OFF\"/\"FALSE\", got \"%s\")", SD_CHROMA_ENABLE_GUIDANCE);
-                    }
-                }
-                if (disable_guidance) {
-                    // LOG_DEBUG("Forcing guidance to 0 for chroma model (SD_CHROMA_ENABLE_GUIDANCE env variable to \"ON\" to enable)");
-                    guidance = ggml_set_f32(guidance, 0);
+                guidance = ggml_set_f32(guidance, 0);
+
+                if (!use_mask) {
+                    y = NULL;
                 }
 
-                const char* SD_CHROMA_USE_DIT_MASK = getenv("SD_CHROMA_USE_DIT_MASK");
-                if (SD_CHROMA_USE_DIT_MASK != nullptr) {
-                    std::string sd_chroma_use_DiT_mask_str = SD_CHROMA_USE_DIT_MASK;
-                    if (sd_chroma_use_DiT_mask_str == "OFF" || sd_chroma_use_DiT_mask_str == "FALSE") {
-                        y = NULL;
-                    } else if (sd_chroma_use_DiT_mask_str != "ON" && sd_chroma_use_DiT_mask_str != "TRUE") {
-                        LOG_WARN("SD_CHROMA_USE_DIT_MASK environment variable has unexpected value. Assuming default (\"ON\"). (Expected \"ON\"/\"TRUE\" or\"OFF\"/\"FALSE\", got \"%s\")", SD_CHROMA_USE_DIT_MASK);
-                    }
-                }
-
-                // ggml_arrange is not working on some backends, and y isn't used, so let's reuse y to precompute it
-                range             = arange(0, 344);
-                precompute_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, range.size());
-                set_backend_tensor_data(precompute_arange, range.data());
-                // y = NULL;
+                // ggml_arange is not working on some backends, precompute it
+                mod_index_arange_vec = arange(0, 344);
+                mod_index_arange     = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
+                set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
             }
             y = to_backend(y);
@@ -1189,7 +1170,7 @@ namespace Flux {
                                            y,
                                            guidance,
                                            pe,
-                                           precompute_arange,
+                                           mod_index_arange,
                                            ref_latents,
                                            skip_layers);
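The renamed `mod_index_arange` path keeps the existing workaround: since `ggml_arange` reportedly does not work on many backends, the ramp is built on the host and bound to a graph input via `set_backend_tensor_data`. `mod_index_arange_vec` is a member of `FluxRunner` precisely so the host buffer outlives graph construction and evaluation. The `arange` helper is presumably along these lines (a sketch; the repo's actual ggml helper may differ):

```cpp
#include <vector>

// Host-side stand-in for ggml_arange: values in [start, stop) with the given step.
std::vector<float> arange(float start, float stop, float step = 1.0f) {
    std::vector<float> v;
    for (float x = start; x < stop; x += step) {
        v.push_back(x);  // e.g. arange(0, 344) -> 0, 1, ..., 343
    }
    return v;
}
```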
(Expected \"ON\"/\"TRUE\" or\"OFF\"/\"FALSE\", got \"%s\")", SD_CHROMA_USE_DIT_MASK); - } - } - - // ggml_arrange is not working on some backends, and y isn't used, so let's reuse y to precompute it - range = arange(0, 344); - precompute_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, range.size()); - set_backend_tensor_data(precompute_arange, range.data()); - // y = NULL; + // ggml_arange is not working on some backends, precompute it + mod_index_arange_vec = arange(0, 344); + mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size()); + set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data()); } y = to_backend(y); @@ -1189,7 +1170,7 @@ namespace Flux { y, guidance, pe, - precompute_arange, + mod_index_arange, ref_latents, skip_layers); diff --git a/otherarch/sdcpp/main.cpp b/otherarch/sdcpp/main.cpp index 9499e0c34..55e9591d3 100644 --- a/otherarch/sdcpp/main.cpp +++ b/otherarch/sdcpp/main.cpp @@ -128,6 +128,10 @@ struct SDParams { float slg_scale = 0.f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + + bool chroma_use_dit_mask = true; + bool chroma_use_t5_mask = false; + int chroma_t5_mask_pad = 1; }; void print_params(SDParams params) { @@ -177,6 +181,9 @@ void print_params(SDParams params) { printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); + printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false"); + printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false"); + printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad); } void print_usage(int argc, const char* argv[]) { @@ -243,6 +250,9 @@ void print_usage(int argc, const char* argv[]) { printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color Colors the logging tags according to level\n"); + printf(" --chroma-disable-dit-mask disable dit mask for chroma\n"); + printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n"); + printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n"); printf(" -v, --verbose print extra info\n"); printf(" -ki, --kontext_img [PATH] Reference image for Flux Kontext models (can be used multiple times) \n"); } @@ -938,7 +948,10 @@ int main(int argc, const char* argv[]) { params.clip_on_cpu, params.control_net_cpu, params.vae_on_cpu, - params.diffusion_flash_attn); + params.diffusion_flash_attn, + params.chroma_use_dit_mask, + params.chroma_use_t5_mask, + params.chroma_t5_mask_pad); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 7711b4765..6e111e74c 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -104,6 +104,10 @@ struct SDParams { float slg_scale = 0.f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + + bool chroma_use_dit_mask = true; + bool chroma_use_t5_mask = false; + int chroma_t5_mask_pad = 1; }; //shared @@ -272,7 +276,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { sd_params->clip_on_cpu, sd_params->control_net_cpu, sd_params->vae_on_cpu, - sd_params->diffusion_flash_attn); + sd_params->diffusion_flash_attn, + sd_params->chroma_use_dit_mask, + sd_params->chroma_use_t5_mask, + sd_params->chroma_t5_mask_pad); if (sd_ctx == NULL) { printf("\nError: KCPP SD Failed to create 
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
index 7711b4765..6e111e74c 100644
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -104,6 +104,10 @@ struct SDParams {
     float slg_scale        = 0.f;
     float skip_layer_start = 0.01f;
     float skip_layer_end   = 0.2f;
+
+    bool chroma_use_dit_mask = true;
+    bool chroma_use_t5_mask  = false;
+    int chroma_t5_mask_pad   = 1;
 };
 
 //shared
@@ -272,7 +276,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
         sd_params->clip_on_cpu,
         sd_params->control_net_cpu,
         sd_params->vae_on_cpu,
-        sd_params->diffusion_flash_attn);
+        sd_params->diffusion_flash_attn,
+        sd_params->chroma_use_dit_mask,
+        sd_params->chroma_use_t5_mask,
+        sd_params->chroma_t5_mask_pad);
 
     if (sd_ctx == NULL) {
         printf("\nError: KCPP SD Failed to create context!\nIf using Flux/SD3.5, make sure you have ALL files required (e.g. VAE, T5, Clip...) or baked in!\n");
diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp
index 5e0a79d01..322e888bd 100644
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -159,7 +159,10 @@ public:
               bool clip_on_cpu,
               bool control_net_cpu,
               bool vae_on_cpu,
-              bool diffusion_flash_attn) {
+              bool diffusion_flash_attn,
+              bool chroma_use_dit_mask,
+              bool chroma_use_t5_mask,
+              int chroma_t5_mask_pad) {
         use_tiny_autoencoder = taesd_path.size() > 0;
         std::string taesd_path_fixed = taesd_path;
         is_loaded_chroma = false;
@@ -391,11 +394,11 @@ public:
                 }
             }
             if (is_chroma) {
-                cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, chroma_use_t5_mask, chroma_t5_mask_pad);
             } else {
                 cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
             }
-            diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
+            diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn, chroma_use_dit_mask);
         } else {
             if (id_embeddings_path.find("v2") != std::string::npos) {
                 cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
@@ -1337,7 +1340,10 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                      bool keep_clip_on_cpu,
                      bool keep_control_net_cpu,
                      bool keep_vae_on_cpu,
-                     bool diffusion_flash_attn) {
+                     bool diffusion_flash_attn,
+                     bool chroma_use_dit_mask,
+                     bool chroma_use_t5_mask,
+                     int chroma_t5_mask_pad) {
     sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
     if (sd_ctx == NULL) {
         return NULL;
@@ -1379,7 +1385,10 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                            keep_clip_on_cpu,
                            keep_control_net_cpu,
                            keep_vae_on_cpu,
-                           diffusion_flash_attn)) {
+                           diffusion_flash_attn,
+                           chroma_use_dit_mask,
+                           chroma_use_t5_mask,
+                           chroma_t5_mask_pad)) {
         delete sd_ctx->sd;
         sd_ctx->sd = NULL;
         free(sd_ctx);
@@ -2231,5 +2240,133 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
 
     LOG_INFO("img2vid completed in %.2fs", (t3 - t0) * 1.0f / 1000);
 
+    return result_images;
+}
+
+sd_image_t* edit(sd_ctx_t* sd_ctx,
+                 sd_image_t* ref_images,
+                 int ref_images_count,
+                 const char* prompt_c_str,
+                 const char* negative_prompt_c_str,
+                 int clip_skip,
+                 float cfg_scale,
+                 float guidance,
+                 float eta,
+                 int width,
+                 int height,
+                 sample_method_t sample_method,
+                 int sample_steps,
+                 float strength,
+                 int64_t seed,
+                 int batch_count,
+                 const sd_image_t* control_cond,
+                 float control_strength,
+                 float style_ratio,
+                 bool normalize_input,
+                 int* skip_layers         = NULL,
+                 size_t skip_layers_count = 0,
+                 float slg_scale          = 0,
+                 float skip_layer_start   = 0.01,
+                 float skip_layer_end     = 0.2) {
+    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
+    LOG_DEBUG("edit %dx%d", width, height);
+    if (sd_ctx == NULL) {
+        return NULL;
+    }
+    if (ref_images_count <= 0) {
+        LOG_ERROR("ref images count should be > 0");
+        return NULL;
+    }
+
+    struct ggml_init_params params;
+    params.mem_size = static_cast<size_t>(30 * 1024 * 1024); // 30 MB
+    params.mem_size += width * height * 3 * sizeof(float) * 3 * ref_images_count;
+    params.mem_size *= batch_count;
+    params.mem_buffer = NULL;
+    params.no_alloc   = false;
+    // LOG_DEBUG("mem_size %u ", params.mem_size);
+
+    struct ggml_context* work_ctx = ggml_init(params);
+    if (!work_ctx) {
+        LOG_ERROR("ggml_init() failed");
+        return NULL;
+    }
+
+    if (seed < 0) {
+        srand((int)time(NULL));
+        seed = rand();
+    }
+    sd_ctx->sd->rng->manual_seed(seed);
+
+    int C = 4;
+    if (sd_version_is_sd3(sd_ctx->sd->version)) {
+        C = 16;
+    } else if (sd_version_is_flux(sd_ctx->sd->version)) {
+        C = 16;
+    }
+    int W = width / 8;
+    int H = height / 8;
+    ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
+    if (sd_version_is_sd3(sd_ctx->sd->version)) {
+        ggml_set_f32(init_latent, 0.0609f);
+    } else if (sd_version_is_flux(sd_ctx->sd->version)) {
+        ggml_set_f32(init_latent, 0.1159f);
+    } else {
+        ggml_set_f32(init_latent, 0.f);
+    }
+
+    size_t t0 = ggml_time_ms();
+
+    std::vector<struct ggml_tensor*> ref_latents;
+    for (int i = 0; i < ref_images_count; i++) {
+        ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, ref_images[i].width, ref_images[i].height, 3, 1);
+        sd_image_to_tensor(ref_images[i].data, img);
+
+        ggml_tensor* latent = NULL;
+        if (!sd_ctx->sd->use_tiny_autoencoder) {
+            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
+            latent               = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+        } else {
+            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
+        }
+        ref_latents.push_back(latent);
+    }
+
+    size_t t1 = ggml_time_ms();
+    LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
+
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
+
+    sd_image_t* result_images = generate_image(sd_ctx,
+                                               work_ctx,
+                                               init_latent,
+                                               prompt_c_str,
+                                               negative_prompt_c_str,
+                                               clip_skip,
+                                               cfg_scale,
+                                               guidance,
+                                               eta,
+                                               width,
+                                               height,
+                                               sample_method,
+                                               sigmas,
+                                               seed,
+                                               batch_count,
+                                               control_cond,
+                                               control_strength,
+                                               style_ratio,
+                                               normalize_input,
+                                               "",
+                                               ref_latents,
+                                               skip_layers_vec,
+                                               slg_scale,
+                                               skip_layer_start,
+                                               skip_layer_end,
+                                               NULL);
+
+    size_t t2 = ggml_time_ms();
+
+    LOG_INFO("edit completed in %.2fs", (t2 - t0) * 1.0f / 1000);
+
     return result_images;
 }
\ No newline at end of file
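For reference, a minimal caller of the new `edit()` entry point, matching the declaration added to stable-diffusion.h just below. All literal values are illustrative only, and `load_reference_image()` is a hypothetical stand-in for the caller's own image loading:

```cpp
// Sketch of driving the new edit() API (Flux Kontext style reference editing).
sd_image_t ref  = load_reference_image();      // hypothetical helper
sd_image_t* out = edit(sd_ctx,
                       &ref, 1,                // ref_images, ref_images_count
                       "make the sky stormy",  // prompt
                       "",                     // negative prompt
                       -1,                     // clip_skip (default)
                       1.0f, 3.5f, 0.f,        // cfg_scale, guidance, eta
                       1024, 1024,             // width, height
                       EULER, 20,              // sample_method, sample_steps
                       0.75f, 42, 1,           // strength, seed, batch_count
                       NULL, 0.9f,             // control_cond, control_strength
                       20.f, false,            // style_ratio, normalize_input
                       NULL, 0,                // skip_layers, skip_layers_count
                       0.f, 0.01f, 0.2f);      // slg_scale, skip_layer_start/end
```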
diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h
index 0b0729717..fc0cd75e9 100644
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@@ -154,7 +154,10 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
                             bool keep_clip_on_cpu,
                             bool keep_control_net_cpu,
                             bool keep_vae_on_cpu,
-                            bool diffusion_flash_attn);
+                            bool diffusion_flash_attn,
+                            bool chroma_use_dit_mask,
+                            bool chroma_use_t5_mask,
+                            int chroma_t5_mask_pad);
 
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
 
@@ -230,6 +233,32 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
                            float strength,
                            int64_t seed);
 
+SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
+                        sd_image_t* ref_images,
+                        int ref_images_count,
+                        const char* prompt,
+                        const char* negative_prompt,
+                        int clip_skip,
+                        float cfg_scale,
+                        float guidance,
+                        float eta,
+                        int width,
+                        int height,
+                        enum sample_method_t sample_method,
+                        int sample_steps,
+                        float strength,
+                        int64_t seed,
+                        int batch_count,
+                        const sd_image_t* control_cond,
+                        float control_strength,
+                        float style_strength,
+                        bool normalize_input,
+                        int* skip_layers,
+                        size_t skip_layers_count,
+                        float slg_scale,
+                        float skip_layer_start,
+                        float skip_layer_end);
+
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
diff --git a/otherarch/sdcpp/t5.hpp b/otherarch/sdcpp/t5.hpp
index 4cbde3eda..1861ad478 100644
--- a/otherarch/sdcpp/t5.hpp
+++ b/otherarch/sdcpp/t5.hpp
@@ -795,9 +795,9 @@ struct T5Runner : public GGMLRunner {
 
     void compute(const int n_threads,
                  struct ggml_tensor* input_ids,
+                 struct ggml_tensor* attention_mask,
                  ggml_tensor** output,
-                 ggml_context* output_ctx = NULL,
-                 struct ggml_tensor* attention_mask = NULL) {
+                 ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
             return build_graph(input_ids, attention_mask);
         };
@@ -966,7 +966,7 @@ struct T5Embedder {
         struct ggml_tensor* out = NULL;
 
         int t0 = ggml_time_ms();
-        model.compute(8, input_ids, &out, work_ctx);
+        model.compute(8, input_ids, NULL, &out, work_ctx);
         int t1 = ggml_time_ms();
 
         print_ggml_tensor(out);