koboldcpp/otherarch/sdcpp/stable-diffusion.cpp

#include "ggml_extend.hpp"

#include "model.h"
#include "rng.hpp"
#include "rng_mt19937.hpp"
#include "rng_philox.hpp"
#include "stable-diffusion.h"
#include "util.h"

#include "cache_dit.hpp"
#include "conditioner.hpp"
#include "control.hpp"
#include "denoiser.hpp"
#include "diffusion_model.hpp"
#include "easycache.hpp"
#include "esrgan.hpp"
#include "lora.hpp"
#include "pmid.hpp"
#include "tae.hpp"
#include "ucache.hpp"
#include "vae.hpp"

#include "latent-preview.h"
#include "name_conversion.h"

#include <filesystem>

const char* model_version_to_str[] = {
    "SD 1.x",
    "SD 1.x Inpaint",
    "Instruct-Pix2Pix",
    "SD 1.x Tiny UNet",
    "SD 2.x",
    "SD 2.x Inpaint",
    "SD 2.x Tiny UNet",
    "SDXS",
    "SDXL",
    "SDXL Inpaint",
    "SDXL Instruct-Pix2Pix",
    "SDXL (Vega)",
    "SDXL (SSD1B)",
    "SVD",
    "SD3.x",
    "Flux",
    "Flux Fill",
    "Flux Control",
    "Flex.2",
    "Chroma Radiance",
    "Wan 2.x",
    "Wan 2.2 I2V",
    "Wan 2.2 TI2V",
    "Qwen Image",
    "Anima",
    "Flux.2",
    "Flux.2 klein",
    "Z-Image",
    "Ovis Image",
};

const char* sampling_methods_str[] = {
    "Euler",
    "Euler A",
    "Heun",
    "DPM2",
    "DPM++ (2s)",
    "DPM++ (2M)",
    "modified DPM++ (2M)",
    "iPNDM",
    "iPNDM_v",
    "LCM",
    "DDIM \"trailing\"",
    "TCD",
    "Res Multistep",
    "Res 2s",
};

/*================================================== Helper Functions ================================================*/

void calculate_alphas_cumprod(float* alphas_cumprod,
                              float linear_start = 0.00085f,
                              float linear_end   = 0.0120,
                              int timesteps      = TIMESTEPS) {
    float ls_sqrt = sqrtf(linear_start);
    float le_sqrt = sqrtf(linear_end);
    float amount  = le_sqrt - ls_sqrt;
    float product = 1.0f;
    for (int i = 0; i < timesteps; i++) {
        float beta = ls_sqrt + amount * ((float)i / (timesteps - 1));
        product *= 1.0f - powf(beta, 2.0f);
        alphas_cumprod[i] = product;
    }
}

void suppress_pp(int step, int steps, float time, void* data) {
    (void)step;
    (void)steps;
    (void)time;
    (void)data;
    return;
}

/*=============================================== StableDiffusionGGML ================================================*/

class StableDiffusionGGML {
public:
    ggml_backend_t backend             = nullptr;  // general backend
    ggml_backend_t clip_backend        = nullptr;
    ggml_backend_t control_net_backend = nullptr;
    ggml_backend_t vae_backend         = nullptr;

    SDVersion version;
    bool vae_decode_only         = false;
    bool external_vae_is_invalid = false;
    bool free_params_immediately = false;

    std::shared_ptr<RNG> rng         = std::make_shared<PhiloxRNG>();
    std::shared_ptr<RNG> sampler_rng = nullptr;
    int n_threads                    = -1;
    float scale_factor               = 0.18215f;
    float shift_factor               = 0.f;
    float default_flow_shift         = INFINITY;

    std::shared_ptr<Conditioner> cond_stage_model;
    std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision;  // for svd or wan2.1 i2v
    std::shared_ptr<DiffusionModel> diffusion_model;
    std::shared_ptr<DiffusionModel> high_noise_diffusion_model;
    std::shared_ptr<VAE> first_stage_model;
    std::shared_ptr<TinyAutoEncoder> tae_first_stage;
    std::shared_ptr<ControlNet> control_net;
    std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
    std::shared_ptr<LoraModel> pmid_lora;
    std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
    std::vector<std::shared_ptr<LoraModel>> cond_stage_lora_models;
    std::vector<std::shared_ptr<LoraModel>> diffusion_lora_models;
    std::vector<std::shared_ptr<LoraModel>> first_stage_lora_models;
    bool apply_lora_immediately = false;
    std::map<std::string, std::shared_ptr<LoraModel>> kcpp_lora_cache;

    std::string taesd_path;
    bool use_tiny_autoencoder            = false;
    sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0};
    bool offload_params_to_cpu           = false;
    bool use_pmid                        = false;

    bool is_using_v_parameterization     = false;
    bool is_using_edm_v_parameterization = false;

    std::map<std::string, struct ggml_tensor*> tensors;

    // lora_name => multiplier
    std::unordered_map<std::string, float> curr_lora_state;

    std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();

    StableDiffusionGGML() = default;

    ~StableDiffusionGGML() {
        if (clip_backend != backend) {
            ggml_backend_free(clip_backend);
        }
        if (control_net_backend != backend) {
            ggml_backend_free(control_net_backend);
        }
        if (vae_backend != backend) {
            ggml_backend_free(vae_backend);
        }
        ggml_backend_free(backend);
    }

    std::string toLowerCase(const std::string& str) {
        std::string result;
        std::locale loc;

        for (char ch : str) {
            result += std::tolower(ch, loc); // Use locale-aware tolower
        }

        return result;
    }

    void init_backend() {
#ifdef SD_USE_CUDA
        LOG_DEBUG("Using CUDA backend");
        backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
        LOG_DEBUG("Using Metal backend");
        backend = ggml_backend_metal_init();
#endif
#ifdef SD_USE_VULKAN
        LOG_DEBUG("Using Vulkan backend");
        size_t device          = 0;
        const int device_count = ggml_backend_vk_get_device_count();
        if (device_count) {
            const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
            if (SD_VK_DEVICE != nullptr) {
                std::string sd_vk_device_str = SD_VK_DEVICE;
                try {
                    device = std::stoull(sd_vk_device_str);
                } catch (const std::invalid_argument&) {
                    LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE);
                    device = 0;
                } catch (const std::out_of_range&) {
                    LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE);
                    device = 0;
                }
                if (device >= device_count) {
                    LOG_WARN("Cannot find targeted vulkan device (%llu). Falling back to device 0.", device);
                    device = 0;
                }
            }
            LOG_INFO("Vulkan: Using device %llu", device);
            backend = ggml_backend_vk_init(device);
        }
        if (!backend) {
            LOG_WARN("Failed to initialize Vulkan backend");
        }
#endif
#ifdef SD_USE_OPENCL
        LOG_DEBUG("Using OpenCL backend");
        // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs
        backend = ggml_backend_opencl_init();
        if (!backend) {
            LOG_WARN("Failed to initialize OpenCL backend");
        }
#endif
#ifdef SD_USE_SYCL
        LOG_DEBUG("Using SYCL backend");
        backend = ggml_backend_sycl_init(0);
#endif

        if (!backend) {
            LOG_DEBUG("Using CPU backend");
            backend = ggml_backend_cpu_init();
        }
    }

    std::shared_ptr<RNG> get_rng(rng_type_t rng_type) {
        if (rng_type == STD_DEFAULT_RNG) {
            return std::make_shared<STDDefaultRNG>();
        } else if (rng_type == CPU_RNG) {
            return std::make_shared<MT19937RNG>();
        } else {  // default: CUDA_RNG
            return std::make_shared<PhiloxRNG>();
        }
    }

    bool init(const sd_ctx_params_t* sd_ctx_params) {
        n_threads               = sd_ctx_params->n_threads;
        vae_decode_only         = sd_ctx_params->vae_decode_only;
        free_params_immediately = sd_ctx_params->free_params_immediately;
        taesd_path              = SAFE_STR(sd_ctx_params->taesd_path);
        use_tiny_autoencoder    = taesd_path.size() > 0;
        offload_params_to_cpu   = sd_ctx_params->offload_params_to_cpu;

        rng = get_rng(sd_ctx_params->rng_type);
        if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) {
            sampler_rng = get_rng(sd_ctx_params->sampler_rng_type);
        } else {
            sampler_rng = rng;
        }

        ggml_log_set(ggml_log_callback_default, nullptr);

        init_backend();

        std::string taesd_path_fixed = taesd_path;
        std::string t5_path_fixed = SAFE_STR(sd_ctx_params->t5xxl_path);
        std::string clipl_path_fixed = SAFE_STR(sd_ctx_params->clip_l_path);
        std::string clipg_path_fixed = SAFE_STR(sd_ctx_params->clip_g_path);

        ModelLoader model_loader;

        if (strlen(SAFE_STR(sd_ctx_params->model_path)) > 0) {
            LOG_INFO("loading model from '%s'", sd_ctx_params->model_path);
            if (!model_loader.init_from_file(sd_ctx_params->model_path)) {
                LOG_ERROR("init model loader from file failed: '%s'", sd_ctx_params->model_path);
            }
        }

        if (strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) > 0) {
            LOG_INFO("loading diffusion model from '%s'", sd_ctx_params->diffusion_model_path);
            if (!model_loader.init_from_file(sd_ctx_params->diffusion_model_path, "model.diffusion_model.")) {
                LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->diffusion_model_path);
            }
        }

        if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
            LOG_INFO("loading high noise diffusion model from '%s'", sd_ctx_params->high_noise_diffusion_model_path);
            if (!model_loader.init_from_file(sd_ctx_params->high_noise_diffusion_model_path, "model.high_noise_diffusion_model.")) {
                LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->high_noise_diffusion_model_path);
            }
        }

        bool is_unet = sd_version_is_unet(model_loader.get_sd_version());
        int tempver = model_loader.get_sd_version();

        // kcpp fallback to separate diffusion model passed as model
        if (tempver == VERSION_COUNT &&
        strlen(SAFE_STR(sd_ctx_params->model_path)) > 0 &&
        strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) == 0 &&
        (t5_path_fixed!=""||clipl_path_fixed!=""))
        {
            bool endswithsafetensors = ends_with(sd_ctx_params->model_path, ".safetensors");
            if(endswithsafetensors && !model_loader.has_diffusion_model_tensors())
            {
                LOG_INFO("SD Diffusion Model tensors missing! Fallback trying alternative tensor names...\n");
                if (!model_loader.init_from_file(sd_ctx_params->model_path, "model.diffusion_model.")) {
                    LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->model_path);
                }
                tempver = model_loader.get_sd_version();
            }
        }

        if (tempver == VERSION_ANIMA &&
            strlen(SAFE_STR(sd_ctx_params->model_path)) > 0 &&
            strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) == 0 &&
            !model_loader.has_diffusion_model_tensors()
            )
        {
            LOG_INFO("Anima: SD Diffusion Model tensors missing! Fallback trying alternative tensor names...\n");
            if (!model_loader.init_from_file(sd_ctx_params->model_path, "model.diffusion_model.")) {
                LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->model_path);
            }
            tempver = model_loader.get_sd_version();
        }

        bool iswan = (tempver==VERSION_WAN2 || tempver==VERSION_WAN2_2_I2V || tempver==VERSION_WAN2_2_TI2V);
        bool isqwenimg = (tempver==VERSION_QWEN_IMAGE);
        bool iszimg = (tempver==VERSION_Z_IMAGE);
        bool isflux2 = (tempver==VERSION_FLUX2);
        bool isflux2k = (tempver==VERSION_FLUX2_KLEIN);
        bool is_ovis =  (tempver==VERSION_OVIS_IMAGE);
        bool is_anima = (tempver==VERSION_ANIMA);
        bool conditioner_is_llm = (isqwenimg||iszimg||isflux2||isflux2k||is_ovis||is_anima);

        //kcpp qol fallback: if qwen image, and they loaded the qwen2vl llm as t5 by mistake
        if(conditioner_is_llm && t5_path_fixed!="")
        {
            if(clipl_path_fixed=="" && clipg_path_fixed=="")
            {
                clipl_path_fixed = t5_path_fixed;
                t5_path_fixed = "";
            }
            else if(clipl_path_fixed=="" && clipg_path_fixed!="")
            {
                clipl_path_fixed = t5_path_fixed;
                t5_path_fixed = "";
            }
            else if(clipl_path_fixed!="" && clipg_path_fixed=="")
            {
                //very tricky case. see if we can tell if clipl is an mmproj, if so move to right place
                if(toLowerCase(clipl_path_fixed).find("mmproj") != std::string::npos)
                {
                    clipg_path_fixed = clipl_path_fixed;
                    clipl_path_fixed = t5_path_fixed;
                    t5_path_fixed = "";
                }
            }
        }

        if (clipl_path_fixed!="") {
            LOG_INFO("loading clip_l from '%s'", clipl_path_fixed.c_str());
            std::string prefix = is_unet ? "cond_stage_model.transformer." : "text_encoders.clip_l.transformer.";
            if(iswan)
            {
                prefix = "cond_stage_model.transformer.";
                LOG_INFO("swap clip_vision from '%s'", clipl_path_fixed.c_str());
            }
            if(conditioner_is_llm)
            {
                prefix = "text_encoders.llm.";
                LOG_INFO("swap llm from '%s'", clipl_path_fixed.c_str());
            }
            if (!model_loader.init_from_file(clipl_path_fixed.c_str(), prefix)) {
                LOG_WARN("loading clip_l from '%s' failed", clipl_path_fixed.c_str());
            }
        }

        if (clipg_path_fixed!="") {
            LOG_INFO("loading clip_g from '%s'", clipg_path_fixed.c_str());
            std::string prefix = is_unet ? "cond_stage_model.1.transformer." : "text_encoders.clip_g.transformer.";
            if(iswan)
            {
                prefix = "cond_stage_model.transformer.";
                LOG_INFO("swap clip_vision from '%s'", clipg_path_fixed.c_str());
            }
            if(isqwenimg)
            {
                prefix = "text_encoders.llm.visual.";
                LOG_INFO("swap llm mmproj from '%s'", clipg_path_fixed.c_str());
            }
            if (!model_loader.init_from_file(clipg_path_fixed.c_str(), prefix)) {
                LOG_WARN("loading clip_g from '%s' failed", clipg_path_fixed.c_str());
            }
        }

        if (strlen(SAFE_STR(sd_ctx_params->clip_vision_path)) > 0) {
            LOG_INFO("loading clip_vision from '%s'", sd_ctx_params->clip_vision_path);
            std::string prefix = "cond_stage_model.transformer.";
            if (!model_loader.init_from_file(sd_ctx_params->clip_vision_path, prefix)) {
                LOG_WARN("loading clip_vision from '%s' failed", sd_ctx_params->clip_vision_path);
            }
        }

        if (t5_path_fixed!="") {
            LOG_INFO("loading t5xxl from '%s'", t5_path_fixed.c_str());
            if (!model_loader.init_from_file(t5_path_fixed.c_str(), "text_encoders.t5xxl.transformer.")) {
                LOG_WARN("loading t5xxl from '%s' failed", t5_path_fixed.c_str());
            }
        }

        if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
            LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
            if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
                LOG_WARN("loading llm from '%s' failed", sd_ctx_params->llm_path);
            }
        }

        if (strlen(SAFE_STR(sd_ctx_params->llm_vision_path)) > 0) {
            LOG_INFO("loading llm vision from '%s'", sd_ctx_params->llm_vision_path);
            if (!model_loader.init_from_file(sd_ctx_params->llm_vision_path, "text_encoders.llm.visual.")) {
                LOG_WARN("loading llm vision from '%s' failed", sd_ctx_params->llm_vision_path);
            }
        }

        if (strlen(SAFE_STR(sd_ctx_params->vae_path)) > 0) {
            LOG_INFO("loading vae from '%s'", sd_ctx_params->vae_path);
            if (!model_loader.init_from_file(sd_ctx_params->vae_path, "vae.")) {
                LOG_WARN("loading vae from '%s' failed", sd_ctx_params->vae_path);
                external_vae_is_invalid = true;
            }
        }

        model_loader.convert_tensors_name();

        version = model_loader.get_sd_version();

        if (version == VERSION_COUNT) {
            LOG_ERROR("get sd version from file failed: '%s'", SAFE_STR(sd_ctx_params->model_path));
            return false;
        }

        auto& tensor_storage_map = model_loader.get_tensor_storage_map();

        LOG_INFO("Version: %s ", model_version_to_str[version]);

        if(use_tiny_autoencoder) // kcpp
        {
            std::string to_search = "taesd.embd";
            std::string to_replace = "";
            if(sd_version_is_sd1(version) || sd_version_is_sd2(version))
            {
                to_replace = "taesd.embd";
            }
            else if(sd_version_is_sdxl(version))
            {
                to_replace = "taesd_xl.embd";
            }
            else if(sd_version_is_flux(version)||sd_version_is_z_image(version)||version == VERSION_OVIS_IMAGE)
            {
                to_replace = "taesd_f.embd";
            }
            else if(sd_version_is_sd3(version))
            {
                to_replace = "taesd_3.embd";
            }
            else if(sd_version_is_flux2(version))
            {
                to_replace = "taesd_f2.embd";
            }
            else if((sd_version_is_wan(version) && version != VERSION_WAN2_2_TI2V)||sd_version_is_qwen_image(version)||sd_version_is_anima(version))
            {
                to_replace = "taesd_w21.embd";
            }

            if(to_replace!="")
            {
                size_t pos = taesd_path_fixed.find(to_search);
                if (pos != std::string::npos) {
                    taesd_path_fixed.replace(pos, to_search.length(), to_replace);
                }
            }
            else
            {
                printf("\nCannot use TAESD: Unknown version %d. TAESD Disabled!\n",version);
                taesd_path_fixed = "";
                use_tiny_autoencoder = false;
            }
            if (use_tiny_autoencoder && !file_exists(taesd_path_fixed))
            {
                printf("\nCannot use TAESD: \"%s\" not found. TAESD Disabled!\n", taesd_path_fixed.c_str());
                taesd_path_fixed = "";
                use_tiny_autoencoder = false;
            }
        }

        ggml_type wtype               = (int)sd_ctx_params->wtype < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)
                                            ? (ggml_type)sd_ctx_params->wtype
                                            : GGML_TYPE_COUNT;
        std::string tensor_type_rules = SAFE_STR(sd_ctx_params->tensor_type_rules);
        if (wtype != GGML_TYPE_COUNT || tensor_type_rules.size() > 0) {
            model_loader.set_wtype_override(wtype, tensor_type_rules);
        }

        std::map<ggml_type, uint32_t> wtype_stat                 = model_loader.get_wtype_stat();
        std::map<ggml_type, uint32_t> conditioner_wtype_stat     = model_loader.get_conditioner_wtype_stat();
        std::map<ggml_type, uint32_t> diffusion_model_wtype_stat = model_loader.get_diffusion_model_wtype_stat();
        std::map<ggml_type, uint32_t> vae_wtype_stat             = model_loader.get_vae_wtype_stat();

        auto wtype_stat_to_str = [](const std::map<ggml_type, uint32_t>& m, int key_width = 8, int value_width = 5) -> std::string {
            std::ostringstream oss;
            bool first = true;
            for (const auto& [type, count] : m) {
                if (!first)
                    oss << "|";
                first = false;
                oss << std::right << std::setw(key_width) << ggml_type_name(type)
                    << ": "
                    << std::left << std::setw(value_width) << count;
            }
            return oss.str();
        };

        LOG_INFO("Weight type stat:                 %s", wtype_stat_to_str(wtype_stat).c_str());
        LOG_INFO("Conditioner weight type stat:     %s", wtype_stat_to_str(conditioner_wtype_stat).c_str());
        LOG_INFO("Diffusion model weight type stat: %s", wtype_stat_to_str(diffusion_model_wtype_stat).c_str());
        LOG_INFO("VAE weight type stat:             %s", wtype_stat_to_str(vae_wtype_stat).c_str());

        LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));

        if (sd_ctx_params->lora_apply_mode == LORA_APPLY_AUTO) {
            bool have_quantized_weight = false;
            if (wtype != GGML_TYPE_COUNT && ggml_is_quantized(wtype)) {
                have_quantized_weight = true;
            } else {
                for (const auto& [type, _] : wtype_stat) {
                    if (ggml_is_quantized(type)) {
                        have_quantized_weight = true;
                        break;
                    }
                }
            }
            if (have_quantized_weight) {
                apply_lora_immediately = false;
            } else {
                apply_lora_immediately = true;
            }
        } else if (sd_ctx_params->lora_apply_mode == LORA_APPLY_IMMEDIATELY) {
            apply_lora_immediately = true;
        } else {
            apply_lora_immediately = false;
        }

        if (sd_version_is_sdxl(version)) {
            scale_factor = 0.13025f;
        } else if (sd_version_is_sd3(version)) {
            scale_factor = 1.5305f;
            shift_factor = 0.0609f;
        } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
            scale_factor = 0.3611f;
            shift_factor = 0.1159f;
        } else if (sd_version_is_wan(version) ||
                   sd_version_is_qwen_image(version) ||
                   sd_version_is_anima(version) ||
                   sd_version_is_flux2(version)) {
            scale_factor = 1.0f;
            shift_factor = 0.f;
        }

        if (sd_version_is_control(version)) {
            // Might need vae encode for control cond
            vae_decode_only = false;
        }

        bool tae_preview_only = sd_ctx_params->tae_preview_only;
        if (version == VERSION_SDXS) {
            tae_preview_only = false;
        }

        if (sd_ctx_params->circular_x || sd_ctx_params->circular_y) {
            LOG_INFO("Using circular padding for convolutions");
        }

        bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu;

        {
            clip_backend = backend;
            if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
                LOG_INFO("CLIP: Using CPU backend");
                clip_backend = ggml_backend_cpu_init();
            }
            if (sd_version_is_sd3(version)) {
                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
                                                                     tensor_storage_map);
                diffusion_model  = std::make_shared<MMDiTModel>(backend,
                                                               offload_params_to_cpu,
                                                               tensor_storage_map);
            } else if (sd_version_is_flux(version)) {
                bool is_chroma = false;
                for (auto pair : tensor_storage_map) {
                    if (pair.first.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                        is_chroma = true;
                        break;
                    }
                }
                if (is_chroma) {
                    if ((sd_ctx_params->flash_attn || sd_ctx_params->diffusion_flash_attn) && sd_ctx_params->chroma_use_dit_mask) {
                        LOG_WARN(
                            "!!!It looks like you are using Chroma with flash attention. "
                            "This is currently unsupported. "
                            "If you find that the generated images are broken, "
                            "try either disabling flash attention or specifying "
                            "--chroma-disable-dit-mask as a workaround.");
                    }

                    cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                        offload_params_to_cpu,
                                                                        tensor_storage_map,
                                                                        sd_ctx_params->chroma_use_t5_mask,
                                                                        sd_ctx_params->chroma_t5_mask_pad);
                } else if (version == VERSION_OVIS_IMAGE) {
                    cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
                                                                     tensor_storage_map,
                                                                     version,
                                                                     "",
                                                                     false);
                } else {
                    cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend,
                                                                          offload_params_to_cpu,
                                                                          tensor_storage_map);
                }
                diffusion_model = std::make_shared<FluxModel>(backend,
                                                              offload_params_to_cpu,
                                                              tensor_storage_map,
                                                              version,
                                                              sd_ctx_params->chroma_use_dit_mask);
            } else if (sd_version_is_flux2(version)) {
                bool is_chroma   = false;
                cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
                                                                 offload_params_to_cpu,
                                                                 tensor_storage_map,
                                                                 version);
                diffusion_model  = std::make_shared<FluxModel>(backend,
                                                              offload_params_to_cpu,
                                                              tensor_storage_map,
                                                              version,
                                                              sd_ctx_params->chroma_use_dit_mask);
            } else if (sd_version_is_wan(version)) {
                cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                    offload_params_to_cpu,
                                                                    tensor_storage_map,
                                                                    true,
                                                                    1,
                                                                    true);
                diffusion_model  = std::make_shared<WanModel>(backend,
                                                             offload_params_to_cpu,
                                                             tensor_storage_map,
                                                             "model.diffusion_model",
                                                             version);
                if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
                    high_noise_diffusion_model = std::make_shared<WanModel>(backend,
                                                                            offload_params_to_cpu,
                                                                            tensor_storage_map,
                                                                            "model.high_noise_diffusion_model",
                                                                            version);
                }
                if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
                    diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" ||
                    diffusion_model->get_desc() == "Wan2.1-I2V-1.3B") {
                    clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend,
                                                                             offload_params_to_cpu,
                                                                             tensor_storage_map);
                    clip_vision->alloc_params_buffer();
                    clip_vision->get_param_tensors(tensors);
                }
            } else if (sd_version_is_qwen_image(version)) {
                bool enable_vision = false;
                if (!vae_decode_only) {
                    enable_vision = true;
                }
                cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
                                                                 offload_params_to_cpu,
                                                                 tensor_storage_map,
                                                                 version,
                                                                 "",
                                                                 enable_vision);
                diffusion_model  = std::make_shared<QwenImageModel>(backend,
                                                                   offload_params_to_cpu,
                                                                   tensor_storage_map,
                                                                   "model.diffusion_model",
                                                                   version,
                                                                   sd_ctx_params->qwen_image_zero_cond_t);
            } else if (sd_version_is_anima(version)) {
                cond_stage_model = std::make_shared<AnimaConditioner>(clip_backend,
                                                                      offload_params_to_cpu,
                                                                      tensor_storage_map);
                diffusion_model  = std::make_shared<AnimaModel>(backend,
                                                               offload_params_to_cpu,
                                                               tensor_storage_map,
                                                               "model.diffusion_model");
            } else if (sd_version_is_z_image(version)) {
                cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
                                                                 offload_params_to_cpu,
                                                                 tensor_storage_map,
                                                                 version);
                diffusion_model  = std::make_shared<ZImageModel>(backend,
                                                                offload_params_to_cpu,
                                                                tensor_storage_map,
                                                                "model.diffusion_model",
                                                                version);
            } else {  // SD1.x SD2.x SDXL
                std::map<std::string, std::string> embbeding_map;
                for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
                    embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path));
                }
                if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
                                                                                           offload_params_to_cpu,
                                                                                           tensor_storage_map,
                                                                                           embbeding_map,
                                                                                           version,
                                                                                           PM_VERSION_2);
                } else {
                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
                                                                                           offload_params_to_cpu,
                                                                                           tensor_storage_map,
                                                                                           embbeding_map,
                                                                                           version);
                }
                diffusion_model = std::make_shared<UNetModel>(backend,
                                                              offload_params_to_cpu,
                                                              tensor_storage_map,
                                                              version);
                if (sd_ctx_params->diffusion_conv_direct) {
                    LOG_INFO("Using Conv2d direct in the diffusion model");
                    std::dynamic_pointer_cast<UNetModel>(diffusion_model)->unet.set_conv2d_direct_enabled(true);
                }
            }

            cond_stage_model->alloc_params_buffer();
            cond_stage_model->get_param_tensors(tensors);

            diffusion_model->alloc_params_buffer();
            diffusion_model->get_param_tensors(tensors);

            if (sd_version_is_unet_edit(version)) {
                vae_decode_only = false;
            }

            if (high_noise_diffusion_model) {
                high_noise_diffusion_model->alloc_params_buffer();
                high_noise_diffusion_model->get_param_tensors(tensors);
            }

            if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
                LOG_INFO("VAE Autoencoder: Using CPU backend");
                vae_backend = ggml_backend_cpu_init();
            } else {
                vae_backend = backend;
            }

            if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
                if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                    first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
                                                                            offload_params_to_cpu,
                                                                            tensor_storage_map,
                                                                            "first_stage_model",
                                                                            vae_decode_only,
                                                                            version);
                    first_stage_model->alloc_params_buffer();
                    first_stage_model->get_param_tensors(tensors, "first_stage_model");
                } else if (version == VERSION_CHROMA_RADIANCE) {
                    first_stage_model = std::make_shared<FakeVAE>(vae_backend,
                                                                  offload_params_to_cpu);
                } else {
                    first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
                                                                        offload_params_to_cpu,
                                                                        tensor_storage_map,
                                                                        "first_stage_model",
                                                                        vae_decode_only,
                                                                        false,
                                                                        version);
                    if (sd_ctx_params->vae_conv_direct) {
                        LOG_INFO("Using Conv2d direct in the vae model");
                        first_stage_model->set_conv2d_direct_enabled(true);
                    }
                    if (sd_version_is_sdxl(version) &&
                        (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale || external_vae_is_invalid)) {
                        float vae_conv_2d_scale = 1.f / 32.f;
                        LOG_WARN(
                            "No valid VAE specified with --vae or --force-sdxl-vae-conv-scale flag set, "
                            "using Conv2D scale %.3f",
                            vae_conv_2d_scale);
                        first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
                    }
                    first_stage_model->alloc_params_buffer();
                    first_stage_model->get_param_tensors(tensors, "first_stage_model");
                }
            }
            if (use_tiny_autoencoder || version == VERSION_SDXS) {
                if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                    tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend,
                                                                             offload_params_to_cpu,
                                                                             tensor_storage_map,
                                                                             "decoder",
                                                                             vae_decode_only,
                                                                             version);
                } else {
                    tae_first_stage = std::make_shared<TinyImageAutoEncoder>(vae_backend,
                                                                             offload_params_to_cpu,
                                                                             tensor_storage_map,
                                                                             "decoder.layers",
                                                                             vae_decode_only,
                                                                             version);
                    if (version == VERSION_SDXS) {
                        tae_first_stage->alloc_params_buffer();
                        tae_first_stage->get_param_tensors(tensors, "first_stage_model");
                    }
                }
                if (sd_ctx_params->vae_conv_direct) {
                    LOG_INFO("Using Conv2d direct in the tae model");
                    tae_first_stage->set_conv2d_direct_enabled(true);
                }
            }

            if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) {
                ggml_backend_t controlnet_backend = nullptr;
                if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) {
                    LOG_DEBUG("ControlNet: Using CPU backend");
                    controlnet_backend = ggml_backend_cpu_init();
                } else {
                    controlnet_backend = backend;
                }
                control_net = std::make_shared<ControlNet>(controlnet_backend,
                                                           offload_params_to_cpu,
                                                           tensor_storage_map,
                                                           version);
                if (sd_ctx_params->diffusion_conv_direct) {
                    LOG_INFO("Using Conv2d direct in the control net");
                    control_net->set_conv2d_direct_enabled(true);
                }
            }

            if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
                pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
                                                                   offload_params_to_cpu,
                                                                   tensor_storage_map,
                                                                   "pmid",
                                                                   version,
                                                                   PM_VERSION_2);
                LOG_INFO("using PhotoMaker Version 2");
            } else {
                pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
                                                                   offload_params_to_cpu,
                                                                   tensor_storage_map,
                                                                   "pmid",
                                                                   version);
            }
            if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
              if (version != VERSION_SDXL) { // kcpp
                printf("\n!!!!\nWARNING: PhotoMaker is only compatible with SDXL models. PhotoMaker will be disabled!\n!!!!\n");
              } else {
                pmid_lora               = std::make_shared<LoraModel>("pmid", backend, sd_ctx_params->photo_maker_path, "", version);
                auto lora_tensor_filter = [&](const std::string& tensor_name) {
                    if (starts_with(tensor_name, "lora.model")) {
                        return true;
                    }
                    return false;
                };
                if (!pmid_lora->load_from_file(n_threads, lora_tensor_filter)) {
                    LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
                    return false;
                }
                LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", sd_ctx_params->photo_maker_path);
                if (!model_loader.init_from_file_and_convert_name(sd_ctx_params->photo_maker_path, "pmid.")) {
                    LOG_WARN("loading stacked ID embedding from '%s' failed", sd_ctx_params->photo_maker_path);
                } else {
                    use_pmid = true;
                }
              }
            }
            if (use_pmid) {
                if (!pmid_model->alloc_params_buffer()) {
                    LOG_ERROR(" pmid model params buffer allocation failed");
                    return false;
                }
                pmid_model->get_param_tensors(tensors, "pmid");
            }

            if (sd_ctx_params->flash_attn) {
                LOG_INFO("Using flash attention");
                if(!sd_version_is_qwen_image(version)) //kcpp: edit 14feb, breaks qwen image edit
                {
                    cond_stage_model->set_flash_attention_enabled(true);
                }
                if (clip_vision) {
                    clip_vision->set_flash_attention_enabled(true);
                }
                if (first_stage_model) {
                    first_stage_model->set_flash_attention_enabled(true);
                }
                if (tae_first_stage) {
                    tae_first_stage->set_flash_attention_enabled(true);
                }
            }

            if (sd_ctx_params->flash_attn || sd_ctx_params->diffusion_flash_attn) {
                LOG_INFO("Using flash attention in the diffusion model");
                diffusion_model->set_flash_attention_enabled(true);
                if (high_noise_diffusion_model) {
                    high_noise_diffusion_model->set_flash_attention_enabled(true);
                }
            }

            diffusion_model->set_circular_axes(sd_ctx_params->circular_x, sd_ctx_params->circular_y);
            if (high_noise_diffusion_model) {
                high_noise_diffusion_model->set_circular_axes(sd_ctx_params->circular_x, sd_ctx_params->circular_y);
            }
            if (control_net) {
                control_net->set_circular_axes(sd_ctx_params->circular_x, sd_ctx_params->circular_y);
            }
            if (first_stage_model) {
                first_stage_model->set_circular_axes(sd_ctx_params->circular_x, sd_ctx_params->circular_y);
            }
            if (tae_first_stage) {
                tae_first_stage->set_circular_axes(sd_ctx_params->circular_x, sd_ctx_params->circular_y);
            }
        }

        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(10 * 1024) * 1024;  // 10M
        params.mem_buffer = nullptr;
        params.no_alloc   = false;
        // LOG_DEBUG("mem_size %u ", params.mem_size);
        struct ggml_context* ctx = ggml_init(params);  // for  alphas_cumprod and is_using_v_parameterization check
        GGML_ASSERT(ctx != nullptr);
        ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
        calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data);

        // load weights
        LOG_DEBUG("loading weights");

        std::set<std::string> ignore_tensors;
        tensors["alphas_cumprod"] = alphas_cumprod_tensor;
        if (use_tiny_autoencoder) {
            ignore_tensors.insert("first_stage_model.");
        }
        if (use_pmid) {
            ignore_tensors.insert("pmid.unet.");
        }
        ignore_tensors.insert("model.diffusion_model.__x0__");
        ignore_tensors.insert("model.diffusion_model.__32x32__");
        ignore_tensors.insert("model.diffusion_model.__index_timestep_zero__");

        if (vae_decode_only) {
            ignore_tensors.insert("first_stage_model.encoder");
            ignore_tensors.insert("first_stage_model.conv1");
            ignore_tensors.insert("first_stage_model.quant");
            ignore_tensors.insert("text_encoders.llm.visual.");
        }
        if (version == VERSION_OVIS_IMAGE) {
            ignore_tensors.insert("text_encoders.llm.vision_model.");
            ignore_tensors.insert("text_encoders.llm.visual_tokenizer.");
            ignore_tensors.insert("text_encoders.llm.vte.");
        }
        if (version == VERSION_SVD) {
            ignore_tensors.insert("conditioner.embedders.3");
        }
        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap);
        if (!success) {
            LOG_ERROR("load tensors from model loader failed");
            ggml_free(ctx);
            return false;
        }

        LOG_DEBUG("finished loaded file");

        {
            size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size();
            size_t unet_params_mem_size = diffusion_model->get_params_buffer_size();
            if (high_noise_diffusion_model) {
                unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
            }
            size_t vae_params_mem_size = 0;
            if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
                vae_params_mem_size = first_stage_model->get_params_buffer_size();
            }
            if (use_tiny_autoencoder || version == VERSION_SDXS) {
                if (use_tiny_autoencoder && !tae_first_stage->load_from_file(taesd_path_fixed, n_threads)) {
                    return false;
                }
                use_tiny_autoencoder = true;  // now the processing is identical for VERSION_SDXS
                vae_params_mem_size  = tae_first_stage->get_params_buffer_size();
            }
            size_t control_net_params_mem_size = 0;
            if (control_net) {
                if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) {
                    return false;
                }
                control_net_params_mem_size = control_net->get_params_buffer_size();
            }
            size_t pmid_params_mem_size = 0;
            if (use_pmid) {
                pmid_params_mem_size = pmid_model->get_params_buffer_size();
            }

            size_t total_params_ram_size  = 0;
            size_t total_params_vram_size = 0;
            if (ggml_backend_is_cpu(clip_backend)) {
                total_params_ram_size += clip_params_mem_size + pmid_params_mem_size;
            } else {
                total_params_vram_size += clip_params_mem_size + pmid_params_mem_size;
            }

            if (ggml_backend_is_cpu(backend)) {
                total_params_ram_size += unet_params_mem_size;
            } else {
                total_params_vram_size += unet_params_mem_size;
            }

            if (ggml_backend_is_cpu(vae_backend)) {
                total_params_ram_size += vae_params_mem_size;
            } else {
                total_params_vram_size += vae_params_mem_size;
            }

            if (ggml_backend_is_cpu(control_net_backend)) {
                total_params_ram_size += control_net_params_mem_size;
            } else {
                total_params_vram_size += control_net_params_mem_size;
            }

            size_t total_params_size = total_params_ram_size + total_params_vram_size;
            LOG_INFO(
                "total params memory size = %.2fMB (VRAM %.2fMB, RAM %.2fMB): "
                "text_encoders %.2fMB(%s), diffusion_model %.2fMB(%s), vae %.2fMB(%s), controlnet %.2fMB(%s), pmid %.2fMB(%s)",
                total_params_size / 1024.0 / 1024.0,
                total_params_vram_size / 1024.0 / 1024.0,
                total_params_ram_size / 1024.0 / 1024.0,
                clip_params_mem_size / 1024.0 / 1024.0,
                ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM",
                unet_params_mem_size / 1024.0 / 1024.0,
                ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
                vae_params_mem_size / 1024.0 / 1024.0,
                ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM",
                control_net_params_mem_size / 1024.0 / 1024.0,
                ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM",
                pmid_params_mem_size / 1024.0 / 1024.0,
                ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
        }

        // init denoiser
        {
            prediction_t pred_type = sd_ctx_params->prediction;

            if (pred_type == PREDICTION_COUNT) {
                if (sd_version_is_sd2(version)) {
                    // check is_using_v_parameterization_for_sd2
                    if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
                        pred_type = V_PRED;
                    } else {
                        pred_type = EPS_PRED;
                    }
                } else if (sd_version_is_sdxl(version)) {
                    if (tensor_storage_map.find("edm_vpred.sigma_max") != tensor_storage_map.end()) {
                        // CosXL models
                        // TODO: get sigma_min and sigma_max values from file
                        pred_type = EDM_V_PRED;
                    } else if (tensor_storage_map.find("v_pred") != tensor_storage_map.end()) {
                        pred_type = V_PRED;
                    } else {
                        pred_type = EPS_PRED;
                    }
                } else if (sd_version_is_sd3(version) ||
                           sd_version_is_wan(version) ||
                           sd_version_is_qwen_image(version) ||
                           sd_version_is_anima(version) ||
                           sd_version_is_z_image(version)) {
                    pred_type = FLOW_PRED;
                    if (sd_version_is_wan(version)) {
                        default_flow_shift = 5.f;
                    } else {
                        default_flow_shift = 3.f;
                    }
                } else if (sd_version_is_flux(version)) {
                    pred_type = FLUX_FLOW_PRED;

                    default_flow_shift = 1.0f;  // TODO: validate
                    for (const auto& [name, tensor_storage] : tensor_storage_map) {
                        if (starts_with(name, "model.diffusion_model.guidance_in.in_layer.weight")) {
                            default_flow_shift = 1.15f;
                            break;
                        }
                    }
                } else if (sd_version_is_flux2(version)) {
                    pred_type = FLUX2_FLOW_PRED;
                } else {
                    pred_type = EPS_PRED;
                }
            }

            switch (pred_type) {
                case EPS_PRED:
                    LOG_INFO("running in eps-prediction mode");
                    break;
                case V_PRED:
                    LOG_INFO("running in v-prediction mode");
                    denoiser = std::make_shared<CompVisVDenoiser>();
                    break;
                case EDM_V_PRED:
                    LOG_INFO("running in v-prediction EDM mode");
                    denoiser = std::make_shared<EDMVDenoiser>();
                    break;
                case FLOW_PRED: {
                    LOG_INFO("running in FLOW mode");
                    denoiser = std::make_shared<DiscreteFlowDenoiser>();
                    break;
                }
                case FLUX_FLOW_PRED: {
                    LOG_INFO("running in Flux FLOW mode");
                    denoiser = std::make_shared<FluxFlowDenoiser>();
                    break;
                }
                case FLUX2_FLOW_PRED: {
                    LOG_INFO("running in Flux2 FLOW mode");
                    denoiser = std::make_shared<Flux2FlowDenoiser>();
                    break;
                }
                default: {
                    LOG_ERROR("Unknown predition type %i", pred_type);
                    ggml_free(ctx);
                    return false;
                }
            }

            auto comp_vis_denoiser = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser);
            if (comp_vis_denoiser) {
                for (int i = 0; i < TIMESTEPS; i++) {
                    comp_vis_denoiser->sigmas[i]     = std::sqrt((1 - ((float*)alphas_cumprod_tensor->data)[i]) / ((float*)alphas_cumprod_tensor->data)[i]);
                    comp_vis_denoiser->log_sigmas[i] = std::log(comp_vis_denoiser->sigmas[i]);
                }
            }
        }

        ggml_free(ctx);
        use_tiny_autoencoder = use_tiny_autoencoder && !tae_preview_only;
        return true;
    }

    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) {
        struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
        ggml_set_f32(x_t, 0.5);
        struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
        ggml_set_f32(c, 0.5);

        struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1);
        ggml_set_f32(timesteps, 999);

        struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : nullptr;
        if (concat != nullptr) {
            ggml_set_f32(concat, 0);
        }

        int64_t t0              = ggml_time_ms();
        struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
        DiffusionParams diffusion_params;
        diffusion_params.x         = x_t;
        diffusion_params.timesteps = timesteps;
        diffusion_params.context   = c;
        diffusion_params.c_concat  = concat;
        diffusion_model->compute(n_threads, diffusion_params, &out);
        diffusion_model->free_compute_buffer();

        double result = 0.f;
        {
            float* vec_x   = (float*)x_t->data;
            float* vec_out = (float*)out->data;

            int64_t n = ggml_nelements(out);

            for (int i = 0; i < n; i++) {
                result += ((double)vec_out[i] - (double)vec_x[i]);
            }
            result /= n;
        }
        int64_t t1 = ggml_time_ms();
        LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000);
        return result < -1;
    }

    std::shared_ptr<LoraModel> load_lora_model_from_file(const std::string& lora_id,
                                                         float multiplier,
                                                         ggml_backend_t backend,
                                                         std::string stage = "",
                                                         LoraModel::filter_t lora_tensor_filter = nullptr) {
        // kcpp
        // first check the cache
        bool kcpp_at_runtime = (stage != "");
        std::string lora_key = "|" + stage + "|" + lora_id;
        if (kcpp_at_runtime) {
            auto it = kcpp_lora_cache.find(lora_key);
            if (it != kcpp_lora_cache.end()) {
                if (it->second) {
                    it->second->multiplier = multiplier;
                }
                return it->second;
            }
        }
        // by construction, kcpp will always find the preloaded LoRAs on the cache

        std::string lora_path             = lora_id;
        static std::string high_noise_tag = "|high_noise|";
        bool is_high_noise                = false;
        if (starts_with(lora_path, high_noise_tag)) {
            lora_path     = lora_path.substr(high_noise_tag.size());
            is_high_noise = true;
            LOG_DEBUG("high noise lora: %s", lora_path.c_str());
        }
        auto lora = std::make_shared<LoraModel>(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version);
        if (!lora->load_from_file(n_threads, lora_tensor_filter)) {
            LOG_WARN("load lora tensors from %s failed", lora_path.c_str());
            // also cache negatives to avoid I/O at runtime
            lora = nullptr;
            if (kcpp_at_runtime)
                kcpp_lora_cache[lora_key] = lora;
            return lora;
        }

        lora->multiplier = multiplier;
        if (kcpp_at_runtime)
            kcpp_lora_cache[lora_key] = lora;
        return lora;
    }

    void apply_loras_immediately(const std::unordered_map<std::string, float>& lora_state) {
        std::unordered_map<std::string, float> lora_state_diff;
        for (auto& kv : lora_state) {
            const std::string& lora_name = kv.first;
            float multiplier             = kv.second;
            lora_state_diff[lora_name] += multiplier;
        }
        for (auto& kv : curr_lora_state) {
            const std::string& lora_name = kv.first;
            float curr_multiplier        = kv.second;
            lora_state_diff[lora_name] -= curr_multiplier;
        }

        if (lora_state_diff.empty()) {
            return;
        }

        LOG_INFO("apply lora immediately");

        size_t rm = lora_state_diff.size() - lora_state.size();
        if (rm != 0) {
            LOG_INFO("attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm);
        } else {
            LOG_INFO("attempting to apply %lu LoRAs", lora_state.size());
        }

        for (auto& kv : lora_state_diff) {
            int64_t t0 = ggml_time_ms();

            auto lora = load_lora_model_from_file(kv.first, kv.second, backend);
            if (!lora || lora->lora_tensors.empty()) {
                continue;
            }
            lora->apply(tensors, version, n_threads);
            lora->free_params_buffer();

            int64_t t1 = ggml_time_ms();

            LOG_INFO("lora '%s' applied, taking %.2fs", kv.first.c_str(), (t1 - t0) * 1.0f / 1000);
        }

        curr_lora_state = lora_state;
    }

    void apply_loras_at_runtime(const std::unordered_map<std::string, float>& lora_state) {
        cond_stage_lora_models.clear();
        diffusion_lora_models.clear();
        first_stage_lora_models.clear();
        if (cond_stage_model) {
            cond_stage_model->set_weight_adapter(nullptr);
        }
        if (diffusion_model) {
            diffusion_model->set_weight_adapter(nullptr);
        }
        if (high_noise_diffusion_model) {
            high_noise_diffusion_model->set_weight_adapter(nullptr);
        }
        if (first_stage_model) {
            first_stage_model->set_weight_adapter(nullptr);
        }
        if (lora_state.empty()) {
            return;
        }
        LOG_INFO("apply lora at runtime");
        if (cond_stage_model) {
            std::vector<std::shared_ptr<LoraModel>> lora_models;
            auto lora_state_diff = lora_state;
            for (auto& lora_model : cond_stage_lora_models) {
                auto iter = lora_state_diff.find(lora_model->lora_id);

                if (iter != lora_state_diff.end()) {
                    lora_model->multiplier = iter->second;
                    lora_models.push_back(lora_model);
                    lora_state_diff.erase(iter);
                }
            }
            cond_stage_lora_models  = lora_models;
            auto lora_tensor_filter = [&](const std::string& tensor_name) {
                if (is_cond_stage_model_name(tensor_name)) {
                    return true;
                }
                return false;
            };
            for (auto& kv : lora_state_diff) {
                const std::string& lora_id = kv.first;
                float multiplier           = kv.second;

                auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, "cond_stage", lora_tensor_filter);
                if (lora && !lora->lora_tensors.empty()) {
                    lora->preprocess_lora_tensors(tensors);
                    cond_stage_lora_models.push_back(lora);
                }
            }
            auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(cond_stage_lora_models);
            cond_stage_model->set_weight_adapter(multi_lora_adapter);
        }
        if (diffusion_model) {
            std::vector<std::shared_ptr<LoraModel>> lora_models;
            auto lora_state_diff = lora_state;
            for (auto& lora_model : diffusion_lora_models) {
                auto iter = lora_state_diff.find(lora_model->lora_id);

                if (iter != lora_state_diff.end()) {
                    lora_model->multiplier = iter->second;
                    lora_models.push_back(lora_model);
                    lora_state_diff.erase(iter);
                }
            }
            diffusion_lora_models   = lora_models;
            auto lora_tensor_filter = [&](const std::string& tensor_name) {
                if (is_diffusion_model_name(tensor_name)) {
                    return true;
                }
                return false;
            };
            for (auto& kv : lora_state_diff) {
                const std::string& lora_name = kv.first;
                float multiplier             = kv.second;

                auto lora = load_lora_model_from_file(lora_name, multiplier, backend, "diffusion", lora_tensor_filter);
                if (lora && !lora->lora_tensors.empty()) {
                    lora->preprocess_lora_tensors(tensors);
                    diffusion_lora_models.push_back(lora);
                }
            }
            auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(diffusion_lora_models);
            diffusion_model->set_weight_adapter(multi_lora_adapter);
            if (high_noise_diffusion_model) {
                high_noise_diffusion_model->set_weight_adapter(multi_lora_adapter);
            }
        }

        if (first_stage_model) {
            std::vector<std::shared_ptr<LoraModel>> lora_models;
            auto lora_state_diff = lora_state;
            for (auto& lora_model : first_stage_lora_models) {
                auto iter = lora_state_diff.find(lora_model->lora_id);

                if (iter != lora_state_diff.end()) {
                    lora_model->multiplier = iter->second;
                    lora_models.push_back(lora_model);
                    lora_state_diff.erase(iter);
                }
            }
            first_stage_lora_models = lora_models;
            auto lora_tensor_filter = [&](const std::string& tensor_name) {
                if (is_first_stage_model_name(tensor_name)) {
                    return true;
                }
                return false;
            };
            for (auto& kv : lora_state_diff) {
                const std::string& lora_name = kv.first;
                float multiplier             = kv.second;

                auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, "first_stage", lora_tensor_filter);
                if (lora && !lora->lora_tensors.empty()) {
                    lora->preprocess_lora_tensors(tensors);
                    first_stage_lora_models.push_back(lora);
                }
            }
            auto multi_lora_adapter = std::make_shared<MultiLoraAdapter>(first_stage_lora_models);
            first_stage_model->set_weight_adapter(multi_lora_adapter);
        }
    }

    void lora_stat() {
        if (!cond_stage_lora_models.empty()) {
            LOG_INFO("cond_stage_lora_models:");
            for (auto& lora_model : cond_stage_lora_models) {
                lora_model->stat();
            }
        }

        if (!diffusion_lora_models.empty()) {
            LOG_INFO("diffusion_lora_models:");
            for (auto& lora_model : diffusion_lora_models) {
                lora_model->stat();
            }
        }

        if (!first_stage_lora_models.empty()) {
            LOG_INFO("first_stage_lora_models:");
            for (auto& lora_model : first_stage_lora_models) {
                lora_model->stat();
            }
        }
    }

    void apply_loras(const sd_lora_t* loras, uint32_t lora_count) {
        std::unordered_map<std::string, float> lora_f2m;
        for (uint32_t i = 0; i < lora_count; i++) {
            std::string lora_id = SAFE_STR(loras[i].path);
            if (loras[i].is_high_noise) {
                lora_id = "|high_noise|" + lora_id;
            }
            lora_f2m[lora_id] = loras[i].multiplier;
            LOG_DEBUG("lora %s:%.2f", lora_id.c_str(), loras[i].multiplier);
        }
        int64_t t0 = ggml_time_ms();
        if (apply_lora_immediately) {
            apply_loras_immediately(lora_f2m);
        } else {
            apply_loras_at_runtime(lora_f2m);
        }
        int64_t t1 = ggml_time_ms();
        if (!lora_f2m.empty()) {
            LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
        }
    }

    SDCondition get_pmid_conditon(ggml_context* work_ctx,
                                  sd_pm_params_t pm_params,
                                  ConditionerParams& condition_params) {
        SDCondition id_cond;
        if (use_pmid) {
            if (!pmid_lora->applied) {
                int64_t t0 = ggml_time_ms();
                pmid_lora->apply(tensors, version, n_threads);
                int64_t t1         = ggml_time_ms();
                pmid_lora->applied = true;
                LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
                if (free_params_immediately) {
                    pmid_lora->free_params_buffer();
                }
            }
            // preprocess input id images
            bool pmv2 = pmid_model->get_version() == PM_VERSION_2;
            if (pm_params.id_images_count > 0) {
                int clip_image_size        = 224;
                pmid_model->style_strength = pm_params.style_strength;

                auto id_image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, clip_image_size, clip_image_size, 3, pm_params.id_images_count);

                std::vector<sd_image_f32_t> processed_id_images;
                for (int i = 0; i < pm_params.id_images_count; i++) {
                    sd_image_f32_t id_image           = sd_image_t_to_sd_image_f32_t(pm_params.id_images[i]);
                    sd_image_f32_t processed_id_image = clip_preprocess(id_image, clip_image_size, clip_image_size);
                    free(id_image.data);
                    id_image.data = nullptr;
                    processed_id_images.push_back(processed_id_image);
                }

                ggml_ext_tensor_iter(id_image_tensor, [&](ggml_tensor* id_image_tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
                    float value = sd_image_get_f32(processed_id_images[i3], i0, i1, i2, false);
                    ggml_ext_tensor_set_f32(id_image_tensor, value, i0, i1, i2, i3);
                });

                for (auto& image : processed_id_images) {
                    free(image.data);
                    image.data = nullptr;
                }
                processed_id_images.clear();

                int64_t t0                      = ggml_time_ms();
                condition_params.num_input_imgs = pm_params.id_images_count;
                auto cond_tup                   = cond_stage_model->get_learned_condition_with_trigger(work_ctx,
                                                                                                       n_threads,
                                                                                                       condition_params);
                id_cond                         = std::get<0>(cond_tup);
                auto class_tokens_mask          = std::get<1>(cond_tup);
                struct ggml_tensor* id_embeds   = nullptr;
                if (pmv2 && pm_params.id_embed_path != nullptr) {
                    id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path);
                }
                if (pmv2 && id_embeds == nullptr) {
                    LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
                    LOG_WARN("Turn off PhotoMaker");
                    use_pmid = false;
                } else {
                    if (pmv2 && pm_params.id_images_count != id_embeds->ne[1]) {
                        LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). You should run face_detect.py again.", pm_params.id_images_count, id_embeds->ne[1]);
                        LOG_WARN("Turn off PhotoMaker");
                        use_pmid = false;
                    } else {
                        ggml_tensor* res = nullptr;
                        pmid_model->compute(n_threads, id_image_tensor, id_cond.c_crossattn, id_embeds, class_tokens_mask, &res, work_ctx);
                        id_cond.c_crossattn = res;
                        int64_t t1          = ggml_time_ms();
                        LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
                        if (free_params_immediately) {
                            pmid_model->free_params_buffer();
                        }
                        // Encode input prompt without the trigger word for delayed conditioning
                        condition_params.text = cond_stage_model->remove_trigger_from_prompt(work_ctx, condition_params.text);
                    }
                }
            } else {
                LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
                LOG_WARN("Turn off PhotoMaker");
                use_pmid = false;
            }
        }
        return id_cond;
    }

    ggml_tensor* get_clip_vision_output(ggml_context* work_ctx,
                                        sd_image_t init_image,
                                        bool return_pooled   = true,
                                        int clip_skip        = -1,
                                        bool zero_out_masked = false) {
        ggml_tensor* output = nullptr;
        if (zero_out_masked) {
            if (return_pooled) {
                output = ggml_new_tensor_1d(work_ctx,
                                            GGML_TYPE_F32,
                                            clip_vision->vision_model.projection_dim);
            } else {
                output = ggml_new_tensor_2d(work_ctx,
                                            GGML_TYPE_F32,
                                            clip_vision->vision_model.hidden_size,
                                            257);
            }

            ggml_set_f32(output, 0.f);
        } else {
            sd_image_f32_t image         = sd_image_t_to_sd_image_f32_t(init_image);
            sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size, clip_vision->vision_model.image_size);
            free(image.data);
            image.data = nullptr;

            ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
            sd_image_f32_to_ggml_tensor(resized_image, pixel_values, false);
            free(resized_image.data);
            resized_image.data = nullptr;

            // print_ggml_tensor(pixel_values);
            clip_vision->compute(n_threads, pixel_values, return_pooled, clip_skip, &output, work_ctx);
            // print_ggml_tensor(c_crossattn);
        }
        return output;
    }

    SDCondition get_svd_condition(ggml_context* work_ctx,
                                  sd_image_t init_image,
                                  int width,
                                  int height,
                                  int fps                  = 6,
                                  int motion_bucket_id     = 127,
                                  float augmentation_level = 0.f,
                                  bool zero_out_masked     = false) {
        // c_crossattn
        int64_t t0                      = ggml_time_ms();
        struct ggml_tensor* c_crossattn = get_clip_vision_output(work_ctx, init_image, true, -1, zero_out_masked);

        // c_concat
        struct ggml_tensor* c_concat = nullptr;
        {
            if (zero_out_masked) {
                c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / get_vae_scale_factor(), height / get_vae_scale_factor(), 4, 1);
                ggml_set_f32(c_concat, 0.f);
            } else {
                ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);

                if (width != init_image.width || height != init_image.height) {
                    sd_image_f32_t image         = sd_image_t_to_sd_image_f32_t(init_image);
                    sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height);
                    free(image.data);
                    image.data = nullptr;
                    sd_image_f32_to_ggml_tensor(resized_image, init_img, false);
                    free(resized_image.data);
                    resized_image.data = nullptr;
                } else {
                    sd_image_to_ggml_tensor(init_image, init_img);
                }
                if (augmentation_level > 0.f) {
                    struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img);
                    ggml_ext_im_set_randn_f32(noise, rng);
                    // encode_pixels += torch.randn_like(pixels) * augmentation_level
                    ggml_ext_tensor_scale_inplace(noise, augmentation_level);
                    ggml_ext_tensor_add_inplace(init_img, noise);
                }
                ggml_tensor* moments = vae_encode(work_ctx, init_img);
                c_concat             = get_first_stage_encoding(work_ctx, moments);
            }
        }

        // y
        struct ggml_tensor* y = nullptr;
        {
            y                            = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels());
            int out_dim                  = 256;
            int fps_id                   = fps - 1;
            std::vector<float> timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level};
            set_timestep_embedding(timesteps, y, out_dim);
        }
        int64_t t1 = ggml_time_ms();
        LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0);
        return {c_crossattn, y, c_concat};
    }

    std::vector<float> process_timesteps(const std::vector<float>& timesteps,
                                         ggml_tensor* init_latent,
                                         ggml_tensor* denoise_mask) {
        if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") {
            auto new_timesteps = std::vector<float>(init_latent->ne[2], timesteps[0]);

            if (denoise_mask != nullptr) {
                float value = ggml_ext_tensor_get_f32(denoise_mask, 0, 0, 0, 0);
                if (value == 0.f) {
                    new_timesteps[0] = 0.f;
                }
            }
            return new_timesteps;
        } else {
            return timesteps;
        }
    }

    // a = a * mask + b * (1 - mask)
    void apply_mask(ggml_tensor* a, ggml_tensor* b, ggml_tensor* mask) {
        for (int64_t i0 = 0; i0 < a->ne[0]; i0++) {
            for (int64_t i1 = 0; i1 < a->ne[1]; i1++) {
                for (int64_t i2 = 0; i2 < a->ne[2]; i2++) {
                    for (int64_t i3 = 0; i3 < a->ne[3]; i3++) {
                        float a_value    = ggml_ext_tensor_get_f32(a, i0, i1, i2, i3);
                        float b_value    = ggml_ext_tensor_get_f32(b, i0, i1, i2, i3);
                        float mask_value = ggml_ext_tensor_get_f32(mask, i0 % mask->ne[0], i1 % mask->ne[1], i2 % mask->ne[2], i3 % mask->ne[3]);
                        ggml_ext_tensor_set_f32(a, a_value * mask_value + b_value * (1 - mask_value), i0, i1, i2, i3);
                    }
                }
            }
        }
    }

    void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
        sd_progress_cb_t cb = sd_get_progress_callback();
        void* cbd           = sd_get_progress_callback_data();
        sd_set_progress_callback((sd_progress_cb_t)suppress_pp, nullptr);
        sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing);
        sd_set_progress_callback(cb, cbd);
    }

    void preview_image(ggml_context* work_ctx,
                       int step,
                       struct ggml_tensor* latents,
                       enum SDVersion version,
                       preview_t preview_mode,
                       ggml_tensor* result,
                       std::function<void(int, int, sd_image_t*, bool, void*)> step_callback,
                       void* step_callback_data,
                       bool is_noisy) {
        const uint32_t channel = 3;
        uint32_t width         = static_cast<uint32_t>(latents->ne[0]);
        uint32_t height        = static_cast<uint32_t>(latents->ne[1]);
        uint32_t dim           = static_cast<uint32_t>(latents->ne[ggml_n_dims(latents) - 1]);

        if (preview_mode == PREVIEW_PROJ) {
            int patch_sz                           = 1;
            const float(*latent_rgb_proj)[channel] = nullptr;
            float* latent_rgb_bias                 = nullptr;

            if (dim == 128) {
                if (sd_version_is_flux2(version)) {
                    latent_rgb_proj = flux2_latent_rgb_proj;
                    latent_rgb_bias = flux2_latent_rgb_bias;
                    patch_sz        = 2;
                }
            } else if (dim == 48) {
                if (sd_version_is_wan(version)) {
                    latent_rgb_proj = wan_22_latent_rgb_proj;
                    latent_rgb_bias = wan_22_latent_rgb_bias;
                } else {
                    LOG_WARN("No latent to RGB projection known for this model");
                    // unknown model
                    return;
                }
            } else if (dim == 16) {
                // 16 channels VAE -> Flux or SD3

                if (sd_version_is_sd3(version)) {
                    latent_rgb_proj = sd3_latent_rgb_proj;
                    latent_rgb_bias = sd3_latent_rgb_bias;
                } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
                    latent_rgb_proj = flux_latent_rgb_proj;
                    latent_rgb_bias = flux_latent_rgb_bias;
                } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                    latent_rgb_proj = wan_21_latent_rgb_proj;
                    latent_rgb_bias = wan_21_latent_rgb_bias;
                } else {
                    LOG_WARN("No latent to RGB projection known for this model");
                    // unknown model
                    return;
                }

            } else if (dim == 4) {
                // 4 channels VAE
                if (sd_version_is_sdxl(version)) {
                    latent_rgb_proj = sdxl_latent_rgb_proj;
                    latent_rgb_bias = sdxl_latent_rgb_bias;
                } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
                    latent_rgb_proj = sd_latent_rgb_proj;
                    latent_rgb_bias = sd_latent_rgb_bias;
                } else {
                    // unknown model
                    LOG_WARN("No latent to RGB projection known for this model");
                    return;
                }
            } else if (dim == 3) {
                // Do nothing, assuming already RGB latents
            } else {
                LOG_WARN("No latent to RGB projection known for this model");
                // unknown latent space
                return;
            }

            uint32_t frames = 1;
            if (ggml_n_dims(latents) == 4) {
                frames = static_cast<uint32_t>(latents->ne[2]);
            }

            uint32_t img_width  = width * patch_sz;
            uint32_t img_height = height * patch_sz;

            uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t));

            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz);
            sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
            for (uint32_t i = 0; i < frames; i++) {
                images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel};
            }
            step_callback(step, frames, images, is_noisy, step_callback_data);
            free(data);
            free(images);
        } else {
            if (preview_mode == PREVIEW_VAE) {
                process_latent_out(latents);
                if (vae_tiling_params.enabled) {
                    // split latent in 32x32 tiles and compute in several steps
                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                        return first_stage_model->compute(n_threads, in, true, &out, nullptr);
                    };
                    silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);

                } else {
                    first_stage_model->compute(n_threads, latents, true, &result, work_ctx);
                }

                first_stage_model->free_compute_buffer();
                process_vae_output_tensor(result);
                process_latent_in(latents);
            } else if (preview_mode == PREVIEW_TAE) {
                if (tae_first_stage == nullptr) {
                    LOG_WARN("TAE not found for preview");
                    return;
                }
                if (vae_tiling_params.enabled) {
                    // split latent in 64x64 tiles and compute in several steps
                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                        return tae_first_stage->compute(n_threads, in, true, &out, nullptr);
                    };
                    silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
                } else {
                    tae_first_stage->compute(n_threads, latents, true, &result, work_ctx);
                }
                tae_first_stage->free_compute_buffer();
            } else {
                return;
            }

            ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
            uint32_t frames = 1;
            if (ggml_n_dims(latents) == 4) {
                frames = static_cast<uint32_t>(result->ne[2]);
            }

            sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
            // print_ggml_tensor(result,true);
            for (size_t i = 0; i < frames; i++) {
                images[i].width   = static_cast<uint32_t>(result->ne[0]);
                images[i].height  = static_cast<uint32_t>(result->ne[1]);
                images[i].channel = 3;
                images[i].data    = ggml_tensor_to_sd_image(result, static_cast<int>(i), ggml_n_dims(latents) == 4);
            }

            step_callback(step, frames, images, is_noisy, step_callback_data);

            ggml_ext_tensor_scale_inplace(result, 0);
            for (uint32_t i = 0; i < frames; i++) {
                free(images[i].data);
            }

            free(images);
        }
    }

    ggml_tensor* sample(ggml_context* work_ctx,
                        std::shared_ptr<DiffusionModel> work_diffusion_model,
                        bool inverse_noise_scaling,
                        ggml_tensor* init_latent,
                        ggml_tensor* noise,
                        SDCondition cond,
                        SDCondition uncond,
                        SDCondition img_cond,
                        ggml_tensor* control_hint,
                        float control_strength,
                        sd_guidance_params_t guidance,
                        float eta,
                        int shifted_timestep,
                        sample_method_t method,
                        const std::vector<float>& sigmas,
                        int start_merge_step,
                        SDCondition id_cond,
                        std::vector<ggml_tensor*> ref_latents = {},
                        bool increase_ref_index               = false,
                        ggml_tensor* denoise_mask             = nullptr,
                        ggml_tensor* vace_context             = nullptr,
                        float vace_strength                   = 1.f,
                        const sd_cache_params_t* cache_params = nullptr) {
        if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) {
            LOG_WARN("timestep shifting is only supported for SDXL models!");
            shifted_timestep = 0;
        }
        std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);

        float cfg_scale = guidance.txt_cfg;
        if (cfg_scale < 1.f) {
            if (cfg_scale == 0.f) {
                // Diffusers follow the convention from the original paper
                // (https://arxiv.org/abs/2207.12598v1), so many distilled model docs
                // recommend 0 as guidance; warn the user that it'll disable prompt folowing
                LOG_WARN("unconditioned mode, images won't follow the prompt (use cfg-scale=1 for distilled models)");
            } else {
                LOG_WARN("cfg value out of expected range may produce unexpected results");
            }
        }

        float img_cfg_scale = std::isfinite(guidance.img_cfg) ? guidance.img_cfg : guidance.txt_cfg;
        float slg_scale     = guidance.slg.scale;

        if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) {
            LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance...");
            img_cfg_scale = cfg_scale;
        }

        EasyCacheState easycache_state;
        UCacheState ucache_state;
        CacheDitConditionState cachedit_state;
        bool easycache_enabled = false;
        bool ucache_enabled    = false;
        bool cachedit_enabled  = false;

        if (cache_params != nullptr && cache_params->mode != SD_CACHE_DISABLED) {
            bool percent_valid = true;
            if (cache_params->mode == SD_CACHE_EASYCACHE || cache_params->mode == SD_CACHE_UCACHE) {
                percent_valid = cache_params->start_percent >= 0.0f &&
                                cache_params->start_percent < 1.0f &&
                                cache_params->end_percent > 0.0f &&
                                cache_params->end_percent <= 1.0f &&
                                cache_params->start_percent < cache_params->end_percent;
            }

            if (!percent_valid) {
                LOG_WARN("Cache disabled due to invalid percent range (start=%.3f, end=%.3f)",
                         cache_params->start_percent,
                         cache_params->end_percent);
            } else if (cache_params->mode == SD_CACHE_EASYCACHE) {
                bool easycache_supported = sd_version_is_dit(version);
                if (!easycache_supported) {
                    LOG_WARN("EasyCache requested but not supported for this model type");
                } else {
                    EasyCacheConfig easycache_config;
                    easycache_config.enabled         = true;
                    easycache_config.reuse_threshold = std::max(0.0f, cache_params->reuse_threshold);
                    easycache_config.start_percent   = cache_params->start_percent;
                    easycache_config.end_percent     = cache_params->end_percent;
                    easycache_state.init(easycache_config, denoiser.get());
                    if (easycache_state.enabled()) {
                        easycache_enabled = true;
                        LOG_INFO("EasyCache enabled - threshold: %.3f, start: %.2f, end: %.2f",
                                 easycache_config.reuse_threshold,
                                 easycache_config.start_percent,
                                 easycache_config.end_percent);
                    } else {
                        LOG_WARN("EasyCache requested but could not be initialized for this run");
                    }
                }
            } else if (cache_params->mode == SD_CACHE_UCACHE) {
                bool ucache_supported = sd_version_is_unet(version);
                if (!ucache_supported) {
                    LOG_WARN("UCache requested but not supported for this model type (only UNET models)");
                } else {
                    UCacheConfig ucache_config;
                    ucache_config.enabled                = true;
                    ucache_config.reuse_threshold        = std::max(0.0f, cache_params->reuse_threshold);
                    ucache_config.start_percent          = cache_params->start_percent;
                    ucache_config.end_percent            = cache_params->end_percent;
                    ucache_config.error_decay_rate       = std::max(0.0f, std::min(1.0f, cache_params->error_decay_rate));
                    ucache_config.use_relative_threshold = cache_params->use_relative_threshold;
                    ucache_config.reset_error_on_compute = cache_params->reset_error_on_compute;
                    ucache_state.init(ucache_config, denoiser.get());
                    if (ucache_state.enabled()) {
                        ucache_enabled = true;
                        LOG_INFO("UCache enabled - threshold: %.3f, start: %.2f, end: %.2f, decay: %.2f, relative: %s, reset: %s",
                                 ucache_config.reuse_threshold,
                                 ucache_config.start_percent,
                                 ucache_config.end_percent,
                                 ucache_config.error_decay_rate,
                                 ucache_config.use_relative_threshold ? "true" : "false",
                                 ucache_config.reset_error_on_compute ? "true" : "false");
                    } else {
                        LOG_WARN("UCache requested but could not be initialized for this run");
                    }
                }
            } else if (cache_params->mode == SD_CACHE_DBCACHE ||
                       cache_params->mode == SD_CACHE_TAYLORSEER ||
                       cache_params->mode == SD_CACHE_CACHE_DIT) {
                bool cachedit_supported = sd_version_is_dit(version);
                if (!cachedit_supported) {
                    LOG_WARN("CacheDIT requested but not supported for this model type (only DiT models)");
                } else {
                    DBCacheConfig dbcfg;
                    dbcfg.enabled                     = (cache_params->mode == SD_CACHE_DBCACHE ||
                                     cache_params->mode == SD_CACHE_CACHE_DIT);
                    dbcfg.Fn_compute_blocks           = cache_params->Fn_compute_blocks;
                    dbcfg.Bn_compute_blocks           = cache_params->Bn_compute_blocks;
                    dbcfg.residual_diff_threshold     = cache_params->residual_diff_threshold;
                    dbcfg.max_warmup_steps            = cache_params->max_warmup_steps;
                    dbcfg.max_cached_steps            = cache_params->max_cached_steps;
                    dbcfg.max_continuous_cached_steps = cache_params->max_continuous_cached_steps;
                    if (cache_params->scm_mask != nullptr && strlen(cache_params->scm_mask) > 0) {
                        dbcfg.steps_computation_mask = parse_scm_mask(cache_params->scm_mask);
                    }
                    dbcfg.scm_policy_dynamic = cache_params->scm_policy_dynamic;

                    TaylorSeerConfig tcfg;
                    tcfg.enabled             = (cache_params->mode == SD_CACHE_TAYLORSEER ||
                                    cache_params->mode == SD_CACHE_CACHE_DIT);
                    tcfg.n_derivatives       = cache_params->taylorseer_n_derivatives;
                    tcfg.skip_interval_steps = cache_params->taylorseer_skip_interval;

                    cachedit_state.init(dbcfg, tcfg);
                    if (cachedit_state.enabled()) {
                        cachedit_enabled = true;
                        LOG_INFO("CacheDIT enabled - mode: %s, Fn: %d, Bn: %d, threshold: %.3f, warmup: %d",
                                 cache_params->mode == SD_CACHE_CACHE_DIT ? "DBCache+TaylorSeer" : (cache_params->mode == SD_CACHE_DBCACHE ? "DBCache" : "TaylorSeer"),
                                 dbcfg.Fn_compute_blocks,
                                 dbcfg.Bn_compute_blocks,
                                 dbcfg.residual_diff_threshold,
                                 dbcfg.max_warmup_steps);
                    } else {
                        LOG_WARN("CacheDIT requested but could not be initialized for this run");
                    }
                }
            }
        }

        if (ucache_enabled) {
            ucache_state.set_sigmas(sigmas);
        }

        if (cachedit_enabled) {
            cachedit_state.set_sigmas(sigmas);
        }

        size_t steps          = sigmas.size() - 1;
        struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent);
        copy_ggml_tensor(x, init_latent);

        if (noise) {
            x = denoiser->noise_scaling(sigmas[0], noise, x);
        }

        struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x);

        bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != nullptr;
        bool has_img_cond      = cfg_scale != img_cfg_scale && img_cond.c_crossattn != nullptr;
        bool has_skiplayer     = slg_scale != 0.0 && skip_layers.size() > 0;

        // denoise wrapper
        struct ggml_tensor* out_cond     = ggml_dup_tensor(work_ctx, x);
        struct ggml_tensor* out_uncond   = nullptr;
        struct ggml_tensor* out_skip     = nullptr;
        struct ggml_tensor* out_img_cond = nullptr;

        if (has_unconditioned) {
            out_uncond = ggml_dup_tensor(work_ctx, x);
        }
        if (has_skiplayer) {
            if (sd_version_is_dit(version)) {
                out_skip = ggml_dup_tensor(work_ctx, x);
            } else {
                has_skiplayer = false;
                LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
            }
        }
        if (has_img_cond) {
            out_img_cond = ggml_dup_tensor(work_ctx, x);
        }
        struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);

        int64_t t0 = ggml_time_us();

        struct ggml_tensor* preview_tensor = nullptr;
        auto sd_preview_mode               = sd_get_preview_mode();
        if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
            int64_t W = x->ne[0] * get_vae_scale_factor();
            int64_t H = x->ne[1] * get_vae_scale_factor();
            if (ggml_n_dims(x) == 4) {
                // assuming video mode (if batch processing gets implemented this will break)
                int64_t T = x->ne[2];
                if (sd_version_is_wan(version)) {
                    T = ((T - 1) * 4) + 1;
                }
                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
                                                    W,
                                                    H,
                                                    T,
                                                    3);
            } else {
                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
                                                    W,
                                                    H,
                                                    3,
                                                    x->ne[3]);
            }
        }

        auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
            auto sd_preview_cb      = sd_get_preview_callback();
            auto sd_preview_cb_data = sd_get_preview_callback_data();
            auto sd_preview_mode    = sd_get_preview_mode();
            if (step == 1 || step == -1) {
                pretty_progress(0, (int)steps, 0);
            }

            DiffusionParams diffusion_params;

            const bool easycache_step_active = easycache_enabled && step > 0;
            int easycache_step_index         = easycache_step_active ? (step - 1) : -1;
            if (easycache_step_active) {
                easycache_state.begin_step(easycache_step_index, sigma);
            }

            auto easycache_before_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) -> bool {
                if (!easycache_step_active || condition == nullptr || output_tensor == nullptr) {
                    return false;
                }
                return easycache_state.before_condition(condition,
                                                        diffusion_params.x,
                                                        output_tensor,
                                                        sigma,
                                                        easycache_step_index);
            };

            auto easycache_after_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) {
                if (!easycache_step_active || condition == nullptr || output_tensor == nullptr) {
                    return;
                }
                easycache_state.after_condition(condition,
                                                diffusion_params.x,
                                                output_tensor);
            };

            auto easycache_step_is_skipped = [&]() {
                return easycache_step_active && easycache_state.is_step_skipped();
            };

            const bool ucache_step_active = ucache_enabled && step > 0;
            int ucache_step_index         = ucache_step_active ? (step - 1) : -1;
            if (ucache_step_active) {
                ucache_state.begin_step(ucache_step_index, sigma);
            }

            auto ucache_before_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) -> bool {
                if (!ucache_step_active || condition == nullptr || output_tensor == nullptr) {
                    return false;
                }
                return ucache_state.before_condition(condition,
                                                     diffusion_params.x,
                                                     output_tensor,
                                                     sigma,
                                                     ucache_step_index);
            };

            auto ucache_after_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) {
                if (!ucache_step_active || condition == nullptr || output_tensor == nullptr) {
                    return;
                }
                ucache_state.after_condition(condition,
                                             diffusion_params.x,
                                             output_tensor);
            };

            auto ucache_step_is_skipped = [&]() {
                return ucache_step_active && ucache_state.is_step_skipped();
            };

            const bool cachedit_step_active = cachedit_enabled && step > 0;
            int cachedit_step_index         = cachedit_step_active ? (step - 1) : -1;
            if (cachedit_step_active) {
                cachedit_state.begin_step(cachedit_step_index, sigma);
            }

            auto cachedit_before_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) -> bool {
                if (!cachedit_step_active || condition == nullptr || output_tensor == nullptr) {
                    return false;
                }
                return cachedit_state.before_condition(condition,
                                                       diffusion_params.x,
                                                       output_tensor,
                                                       sigma,
                                                       cachedit_step_index);
            };

            auto cachedit_after_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) {
                if (!cachedit_step_active || condition == nullptr || output_tensor == nullptr) {
                    return;
                }
                cachedit_state.after_condition(condition,
                                               diffusion_params.x,
                                               output_tensor);
            };

            auto cachedit_step_is_skipped = [&]() {
                return cachedit_step_active && cachedit_state.is_step_skipped();
            };

            auto cache_before_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) -> bool {
                if (easycache_step_active) {
                    return easycache_before_condition(condition, output_tensor);
                } else if (ucache_step_active) {
                    return ucache_before_condition(condition, output_tensor);
                } else if (cachedit_step_active) {
                    return cachedit_before_condition(condition, output_tensor);
                }
                return false;
            };

            auto cache_after_condition = [&](const SDCondition* condition, struct ggml_tensor* output_tensor) {
                if (easycache_step_active) {
                    easycache_after_condition(condition, output_tensor);
                } else if (ucache_step_active) {
                    ucache_after_condition(condition, output_tensor);
                } else if (cachedit_step_active) {
                    cachedit_after_condition(condition, output_tensor);
                }
            };

            auto cache_step_is_skipped = [&]() {
                return easycache_step_is_skipped() || ucache_step_is_skipped() || cachedit_step_is_skipped();
            };

            std::vector<float> scaling = denoiser->get_scalings(sigma);
            GGML_ASSERT(scaling.size() == 3);
            float c_skip = scaling[0];
            float c_out  = scaling[1];
            float c_in   = scaling[2];

            float t = denoiser->sigma_to_t(sigma);
            std::vector<float> timesteps_vec;
            if (shifted_timestep > 0 && sd_version_is_sdxl(version)) {
                float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS));
                int64_t shifted_t     = static_cast<int64_t>(roundf(shifted_t_float));
                shifted_t             = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t));
                LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma);
                timesteps_vec.assign(1, (float)shifted_t);
            } else if (sd_version_is_anima(version)) {
                // Anima uses normalized flow timesteps.
                timesteps_vec.assign(1, t / static_cast<float>(TIMESTEPS));
            } else if (sd_version_is_z_image(version)) {
                timesteps_vec.assign(1, 1000.f - t);
            } else {
                timesteps_vec.assign(1, t);
            }

            timesteps_vec  = process_timesteps(timesteps_vec, init_latent, denoise_mask);
            auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
            std::vector<float> guidance_vec(1, guidance.distilled_guidance);
            auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec);

            copy_ggml_tensor(noised_input, input);
            // noised_input = noised_input * c_in
            ggml_ext_tensor_scale_inplace(noised_input, c_in);

            if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) {
                apply_mask(noised_input, init_latent, denoise_mask);
            }
            if (sd_preview_cb != nullptr && sd_should_preview_noisy()) {
                if (step % sd_get_preview_interval() == 0) {
                    preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, true);
                }
            }

            std::vector<struct ggml_tensor*> controls;

            if (control_hint != nullptr && control_net != nullptr) {
                if (control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector)) {
                    controls = control_net->controls;
                } else {
                    LOG_ERROR("controlnet compute failed");
                }
                // print_ggml_tensor(controls[12]);
                // GGML_ASSERT(0);
            }

            diffusion_params.x                  = noised_input;
            diffusion_params.timesteps          = timesteps;
            diffusion_params.guidance           = guidance_tensor;
            diffusion_params.ref_latents        = ref_latents;
            diffusion_params.increase_ref_index = increase_ref_index;
            diffusion_params.controls           = controls;
            diffusion_params.control_strength   = control_strength;
            diffusion_params.vace_context       = vace_context;
            diffusion_params.vace_strength      = vace_strength;

            const SDCondition* active_condition = nullptr;
            struct ggml_tensor** active_output  = &out_cond;
            if (start_merge_step == -1 || step <= start_merge_step) {
                // cond
                diffusion_params.context  = cond.c_crossattn;
                diffusion_params.c_concat = cond.c_concat;
                diffusion_params.y        = cond.c_vector;
                active_condition          = &cond;
            } else {
                diffusion_params.context  = id_cond.c_crossattn;
                diffusion_params.c_concat = cond.c_concat;
                diffusion_params.y        = id_cond.c_vector;
                active_condition          = &id_cond;
            }

            bool skip_model = cache_before_condition(active_condition, *active_output);
            if (!skip_model) {
                if (!work_diffusion_model->compute(n_threads,
                                                   diffusion_params,
                                                   active_output)) {
                    LOG_ERROR("diffusion model compute failed");
                    return nullptr;
                }
                cache_after_condition(active_condition, *active_output);
            }

            bool current_step_skipped = cache_step_is_skipped();

            float* negative_data = nullptr;
            if (has_unconditioned) {
                // uncond
                if (!current_step_skipped && control_hint != nullptr && control_net != nullptr) {
                    if (control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector)) {
                        controls = control_net->controls;
                    } else {
                        LOG_ERROR("controlnet compute failed");
                    }
                }
                current_step_skipped      = cache_step_is_skipped();
                diffusion_params.controls = controls;
                diffusion_params.context  = uncond.c_crossattn;
                diffusion_params.c_concat = uncond.c_concat;
                diffusion_params.y        = uncond.c_vector;
                bool skip_uncond          = cache_before_condition(&uncond, out_uncond);
                if (!skip_uncond) {
                    if (!work_diffusion_model->compute(n_threads,
                                                       diffusion_params,
                                                       &out_uncond)) {
                        LOG_ERROR("diffusion model compute failed");
                        return nullptr;
                    }
                    cache_after_condition(&uncond, out_uncond);
                }
                negative_data = (float*)out_uncond->data;
            }

            float* img_cond_data = nullptr;
            if (has_img_cond) {
                diffusion_params.context  = img_cond.c_crossattn;
                diffusion_params.c_concat = img_cond.c_concat;
                diffusion_params.y        = img_cond.c_vector;
                bool skip_img_cond        = cache_before_condition(&img_cond, out_img_cond);
                if (!skip_img_cond) {
                    if (!work_diffusion_model->compute(n_threads,
                                                       diffusion_params,
                                                       &out_img_cond)) {
                        LOG_ERROR("diffusion model compute failed");
                        return nullptr;
                    }
                    cache_after_condition(&img_cond, out_img_cond);
                }
                img_cond_data = (float*)out_img_cond->data;
            }

            int step_count         = static_cast<int>(sigmas.size());
            bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count);
            float* skip_layer_data = has_skiplayer ? (float*)out_skip->data : nullptr;
            if (is_skiplayer_step) {
                LOG_DEBUG("Skipping layers at step %d\n", step);
                if (!cache_step_is_skipped()) {
                    // skip layer (same as conditioned)
                    diffusion_params.context     = cond.c_crossattn;
                    diffusion_params.c_concat    = cond.c_concat;
                    diffusion_params.y           = cond.c_vector;
                    diffusion_params.skip_layers = skip_layers;
                    if (!work_diffusion_model->compute(n_threads,
                                                       diffusion_params,
                                                       &out_skip)) {
                        LOG_ERROR("diffusion model compute failed");
                        return nullptr;
                    }
                }
                skip_layer_data = (float*)out_skip->data;
            }
            float* vec_denoised  = (float*)denoised->data;
            float* vec_input     = (float*)input->data;
            float* positive_data = (float*)out_cond->data;
            int ne_elements      = (int)ggml_nelements(denoised);

            if (shifted_timestep > 0 && sd_version_is_sdxl(version)) {
                int64_t shifted_t_idx              = static_cast<int64_t>(roundf(timesteps_vec[0]));
                float shifted_sigma                = denoiser->t_to_sigma((float)shifted_t_idx);
                std::vector<float> shifted_scaling = denoiser->get_scalings(shifted_sigma);
                float shifted_c_skip               = shifted_scaling[0];
                float shifted_c_out                = shifted_scaling[1];
                float shifted_c_in                 = shifted_scaling[2];

                c_skip = shifted_c_skip * c_in / shifted_c_in;
                c_out  = shifted_c_out;
            }

            for (int i = 0; i < ne_elements; i++) {
                float latent_result = positive_data[i];
                if (has_unconditioned) {
                    // out_uncond + cfg_scale * (out_cond - out_uncond)
                    if (has_img_cond) {
                        // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
                        latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
                    } else {
                        // img_cfg_scale == cfg_scale
                        latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
                    }
                } else if (has_img_cond) {
                    // img_cfg_scale == 1
                    latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]);
                }
                if (is_skiplayer_step) {
                    latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
                }
                // v = latent_result, eps = latent_result
                // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
                vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
            }

            if (denoise_mask != nullptr) {
                apply_mask(denoised, init_latent, denoise_mask);
            }

            if (sd_preview_cb != nullptr && sd_should_preview_denoised()) {
                if (step % sd_get_preview_interval() == 0) {
                    preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false);
                }
            }

            int64_t t1 = ggml_time_us();
            if (step > 0 || step == -(int)steps) {
                int showstep = std::abs(step);
                pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
            }
            return denoised;
        };

        if (!sample_k_diffusion(method, denoise, work_ctx, x, sigmas, sampler_rng, eta)) {
            LOG_ERROR("Diffusion model sampling failed");
            if (control_net) {
                control_net->free_control_ctx();
                control_net->free_compute_buffer();
            }
            diffusion_model->free_compute_buffer();
            return NULL;
        }

        if (easycache_enabled) {
            size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0;
            if (easycache_state.total_steps_skipped > 0 && total_steps > 0) {
                if (easycache_state.total_steps_skipped < static_cast<int>(total_steps)) {
                    double speedup = static_cast<double>(total_steps) /
                                     static_cast<double>(total_steps - easycache_state.total_steps_skipped);
                    LOG_INFO("EasyCache skipped %d/%zu steps (%.2fx estimated speedup)",
                             easycache_state.total_steps_skipped,
                             total_steps,
                             speedup);
                } else {
                    LOG_INFO("EasyCache skipped %d/%zu steps",
                             easycache_state.total_steps_skipped,
                             total_steps);
                }
            } else if (total_steps > 0) {
                LOG_INFO("EasyCache completed without skipping steps");
            }
        }

        if (ucache_enabled) {
            size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0;
            if (ucache_state.total_steps_skipped > 0 && total_steps > 0) {
                if (ucache_state.total_steps_skipped < static_cast<int>(total_steps)) {
                    double speedup = static_cast<double>(total_steps) /
                                     static_cast<double>(total_steps - ucache_state.total_steps_skipped);
                    LOG_INFO("UCache skipped %d/%zu steps (%.2fx estimated speedup)",
                             ucache_state.total_steps_skipped,
                             total_steps,
                             speedup);
                } else {
                    LOG_INFO("UCache skipped %d/%zu steps",
                             ucache_state.total_steps_skipped,
                             total_steps);
                }
            } else if (total_steps > 0) {
                LOG_INFO("UCache completed without skipping steps");
            }
        }

        if (cachedit_enabled) {
            size_t total_steps = sigmas.size() > 0 ? sigmas.size() - 1 : 0;
            if (cachedit_state.total_steps_skipped > 0 && total_steps > 0) {
                if (cachedit_state.total_steps_skipped < static_cast<int>(total_steps)) {
                    double speedup = static_cast<double>(total_steps) /
                                     static_cast<double>(total_steps - cachedit_state.total_steps_skipped);
                    LOG_INFO("CacheDIT skipped %d/%zu steps (%.2fx estimated speedup), accum_diff: %.4f",
                             cachedit_state.total_steps_skipped,
                             total_steps,
                             speedup,
                             cachedit_state.accumulated_residual_diff);
                } else {
                    LOG_INFO("CacheDIT skipped %d/%zu steps, accum_diff: %.4f",
                             cachedit_state.total_steps_skipped,
                             total_steps,
                             cachedit_state.accumulated_residual_diff);
                }
            } else if (total_steps > 0) {
                LOG_INFO("CacheDIT completed without skipping steps");
            }
        }

        if (inverse_noise_scaling) {
            x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x);
        }

        if (control_net) {
            control_net->free_control_ctx();
            control_net->free_compute_buffer();
        }
        work_diffusion_model->free_compute_buffer();
        return x;
    }

    int get_vae_scale_factor() {
        int vae_scale_factor = 8;
        if (version == VERSION_WAN2_2_TI2V) {
            vae_scale_factor = 16;
        } else if (sd_version_is_flux2(version)) {
            vae_scale_factor = 16;
        } else if (version == VERSION_CHROMA_RADIANCE) {
            vae_scale_factor = 1;
        }
        return vae_scale_factor;
    }

    int get_diffusion_model_down_factor() {
        int down_factor = 8;  // unet
        if (sd_version_is_dit(version)) {
            if (sd_version_is_wan(version)) {
                down_factor = 2;
            } else {
                down_factor = 1;
            }
        }
        return down_factor;
    }

    int get_latent_channel() {
        int latent_channel = 4;
        if (sd_version_is_dit(version)) {
            if (version == VERSION_WAN2_2_TI2V) {
                latent_channel = 48;
            } else if (version == VERSION_CHROMA_RADIANCE) {
                latent_channel = 3;
            } else if (sd_version_is_flux2(version)) {
                latent_channel = 128;
            } else {
                latent_channel = 16;
            }
        }
        return latent_channel;
    }

    int get_image_seq_len(int h, int w) {
        int vae_scale_factor = get_vae_scale_factor();
        return (h / vae_scale_factor) * (w / vae_scale_factor);
    }

    ggml_tensor* generate_init_latent(ggml_context* work_ctx,
                                      int width,
                                      int height,
                                      int frames = 1,
                                      bool video = false) {
        int vae_scale_factor = get_vae_scale_factor();
        int W                = width / vae_scale_factor;
        int H                = height / vae_scale_factor;
        int T                = frames;
        if (sd_version_is_wan(version)) {
            T = ((T - 1) / 4) + 1;
        }
        int C = get_latent_channel();
        ggml_tensor* init_latent;
        if (video) {
            init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C);
        } else {
            init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
        }
        ggml_set_f32(init_latent, shift_factor);
        return init_latent;
    }

    void get_latents_mean_std_vec(ggml_tensor* latent, int channel_dim, std::vector<float>& latents_mean_vec, std::vector<float>& latents_std_vec) {
        GGML_ASSERT(latent->ne[channel_dim] == 16 || latent->ne[channel_dim] == 48 || latent->ne[channel_dim] == 128);
        if (latent->ne[channel_dim] == 16) {
            latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f,
                                0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f};
            latents_std_vec  = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f,
                                3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f};
        } else if (latent->ne[channel_dim] == 48) {
            latents_mean_vec = {-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f,
                                -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f,
                                -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f,
                                -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f,
                                -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
                                0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
            latents_std_vec  = {
                 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
                 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
                 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
                 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
                 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
                 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
        } else if (latent->ne[channel_dim] == 128) {
            // flux2
            latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f,
                                -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f,
                                -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f,
                                0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f,
                                -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f,
                                0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f,
                                0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f,
                                0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f,
                                -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f,
                                0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f,
                                -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f,
                                0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f,
                                -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f,
                                0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f,
                                -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f,
                                -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f};
            latents_std_vec  = {
                 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
                 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
                 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
                 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
                 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
                 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
                 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
                 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
                 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
                 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
                 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
                 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
                 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
                 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
                 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
                 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f};
        }
    }

    void process_latent_in(ggml_tensor* latent) {
        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) {
            int channel_dim = sd_version_is_flux2(version) ? 2 : 3;
            std::vector<float> latents_mean_vec;
            std::vector<float> latents_std_vec;
            get_latents_mean_std_vec(latent, channel_dim, latents_mean_vec, latents_std_vec);

            float mean;
            float std_;
            for (int i = 0; i < latent->ne[3]; i++) {
                if (channel_dim == 3) {
                    mean = latents_mean_vec[i];
                    std_ = latents_std_vec[i];
                }
                for (int j = 0; j < latent->ne[2]; j++) {
                    if (channel_dim == 2) {
                        mean = latents_mean_vec[i];
                        std_ = latents_std_vec[i];
                    }
                    for (int k = 0; k < latent->ne[1]; k++) {
                        for (int l = 0; l < latent->ne[0]; l++) {
                            float value = ggml_ext_tensor_get_f32(latent, l, k, j, i);
                            value       = (value - mean) * scale_factor / std_;
                            ggml_ext_tensor_set_f32(latent, value, l, k, j, i);
                        }
                    }
                }
            }
        } else if (version == VERSION_CHROMA_RADIANCE) {
            // pass
        } else {
            ggml_ext_tensor_iter(latent, [&](ggml_tensor* latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
                float value = ggml_ext_tensor_get_f32(latent, i0, i1, i2, i3);
                value       = (value - shift_factor) * scale_factor;
                ggml_ext_tensor_set_f32(latent, value, i0, i1, i2, i3);
            });
        }
    }

    void process_latent_out(ggml_tensor* latent) {
        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) {
            int channel_dim = sd_version_is_flux2(version) ? 2 : 3;
            std::vector<float> latents_mean_vec;
            std::vector<float> latents_std_vec;
            get_latents_mean_std_vec(latent, channel_dim, latents_mean_vec, latents_std_vec);

            float mean;
            float std_;
            for (int i = 0; i < latent->ne[3]; i++) {
                if (channel_dim == 3) {
                    mean = latents_mean_vec[i];
                    std_ = latents_std_vec[i];
                }
                for (int j = 0; j < latent->ne[2]; j++) {
                    if (channel_dim == 2) {
                        mean = latents_mean_vec[i];
                        std_ = latents_std_vec[i];
                    }
                    for (int k = 0; k < latent->ne[1]; k++) {
                        for (int l = 0; l < latent->ne[0]; l++) {
                            float value = ggml_ext_tensor_get_f32(latent, l, k, j, i);
                            value       = value * std_ / scale_factor + mean;
                            ggml_ext_tensor_set_f32(latent, value, l, k, j, i);
                        }
                    }
                }
            }
        } else if (version == VERSION_CHROMA_RADIANCE) {
            // pass
        } else {
            ggml_ext_tensor_iter(latent, [&](ggml_tensor* latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
                float value = ggml_ext_tensor_get_f32(latent, i0, i1, i2, i3);
                value       = (value / scale_factor) + shift_factor;
                ggml_ext_tensor_set_f32(latent, value, i0, i1, i2, i3);
            });
        }
    }

    void get_tile_sizes(int& tile_size_x,
                        int& tile_size_y,
                        float& tile_overlap,
                        const sd_tiling_params_t& params,
                        int64_t latent_x,
                        int64_t latent_y,
                        float encoding_factor = 1.0f) {
        tile_overlap       = std::max(std::min(params.target_overlap, 0.5f), 0.0f);
        auto get_tile_size = [&](int requested_size, float factor, int64_t latent_size) {
            const int default_tile_size  = 32;
            const int min_tile_dimension = 4;
            int tile_size                = default_tile_size;
            // factor <= 1 means simple fraction of the latent dimension
            // factor > 1 means number of tiles across that dimension
            if (factor > 0.f) {
                if (factor > 1.0)
                    factor = 1 / (factor - factor * tile_overlap + tile_overlap);
                tile_size = static_cast<int>(std::round(latent_size * factor));
            } else if (requested_size >= min_tile_dimension) {
                tile_size = requested_size;
            }
            tile_size = static_cast<int>(tile_size * encoding_factor);
            return std::max(std::min(tile_size, static_cast<int>(latent_size)), min_tile_dimension);
        };

        tile_size_x = get_tile_size(params.tile_size_x, params.rel_size_x, latent_x);
        tile_size_y = get_tile_size(params.tile_size_y, params.rel_size_y, latent_y);
    }

    ggml_tensor* vae_encode(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
        int64_t t0                 = ggml_time_ms();
        ggml_tensor* result        = nullptr;
        const int vae_scale_factor = get_vae_scale_factor();
        int64_t W                  = x->ne[0] / vae_scale_factor;
        int64_t H                  = x->ne[1] / vae_scale_factor;
        int64_t C                  = get_latent_channel();
        if (vae_tiling_params.enabled && !encode_video) {
            // TODO wan2.2 vae support?
            int64_t ne2;
            int64_t ne3;
            if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                ne2 = 1;
                ne3 = C * x->ne[3];
            } else {
                int64_t out_channels   = C;
                bool encode_outputs_mu = use_tiny_autoencoder ||
                                         sd_version_is_wan(version) ||
                                         sd_version_is_flux2(version) ||
                                         version == VERSION_CHROMA_RADIANCE;
                if (!encode_outputs_mu) {
                    out_channels *= 2;
                }
                ne2 = out_channels;
                ne3 = x->ne[3];
            }
            result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
        }

        if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
            x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
        }

        if (!use_tiny_autoencoder) {
            process_vae_input_tensor(x);
            if (vae_tiling_params.enabled && !encode_video) {
                float tile_overlap;
                int tile_size_x, tile_size_y;
                // multiply tile size for encode to keep the compute buffer size consistent
                get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, W, H, 1.30539f);

                LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);

                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                    return first_stage_model->compute(n_threads, in, false, &out, work_ctx);
                };
                sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
            } else {
                first_stage_model->compute(n_threads, x, false, &result, work_ctx);
            }
            first_stage_model->free_compute_buffer();
        } else {
            if (vae_tiling_params.enabled && !encode_video) {
                // split latent in 32x32 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                    return tae_first_stage->compute(n_threads, in, false, &out, nullptr);
                };
                sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
            } else {
                tae_first_stage->compute(n_threads, x, false, &result, work_ctx);
            }
            tae_first_stage->free_compute_buffer();
        }

        int64_t t1 = ggml_time_ms();
        LOG_DEBUG("computing vae encode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
        return result;
    }

    ggml_tensor* gaussian_latent_sample(ggml_context* work_ctx, ggml_tensor* moments) {
        // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
        ggml_tensor* latent       = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
        struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);
        ggml_ext_im_set_randn_f32(noise, rng);
        {
            float mean   = 0;
            float logvar = 0;
            float value  = 0;
            float std_   = 0;
            for (int i = 0; i < latent->ne[3]; i++) {
                for (int j = 0; j < latent->ne[2]; j++) {
                    for (int k = 0; k < latent->ne[1]; k++) {
                        for (int l = 0; l < latent->ne[0]; l++) {
                            mean   = ggml_ext_tensor_get_f32(moments, l, k, j, i);
                            logvar = ggml_ext_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i);
                            logvar = std::max(-30.0f, std::min(logvar, 20.0f));
                            std_   = std::exp(0.5f * logvar);
                            value  = mean + std_ * ggml_ext_tensor_get_f32(noise, l, k, j, i);
                            // printf("%d %d %d %d -> %f\n", i, j, k, l, value);
                            ggml_ext_tensor_set_f32(latent, value, l, k, j, i);
                        }
                    }
                }
            }
        }
        return latent;
    }

    ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* vae_output) {
        ggml_tensor* latent;
        if (use_tiny_autoencoder ||
            sd_version_is_qwen_image(version) ||
            sd_version_is_anima(version) ||
            sd_version_is_wan(version) ||
            sd_version_is_flux2(version) ||
            version == VERSION_CHROMA_RADIANCE) {
            latent = vae_output;
        } else if (version == VERSION_SD1_PIX2PIX) {
            latent = ggml_view_3d(work_ctx,
                                  vae_output,
                                  vae_output->ne[0],
                                  vae_output->ne[1],
                                  vae_output->ne[2] / 2,
                                  vae_output->nb[1],
                                  vae_output->nb[2],
                                  0);
        } else {
            latent = gaussian_latent_sample(work_ctx, vae_output);
        }
        if (!use_tiny_autoencoder) {
            process_latent_in(latent);
        }
        if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
            latent = ggml_reshape_4d(work_ctx, latent, latent->ne[0], latent->ne[1], latent->ne[3], 1);
        }
        return latent;
    }

    ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool encode_video = false) {
        ggml_tensor* vae_output = vae_encode(work_ctx, x, encode_video);
        return get_first_stage_encoding(work_ctx, vae_output);
    }

    ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode_video = false) {
        const int vae_scale_factor = get_vae_scale_factor();
        int64_t W                  = x->ne[0] * vae_scale_factor;
        int64_t H                  = x->ne[1] * vae_scale_factor;
        int64_t C                  = 3;
        ggml_tensor* result        = nullptr;
        if (decode_video) {
            int64_t T = x->ne[2];
            if (sd_version_is_wan(version)) {
                T = ((T - 1) * 4) + 1;
            }
            result = ggml_new_tensor_4d(work_ctx,
                                        GGML_TYPE_F32,
                                        W,
                                        H,
                                        T,
                                        3);
        } else {
            result = ggml_new_tensor_4d(work_ctx,
                                        GGML_TYPE_F32,
                                        W,
                                        H,
                                        C,
                                        x->ne[3]);
        }
        int64_t t0 = ggml_time_ms();
        if (!use_tiny_autoencoder) {
            if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
            }
            process_latent_out(x);
            // x = load_tensor_from_file(work_ctx, "wan_vae_z.bin");
            if (vae_tiling_params.enabled) {
                float tile_overlap;
                int tile_size_x, tile_size_y;
                get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, vae_tiling_params, x->ne[0], x->ne[1]);

                LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);

                // split latent in 32x32 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                    return first_stage_model->compute(n_threads, in, true, &out, nullptr);
                };
                sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
            } else {
                if (!first_stage_model->compute(n_threads, x, true, &result, work_ctx)) {
                    LOG_ERROR("Failed to decode latetnts");
                    first_stage_model->free_compute_buffer();
                    return nullptr;
                }
            }
            first_stage_model->free_compute_buffer();
            process_vae_output_tensor(result);
        } else {
            if (vae_tiling_params.enabled) {
                // split latent in 64x64 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                    return tae_first_stage->compute(n_threads, in, true, &out);
                };
                sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
            } else {
                if (!tae_first_stage->compute(n_threads, x, true, &result)) {
                    LOG_ERROR("Failed to decode latetnts");
                    tae_first_stage->free_compute_buffer();
                    return nullptr;
                }
            }
            tae_first_stage->free_compute_buffer();
        }

        int64_t t1 = ggml_time_ms();
        LOG_DEBUG("computing vae decode graph completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
        ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
        return result;
    }

    void set_flow_shift(float flow_shift = INFINITY) {
        auto flow_denoiser = std::dynamic_pointer_cast<DiscreteFlowDenoiser>(denoiser);
        if (flow_denoiser) {
            if (flow_shift == INFINITY) {
                flow_shift = default_flow_shift;
            }
            flow_denoiser->set_shift(flow_shift);
        }
    }

    //added for kcpp
    void SetCircularAxesAll(bool circular_x, bool circular_y)
    {
        diffusion_model->set_circular_axes(circular_x, circular_y);
        if (high_noise_diffusion_model) {
            high_noise_diffusion_model->set_circular_axes(circular_x, circular_y);
        }
        if (control_net) {
            control_net->set_circular_axes(circular_x, circular_y);
        }
        if (first_stage_model) {
            first_stage_model->set_circular_axes(circular_x, circular_y);
        }
        if (tae_first_stage) {
            tae_first_stage->set_circular_axes(circular_x, circular_y);
        }
    }
    //end added for kcpp

};

/*================================================= SD API ==================================================*/

#define NONE_STR "NONE"

const char* sd_type_name(enum sd_type_t type) {
    if ((int)type < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT)) {
        return ggml_type_name((ggml_type)type);
    }
    return NONE_STR;
}

enum sd_type_t str_to_sd_type(const char* str) {
    for (int i = 0; i < std::min<int>(SD_TYPE_COUNT, GGML_TYPE_COUNT); i++) {
        auto trait = ggml_get_type_traits((ggml_type)i);
        if (!strcmp(str, trait->type_name)) {
            return (enum sd_type_t)i;
        }
    }
    return SD_TYPE_COUNT;
}

const char* rng_type_to_str[] = {
    "std_default",
    "cuda",
    "cpu",
};

const char* sd_rng_type_name(enum rng_type_t rng_type) {
    if (rng_type < RNG_TYPE_COUNT) {
        return rng_type_to_str[rng_type];
    }
    return NONE_STR;
}

enum rng_type_t str_to_rng_type(const char* str) {
    for (int i = 0; i < RNG_TYPE_COUNT; i++) {
        if (!strcmp(str, rng_type_to_str[i])) {
            return (enum rng_type_t)i;
        }
    }
    return RNG_TYPE_COUNT;
}

const char* sample_method_to_str[] = {
    "euler",
    "euler_a",
    "heun",
    "dpm2",
    "dpm++2s_a",
    "dpm++2m",
    "dpm++2mv2",
    "ipndm",
    "ipndm_v",
    "lcm",
    "ddim_trailing",
    "tcd",
    "res_multistep",
    "res_2s",
};

const char* sd_sample_method_name(enum sample_method_t sample_method) {
    if (sample_method < SAMPLE_METHOD_COUNT) {
        return sample_method_to_str[sample_method];
    }
    return NONE_STR;
}

enum sample_method_t str_to_sample_method(const char* str) {
    for (int i = 0; i < SAMPLE_METHOD_COUNT; i++) {
        if (!strcmp(str, sample_method_to_str[i])) {
            return (enum sample_method_t)i;
        }
    }
    return SAMPLE_METHOD_COUNT;
}

const char* scheduler_to_str[] = {
    "discrete",
    "karras",
    "exponential",
    "ays",
    "gits",
    "sgm_uniform",
    "simple",
    "smoothstep",
    "kl_optimal",
    "lcm",
    "bong_tangent",
};

const char* sd_scheduler_name(enum scheduler_t scheduler) {
    if (scheduler < SCHEDULER_COUNT) {
        return scheduler_to_str[scheduler];
    }
    return NONE_STR;
}

enum scheduler_t str_to_scheduler(const char* str) {
    for (int i = 0; i < SCHEDULER_COUNT; i++) {
        if (!strcmp(str, scheduler_to_str[i])) {
            return (enum scheduler_t)i;
        }
    }
    return SCHEDULER_COUNT;
}

const char* prediction_to_str[] = {
    "eps",
    "v",
    "edm_v",
    "sd3_flow",
    "flux_flow",
    "flux2_flow",
};

const char* sd_prediction_name(enum prediction_t prediction) {
    if (prediction < PREDICTION_COUNT) {
        return prediction_to_str[prediction];
    }
    return NONE_STR;
}

enum prediction_t str_to_prediction(const char* str) {
    for (int i = 0; i < PREDICTION_COUNT; i++) {
        if (!strcmp(str, prediction_to_str[i])) {
            return (enum prediction_t)i;
        }
    }
    return PREDICTION_COUNT;
}

const char* preview_to_str[] = {
    "none",
    "proj",
    "tae",
    "vae",
};

const char* sd_preview_name(enum preview_t preview) {
    if (preview < PREVIEW_COUNT) {
        return preview_to_str[preview];
    }
    return NONE_STR;
}

enum preview_t str_to_preview(const char* str) {
    for (int i = 0; i < PREVIEW_COUNT; i++) {
        if (!strcmp(str, preview_to_str[i])) {
            return (enum preview_t)i;
        }
    }
    return PREVIEW_COUNT;
}

const char* lora_apply_mode_to_str[] = {
    "auto",
    "immediately",
    "at_runtime",
};

const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode) {
    if (mode < LORA_APPLY_MODE_COUNT) {
        return lora_apply_mode_to_str[mode];
    }
    return NONE_STR;
}

enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) {
    for (int i = 0; i < LORA_APPLY_MODE_COUNT; i++) {
        if (!strcmp(str, lora_apply_mode_to_str[i])) {
            return (enum lora_apply_mode_t)i;
        }
    }
    return LORA_APPLY_MODE_COUNT;
}

void sd_cache_params_init(sd_cache_params_t* cache_params) {
    *cache_params                             = {};
    cache_params->mode                        = SD_CACHE_DISABLED;
    cache_params->reuse_threshold             = 1.0f;
    cache_params->start_percent               = 0.15f;
    cache_params->end_percent                 = 0.95f;
    cache_params->error_decay_rate            = 1.0f;
    cache_params->use_relative_threshold      = true;
    cache_params->reset_error_on_compute      = true;
    cache_params->Fn_compute_blocks           = 8;
    cache_params->Bn_compute_blocks           = 0;
    cache_params->residual_diff_threshold     = 0.08f;
    cache_params->max_warmup_steps            = 8;
    cache_params->max_cached_steps            = -1;
    cache_params->max_continuous_cached_steps = -1;
    cache_params->taylorseer_n_derivatives    = 1;
    cache_params->taylorseer_skip_interval    = 1;
    cache_params->scm_mask                    = nullptr;
    cache_params->scm_policy_dynamic          = true;
}

void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    *sd_ctx_params                         = {};
    sd_ctx_params->vae_decode_only         = true;
    sd_ctx_params->free_params_immediately = true;
    sd_ctx_params->n_threads               = sd_get_num_physical_cores();
    sd_ctx_params->wtype                   = SD_TYPE_COUNT;
    sd_ctx_params->rng_type                = CUDA_RNG;
    sd_ctx_params->sampler_rng_type        = RNG_TYPE_COUNT;
    sd_ctx_params->prediction              = PREDICTION_COUNT;
    sd_ctx_params->lora_apply_mode         = LORA_APPLY_AUTO;
    sd_ctx_params->offload_params_to_cpu   = false;
    sd_ctx_params->enable_mmap             = false;
    sd_ctx_params->keep_clip_on_cpu        = false;
    sd_ctx_params->keep_control_net_on_cpu = false;
    sd_ctx_params->keep_vae_on_cpu         = false;
    sd_ctx_params->diffusion_flash_attn    = false;
    sd_ctx_params->circular_x              = false;
    sd_ctx_params->circular_y              = false;
    sd_ctx_params->chroma_use_dit_mask     = true;
    sd_ctx_params->chroma_use_t5_mask      = false;
    sd_ctx_params->chroma_t5_mask_pad      = 1;
}

char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
    char* buf = (char*)malloc(4096);
    if (!buf)
        return nullptr;
    buf[0] = '\0';

    snprintf(buf + strlen(buf), 4096 - strlen(buf),
             "model_path: %s\n"
             "clip_l_path: %s\n"
             "clip_g_path: %s\n"
             "clip_vision_path: %s\n"
             "t5xxl_path: %s\n"
             "llm_path: %s\n"
             "llm_vision_path: %s\n"
             "diffusion_model_path: %s\n"
             "high_noise_diffusion_model_path: %s\n"
             "vae_path: %s\n"
             "taesd_path: %s\n"
             "control_net_path: %s\n"
             "photo_maker_path: %s\n"
             "tensor_type_rules: %s\n"
             "vae_decode_only: %s\n"
             "free_params_immediately: %s\n"
             "n_threads: %d\n"
             "wtype: %s\n"
             "rng_type: %s\n"
             "sampler_rng_type: %s\n"
             "prediction: %s\n"
             "offload_params_to_cpu: %s\n"
             "keep_clip_on_cpu: %s\n"
             "keep_control_net_on_cpu: %s\n"
             "keep_vae_on_cpu: %s\n"
             "flash_attn: %s\n"
             "diffusion_flash_attn: %s\n"
             "circular_x: %s\n"
             "circular_y: %s\n"
             "chroma_use_dit_mask: %s\n"
             "chroma_use_t5_mask: %s\n"
             "chroma_t5_mask_pad: %d\n",
             SAFE_STR(sd_ctx_params->model_path),
             SAFE_STR(sd_ctx_params->clip_l_path),
             SAFE_STR(sd_ctx_params->clip_g_path),
             SAFE_STR(sd_ctx_params->clip_vision_path),
             SAFE_STR(sd_ctx_params->t5xxl_path),
             SAFE_STR(sd_ctx_params->llm_path),
             SAFE_STR(sd_ctx_params->llm_vision_path),
             SAFE_STR(sd_ctx_params->diffusion_model_path),
             SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
             SAFE_STR(sd_ctx_params->vae_path),
             SAFE_STR(sd_ctx_params->taesd_path),
             SAFE_STR(sd_ctx_params->control_net_path),
             SAFE_STR(sd_ctx_params->photo_maker_path),
             SAFE_STR(sd_ctx_params->tensor_type_rules),
             BOOL_STR(sd_ctx_params->vae_decode_only),
             BOOL_STR(sd_ctx_params->free_params_immediately),
             sd_ctx_params->n_threads,
             sd_type_name(sd_ctx_params->wtype),
             sd_rng_type_name(sd_ctx_params->rng_type),
             sd_rng_type_name(sd_ctx_params->sampler_rng_type),
             sd_prediction_name(sd_ctx_params->prediction),
             BOOL_STR(sd_ctx_params->offload_params_to_cpu),
             BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
             BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
             BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
             BOOL_STR(sd_ctx_params->flash_attn),
             BOOL_STR(sd_ctx_params->diffusion_flash_attn),
             BOOL_STR(sd_ctx_params->circular_x),
             BOOL_STR(sd_ctx_params->circular_y),
             BOOL_STR(sd_ctx_params->chroma_use_dit_mask),
             BOOL_STR(sd_ctx_params->chroma_use_t5_mask),
             sd_ctx_params->chroma_t5_mask_pad);

    return buf;
}

void sd_sample_params_init(sd_sample_params_t* sample_params) {
    *sample_params                             = {};
    sample_params->guidance.txt_cfg            = 7.0f;
    sample_params->guidance.img_cfg            = INFINITY;
    sample_params->guidance.distilled_guidance = 3.5f;
    sample_params->guidance.slg.layer_count    = 0;
    sample_params->guidance.slg.layer_start    = 0.01f;
    sample_params->guidance.slg.layer_end      = 0.2f;
    sample_params->guidance.slg.scale          = 0.f;
    sample_params->scheduler                   = SCHEDULER_COUNT;
    sample_params->sample_method               = SAMPLE_METHOD_COUNT;
    sample_params->sample_steps                = 20;
    sample_params->custom_sigmas               = nullptr;
    sample_params->custom_sigmas_count         = 0;
    sample_params->flow_shift                  = INFINITY;
}

char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
    char* buf = (char*)malloc(4096);
    if (!buf)
        return nullptr;
    buf[0] = '\0';

    snprintf(buf + strlen(buf), 4096 - strlen(buf),
             "(txt_cfg: %.2f, "
             "img_cfg: %.2f, "
             "distilled_guidance: %.2f, "
             "slg.layer_count: %zu, "
             "slg.layer_start: %.2f, "
             "slg.layer_end: %.2f, "
             "slg.scale: %.2f, "
             "scheduler: %s, "
             "sample_method: %s, "
             "sample_steps: %d, "
             "eta: %.2f, "
             "shifted_timestep: %d, "
             "flow_shift: %.2f)",
             sample_params->guidance.txt_cfg,
             std::isfinite(sample_params->guidance.img_cfg)
                 ? sample_params->guidance.img_cfg
                 : sample_params->guidance.txt_cfg,
             sample_params->guidance.distilled_guidance,
             sample_params->guidance.slg.layer_count,
             sample_params->guidance.slg.layer_start,
             sample_params->guidance.slg.layer_end,
             sample_params->guidance.slg.scale,
             sd_scheduler_name(sample_params->scheduler),
             sd_sample_method_name(sample_params->sample_method),
             sample_params->sample_steps,
             sample_params->eta,
             sample_params->shifted_timestep,
             sample_params->flow_shift);

    return buf;
}

void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
    *sd_img_gen_params = {};
    sd_sample_params_init(&sd_img_gen_params->sample_params);
    sd_img_gen_params->clip_skip         = -1;
    sd_img_gen_params->ref_images_count  = 0;
    sd_img_gen_params->width             = 512;
    sd_img_gen_params->height            = 512;
    sd_img_gen_params->strength          = 0.75f;
    sd_img_gen_params->seed              = -1;
    sd_img_gen_params->batch_count       = 1;
    sd_img_gen_params->control_strength  = 0.9f;
    sd_img_gen_params->pm_params         = {nullptr, 0, nullptr, 20.f};
    sd_img_gen_params->vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
    sd_cache_params_init(&sd_img_gen_params->cache);
}

char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params) {
    char* buf = (char*)malloc(4096);
    if (!buf)
        return nullptr;
    buf[0] = '\0';

    char* sample_params_str = sd_sample_params_to_str(&sd_img_gen_params->sample_params);

    snprintf(buf + strlen(buf), 4096 - strlen(buf),
             "prompt: %s\n"
             "negative_prompt: %s\n"
             "clip_skip: %d\n"
             "width: %d\n"
             "height: %d\n"
             "sample_params: %s\n"
             "strength: %.2f\n"
             "seed: %" PRId64
             "\n"
             "batch_count: %d\n"
             "ref_images_count: %d\n"
             "auto_resize_ref_image: %s\n"
             "increase_ref_index: %s\n"
             "control_strength: %.2f\n"
             "photo maker: {style_strength = %.2f, id_images_count = %d, id_embed_path = %s}\n"
             "VAE tiling: %s\n",
             SAFE_STR(sd_img_gen_params->prompt),
             SAFE_STR(sd_img_gen_params->negative_prompt),
             sd_img_gen_params->clip_skip,
             sd_img_gen_params->width,
             sd_img_gen_params->height,
             SAFE_STR(sample_params_str),
             sd_img_gen_params->strength,
             sd_img_gen_params->seed,
             sd_img_gen_params->batch_count,
             sd_img_gen_params->ref_images_count,
             BOOL_STR(sd_img_gen_params->auto_resize_ref_image),
             BOOL_STR(sd_img_gen_params->increase_ref_index),
             sd_img_gen_params->control_strength,
             sd_img_gen_params->pm_params.style_strength,
             sd_img_gen_params->pm_params.id_images_count,
             SAFE_STR(sd_img_gen_params->pm_params.id_embed_path),
             BOOL_STR(sd_img_gen_params->vae_tiling_params.enabled));
    const char* cache_mode_str = "disabled";
    if (sd_img_gen_params->cache.mode == SD_CACHE_EASYCACHE) {
        cache_mode_str = "easycache";
    } else if (sd_img_gen_params->cache.mode == SD_CACHE_UCACHE) {
        cache_mode_str = "ucache";
    }
    snprintf(buf + strlen(buf), 4096 - strlen(buf),
             "cache: %s (threshold=%.3f, start=%.2f, end=%.2f)\n",
             cache_mode_str,
             sd_img_gen_params->cache.reuse_threshold,
             sd_img_gen_params->cache.start_percent,
             sd_img_gen_params->cache.end_percent);
    free(sample_params_str);
    return buf;
}

void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
    *sd_vid_gen_params = {};
    sd_sample_params_init(&sd_vid_gen_params->sample_params);
    sd_sample_params_init(&sd_vid_gen_params->high_noise_sample_params);
    sd_vid_gen_params->high_noise_sample_params.sample_steps = -1;
    sd_vid_gen_params->width                                 = 512;
    sd_vid_gen_params->height                                = 512;
    sd_vid_gen_params->strength                              = 0.75f;
    sd_vid_gen_params->seed                                  = -1;
    sd_vid_gen_params->video_frames                          = 6;
    sd_vid_gen_params->moe_boundary                          = 0.875f;
    sd_vid_gen_params->vace_strength                         = 1.f;
    sd_vid_gen_params->vae_tiling_params                     = {false, 0, 0, 0.5f, 0.0f, 0.0f};
    sd_cache_params_init(&sd_vid_gen_params->cache);
}

struct sd_ctx_t {
    StableDiffusionGGML* sd = nullptr;
};

sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params) {
    sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
    if (sd_ctx == nullptr) {
        return nullptr;
    }

    sd_ctx->sd = new StableDiffusionGGML();
    if (sd_ctx->sd == nullptr) {
        free(sd_ctx);
        return nullptr;
    }

    if (!sd_ctx->sd->init(sd_ctx_params)) {
        delete sd_ctx->sd;
        sd_ctx->sd = nullptr;
        free(sd_ctx);
        return nullptr;
    }
    return sd_ctx;
}

void free_sd_ctx(sd_ctx_t* sd_ctx) {
    if (sd_ctx->sd != nullptr) {
        delete sd_ctx->sd;
        sd_ctx->sd = nullptr;
    }
    free(sd_ctx);
}

enum sample_method_t sd_get_default_sample_method(const sd_ctx_t* sd_ctx) {
    if (sd_ctx != nullptr && sd_ctx->sd != nullptr) {
        if (sd_version_is_dit(sd_ctx->sd->version)) {
            return EULER_SAMPLE_METHOD;
        }
    }
    return EULER_A_SAMPLE_METHOD;
}

enum scheduler_t sd_get_default_scheduler(const sd_ctx_t* sd_ctx, enum sample_method_t sample_method) {
    if (sd_ctx != nullptr && sd_ctx->sd != nullptr) {
        auto edm_v_denoiser = std::dynamic_pointer_cast<EDMVDenoiser>(sd_ctx->sd->denoiser);
        if (edm_v_denoiser) {
            return EXPONENTIAL_SCHEDULER;
        }
    }
    if (sample_method == LCM_SAMPLE_METHOD) {
        return LCM_SCHEDULER;
    }
    return DISCRETE_SCHEDULER;
}

sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                    struct ggml_context* work_ctx,
                                    ggml_tensor* init_latent,
                                    std::string prompt,
                                    std::string negative_prompt,
                                    int clip_skip,
                                    sd_guidance_params_t guidance,
                                    float eta,
                                    int shifted_timestep,
                                    int width,
                                    int height,
                                    enum sample_method_t sample_method,
                                    const std::vector<float>& sigmas,
                                    int64_t seed,
                                    int batch_count,
                                    sd_image_t control_image,
                                    float control_strength,
                                    sd_pm_params_t pm_params,
                                    std::vector<sd_image_t*> ref_images,
                                    std::vector<ggml_tensor*> ref_latents,
                                    bool increase_ref_index,
                                    ggml_tensor* concat_latent            = nullptr,
                                    ggml_tensor* denoise_mask             = nullptr,
                                    const sd_cache_params_t* cache_params = nullptr) {
    if (seed < 0) {
        // Generally, when using the provided command line, the seed is always >0.
        // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
        // by a third party with a seed <0, let's incorporate randomization here.
        srand((int)time(nullptr));
        seed = rand();
    }

    if (!std::isfinite(guidance.img_cfg)) {
        guidance.img_cfg = guidance.txt_cfg;
    }

    int sample_steps = static_cast<int>(sigmas.size() - 1);

    int64_t t0 = ggml_time_ms();

    ConditionerParams condition_params;
    condition_params.text            = prompt;
    condition_params.clip_skip       = clip_skip;
    condition_params.width           = width;
    condition_params.height          = height;
    condition_params.ref_images      = ref_images;
    condition_params.adm_in_channels = static_cast<int>(sd_ctx->sd->diffusion_model->get_adm_in_channels());

    // Photo Maker
    SDCondition id_cond = sd_ctx->sd->get_pmid_conditon(work_ctx, pm_params, condition_params);

    // Get learned condition
    condition_params.zero_out_masked = false;
    SDCondition cond                 = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
                                                                                           sd_ctx->sd->n_threads,
                                                                                           condition_params);

    SDCondition uncond;
    if (guidance.txt_cfg != 1.0 ||
        (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
        bool zero_out_masked = false;
        if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0 && !sd_ctx->sd->is_using_edm_v_parameterization) {
            zero_out_masked = true;
        }
        condition_params.text            = negative_prompt;
        condition_params.zero_out_masked = zero_out_masked;
        uncond                           = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
                                                                                               sd_ctx->sd->n_threads,
                                                                                               condition_params);
    }
    int64_t t1 = ggml_time_ms();
    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);

    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->cond_stage_model->free_params_buffer();
    }

    // Control net hint
    struct ggml_tensor* image_hint = nullptr;
    if (control_image.data != nullptr) {
        image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
        sd_image_to_ggml_tensor(control_image, image_hint);
    }

    // Sample
    std::vector<struct ggml_tensor*> final_latents;  // collect latents to decode
    int C = sd_ctx->sd->get_latent_channel();
    int W = width / sd_ctx->sd->get_vae_scale_factor();
    int H = height / sd_ctx->sd->get_vae_scale_factor();

    struct ggml_tensor* control_latent = nullptr;
    if (sd_version_is_control(sd_ctx->sd->version) && image_hint != nullptr) {
        control_latent = sd_ctx->sd->encode_first_stage(work_ctx, image_hint);
        ggml_ext_tensor_scale_inplace(control_latent, control_strength);
    }

    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
        int64_t mask_channels = 1;
        if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
            mask_channels = 8 * 8;  // flatten the whole mask
        } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
            mask_channels = 1 + init_latent->ne[2];
        }
        auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
        // no mask, set the whole image as masked
        for (int64_t x = 0; x < empty_latent->ne[0]; x++) {
            for (int64_t y = 0; y < empty_latent->ne[1]; y++) {
                if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
                    // TODO: this might be wrong
                    for (int64_t c = 0; c < init_latent->ne[2]; c++) {
                        ggml_ext_tensor_set_f32(empty_latent, 0, x, y, c);
                    }
                    for (int64_t c = init_latent->ne[2]; c < empty_latent->ne[2]; c++) {
                        ggml_ext_tensor_set_f32(empty_latent, 1, x, y, c);
                    }
                } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
                    for (int64_t c = 0; c < empty_latent->ne[2]; c++) {
                        // 0x16,1x1,0x16
                        ggml_ext_tensor_set_f32(empty_latent, c == init_latent->ne[2], x, y, c);
                    }
                } else {
                    ggml_ext_tensor_set_f32(empty_latent, 1, x, y, 0);
                    for (int64_t c = 1; c < empty_latent->ne[2]; c++) {
                        ggml_ext_tensor_set_f32(empty_latent, 0, x, y, c);
                    }
                }
            }
        }

        if (sd_ctx->sd->version == VERSION_FLEX_2 && control_latent != nullptr && sd_ctx->sd->control_net == nullptr) {
            bool no_inpaint = concat_latent == nullptr;
            if (no_inpaint) {
                concat_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
            }
            // fill in the control image here
            for (int64_t x = 0; x < control_latent->ne[0]; x++) {
                for (int64_t y = 0; y < control_latent->ne[1]; y++) {
                    if (no_inpaint) {
                        for (int64_t c = 0; c < concat_latent->ne[2] - control_latent->ne[2]; c++) {
                            // 0x16,1x1,0x16
                            ggml_ext_tensor_set_f32(concat_latent, c == init_latent->ne[2], x, y, c);
                        }
                    }
                    for (int64_t c = 0; c < control_latent->ne[2]; c++) {
                        float v = ggml_ext_tensor_get_f32(control_latent, x, y, c);
                        ggml_ext_tensor_set_f32(concat_latent, v, x, y, concat_latent->ne[2] - control_latent->ne[2] + c);
                    }
                }
            }
        } else if (concat_latent == nullptr) {
            concat_latent = empty_latent;
        }
        cond.c_concat   = concat_latent;
        uncond.c_concat = empty_latent;
        denoise_mask    = nullptr;
    } else if (sd_version_is_unet_edit(sd_ctx->sd->version)) {
        auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
        ggml_set_f32(empty_latent, 0);
        uncond.c_concat = empty_latent;
        cond.c_concat   = ref_latents[0];
        if (cond.c_concat == nullptr) {
            cond.c_concat = empty_latent;
        }
    } else if (sd_version_is_control(sd_ctx->sd->version)) {
        auto empty_latent = ggml_dup_tensor(work_ctx, init_latent);
        ggml_set_f32(empty_latent, 0);
        uncond.c_concat = empty_latent;
        if (sd_ctx->sd->control_net == nullptr) {
            cond.c_concat = control_latent;
        }
        if (cond.c_concat == nullptr) {
            cond.c_concat = empty_latent;
        }
    }
    SDCondition img_cond;
    if (uncond.c_crossattn != nullptr &&
        (sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.txt_cfg != guidance.img_cfg)) {
        img_cond = SDCondition(uncond.c_crossattn, uncond.c_vector, cond.c_concat);
    }
    for (int b = 0; b < batch_count; b++) {
        int64_t sampling_start = ggml_time_ms();
        int64_t cur_seed       = seed + b;
        LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);

        sd_ctx->sd->rng->manual_seed(cur_seed);
        sd_ctx->sd->sampler_rng->manual_seed(cur_seed);
        struct ggml_tensor* x_t   = init_latent;
        struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
        ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng);

        int start_merge_step = -1;
        if (sd_ctx->sd->use_pmid) {
            start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps);
            // if (start_merge_step > 30)
            //     start_merge_step = 30;
            LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
        }

        struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
                                                     sd_ctx->sd->diffusion_model,
                                                     true,
                                                     x_t,
                                                     noise,
                                                     cond,
                                                     uncond,
                                                     img_cond,
                                                     image_hint,
                                                     control_strength,
                                                     guidance,
                                                     eta,
                                                     shifted_timestep,
                                                     sample_method,
                                                     sigmas,
                                                     start_merge_step,
                                                     id_cond,
                                                     ref_latents,
                                                     increase_ref_index,
                                                     denoise_mask,
                                                     nullptr,
                                                     1.0f,
                                                     cache_params);
        int64_t sampling_end    = ggml_time_ms();
        if (x_0 != nullptr) {
            // print_ggml_tensor(x_0);
            LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
            final_latents.push_back(x_0);
        } else {
            LOG_ERROR("sampling for image %d/%d failed after %.2fs", b + 1, batch_count, (sampling_end - sampling_start) * 1.0f / 1000);
        }
    }

    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->diffusion_model->free_params_buffer();
    }
    int64_t t3 = ggml_time_ms();
    LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000);

    // Decode to image
    LOG_INFO("decoding %zu latents", final_latents.size());
    std::vector<struct ggml_tensor*> decoded_images;  // collect decoded images
    for (size_t i = 0; i < final_latents.size(); i++) {
        t1                      = ggml_time_ms();
        struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
        // print_ggml_tensor(img);
        if (img != nullptr) {
            decoded_images.push_back(img);
        }
        int64_t t2 = ggml_time_ms();
        LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
    }

    int64_t t4 = ggml_time_ms();
    LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
    if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) {
        sd_ctx->sd->first_stage_model->free_params_buffer();
    }

    sd_ctx->sd->lora_stat();

    sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t));
    if (result_images == nullptr) {
        ggml_free(work_ctx);
        return nullptr;
    }
    memset(result_images, 0, batch_count * sizeof(sd_image_t));

    for (size_t i = 0; i < decoded_images.size(); i++) {
        result_images[i].width   = width;
        result_images[i].height  = height;
        result_images[i].channel = 3;
        result_images[i].data    = ggml_tensor_to_sd_image(decoded_images[i]);
    }
    ggml_free(work_ctx);

    return result_images;
}

sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
    sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
    int width                     = sd_img_gen_params->width;
    int height                    = sd_img_gen_params->height;

    int vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
    int spatial_multiple            = vae_scale_factor * diffusion_model_down_factor;

    int width_offset  = align_up_offset(width, spatial_multiple);
    int height_offset = align_up_offset(height, spatial_multiple);
    if (width_offset > 0 || height_offset > 0) {
        width += width_offset;
        height += height_offset;
        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple);
    }

    LOG_DEBUG("generate_image %dx%d", width, height);
    if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
        return nullptr;
    }

    struct ggml_init_params params;
    params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1G
    params.mem_buffer = nullptr;
    params.no_alloc   = false;
    // LOG_DEBUG("mem_size %u ", params.mem_size);

    struct ggml_context* work_ctx = ggml_init(params);
    if (!work_ctx) {
        LOG_ERROR("ggml_init() failed");
        return nullptr;
    }

    int64_t seed = sd_img_gen_params->seed;
    if (seed < 0) {
        srand((int)time(nullptr));
        seed = rand();
    }
    sd_ctx->sd->rng->manual_seed(seed);
    sd_ctx->sd->sampler_rng->manual_seed(seed);

    size_t t0 = ggml_time_ms();

    sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift);

    // Apply lora
    sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);

    enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method;
    if (sample_method == SAMPLE_METHOD_COUNT) {
        sample_method = sd_get_default_sample_method(sd_ctx);
    }
    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);

    int sample_steps = sd_img_gen_params->sample_params.sample_steps;
    std::vector<float> sigmas;
    if (sd_img_gen_params->sample_params.custom_sigmas_count > 0) {
        sigmas = std::vector<float>(sd_img_gen_params->sample_params.custom_sigmas,
                                    sd_img_gen_params->sample_params.custom_sigmas + sd_img_gen_params->sample_params.custom_sigmas_count);
        if (sample_steps != sigmas.size() - 1) {
            sample_steps = static_cast<int>(sigmas.size()) - 1;
            LOG_WARN("sample_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps);
        }
    } else {
        scheduler_t scheduler = sd_img_gen_params->sample_params.scheduler;
        if (scheduler == SCHEDULER_COUNT) {
            scheduler = sd_get_default_scheduler(sd_ctx, sample_method);
        }
        sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps,
                                                  sd_ctx->sd->get_image_seq_len(height, width),
                                                  scheduler,
                                                  sd_ctx->sd->version);
    }

    ggml_tensor* init_latent   = nullptr;
    ggml_tensor* concat_latent = nullptr;
    ggml_tensor* denoise_mask  = nullptr;
    if (sd_img_gen_params->init_image.data) {
        LOG_INFO("IMG2IMG");

        size_t t_enc = static_cast<size_t>(sample_steps * sd_img_gen_params->strength);
        if (t_enc == sample_steps)
            t_enc--;
        LOG_INFO("target t_enc is %zu steps", t_enc);
        std::vector<float> sigma_sched;
        sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
        sigmas = sigma_sched;

        ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
        ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);

        sd_image_to_ggml_tensor(sd_img_gen_params->mask_image, mask_img);
        sd_image_to_ggml_tensor(sd_img_gen_params->init_image, init_img);

        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
            int64_t mask_channels = 1;
            if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
                mask_channels = vae_scale_factor * vae_scale_factor;  // flatten the whole mask
            } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
                mask_channels = 1 + sd_ctx->sd->get_latent_channel();
            }
            ggml_tensor* masked_latent = nullptr;

            if (sd_ctx->sd->version != VERSION_FLEX_2) {
                // most inpaint models mask before vae
                ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
                ggml_ext_tensor_apply_mask(init_img, mask_img, masked_img);
                masked_latent = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
                init_latent   = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
            } else {
                // mask after vae
                init_latent   = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
                masked_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1);
                ggml_ext_tensor_apply_mask(init_latent, mask_img, masked_latent, 0.);
            }
            concat_latent = ggml_new_tensor_4d(work_ctx,
                                               GGML_TYPE_F32,
                                               masked_latent->ne[0],
                                               masked_latent->ne[1],
                                               mask_channels + masked_latent->ne[2],
                                               1);
            for (int ix = 0; ix < masked_latent->ne[0]; ix++) {
                for (int iy = 0; iy < masked_latent->ne[1]; iy++) {
                    int mx = ix * vae_scale_factor;
                    int my = iy * vae_scale_factor;
                    if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
                        for (int k = 0; k < masked_latent->ne[2]; k++) {
                            float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k);
                            ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k);
                        }
                        // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
                        for (int x = 0; x < vae_scale_factor; x++) {
                            for (int y = 0; y < vae_scale_factor; y++) {
                                float m = ggml_ext_tensor_get_f32(mask_img, mx + x, my + y);
                                // TODO: check if the way the mask is flattened is correct (is it supposed to be x*vae_scale_factor+y or x+vae_scale_factor*y?)
                                // python code was using "b (h vae_scale_factor) (w vae_scale_factor) -> b (vae_scale_factor vae_scale_factor) h w"
                                ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2] + x * vae_scale_factor + y);
                            }
                        }
                    } else if (sd_ctx->sd->version == VERSION_FLEX_2) {
                        float m = ggml_ext_tensor_get_f32(mask_img, mx, my);
                        // masked image
                        for (int k = 0; k < masked_latent->ne[2]; k++) {
                            float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k);
                            ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k);
                        }
                        // downsampled mask
                        ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, masked_latent->ne[2]);
                        // control (todo: support this)
                        for (int k = 0; k < masked_latent->ne[2]; k++) {
                            ggml_ext_tensor_set_f32(concat_latent, 0, ix, iy, masked_latent->ne[2] + 1 + k);
                        }
                    } else {
                        float m = ggml_ext_tensor_get_f32(mask_img, mx, my);
                        ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, 0);
                        for (int k = 0; k < masked_latent->ne[2]; k++) {
                            float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k);
                            ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
                        }
                    }
                }
            }
        } else {
            init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
        }

        {
            // LOG_WARN("Inpainting with a base model is not great");
            denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / vae_scale_factor, height / vae_scale_factor, 1, 1);
            for (int ix = 0; ix < denoise_mask->ne[0]; ix++) {
                for (int iy = 0; iy < denoise_mask->ne[1]; iy++) {
                    int mx  = ix * vae_scale_factor;
                    int my  = iy * vae_scale_factor;
                    float m = ggml_ext_tensor_get_f32(mask_img, mx, my);
                    ggml_ext_tensor_set_f32(denoise_mask, m, ix, iy);
                }
            }
        }
    } else {
        LOG_INFO("TXT2IMG");
        if (sd_version_is_inpaint(sd_ctx->sd->version)) {
            LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
        }
        init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height);
    }

    sd_guidance_params_t guidance = sd_img_gen_params->sample_params.guidance;
    std::vector<sd_image_t*> ref_images;
    for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
        ref_images.push_back(&sd_img_gen_params->ref_images[i]);
    }

    std::vector<uint8_t> empty_image_data;
    sd_image_t empty_image = {(uint32_t)width, (uint32_t)height, 3, nullptr};
    if (ref_images.empty() && sd_version_is_unet_edit(sd_ctx->sd->version)) {
        LOG_WARN("This model needs at least one reference image; using an empty reference");
        empty_image_data.resize(width * height * 3);
        ref_images.push_back(&empty_image);
        empty_image.data = empty_image_data.data();
        guidance.img_cfg = 0.f;
    }

    if (ref_images.size() > 0) {
        LOG_INFO("EDIT mode");
    }

    std::vector<ggml_tensor*> ref_latents;
    for (int i = 0; i < ref_images.size(); i++) {
        ggml_tensor* img;
        if (sd_img_gen_params->auto_resize_ref_image) {
            LOG_DEBUG("auto resize ref images");
            sd_image_f32_t ref_image = sd_image_t_to_sd_image_f32_t(*ref_images[i]);
            int VAE_IMAGE_SIZE       = std::min(1024 * 1024, width * height);
            double vae_width         = sqrt(VAE_IMAGE_SIZE * ref_image.width / ref_image.height);
            double vae_height        = vae_width * ref_image.height / ref_image.width;

            int factor = 16;
            if (sd_version_is_qwen_image(sd_ctx->sd->version)) {
                factor = 32;
            }

            vae_height = round(vae_height / factor) * factor;
            vae_width  = round(vae_width / factor) * factor;

            sd_image_f32_t resized_image = resize_sd_image_f32_t(ref_image, static_cast<int>(vae_width), static_cast<int>(vae_height));
            free(ref_image.data);
            ref_image.data = nullptr;

            LOG_DEBUG("resize vae ref image %d from %dx%d to %dx%d", i, ref_image.height, ref_image.width, resized_image.height, resized_image.width);

            img = ggml_new_tensor_4d(work_ctx,
                                     GGML_TYPE_F32,
                                     resized_image.width,
                                     resized_image.height,
                                     3,
                                     1);
            sd_image_f32_to_ggml_tensor(resized_image, img);
            free(resized_image.data);
            resized_image.data = nullptr;
        } else {
            img = ggml_new_tensor_4d(work_ctx,
                                     GGML_TYPE_F32,
                                     ref_images[i]->width,
                                     ref_images[i]->height,
                                     3,
                                     1);
            sd_image_to_ggml_tensor(*ref_images[i], img);
        }

        // print_ggml_tensor(img, false, "img");

        ggml_tensor* latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
        ref_latents.push_back(latent);
    }

    if (sd_img_gen_params->init_image.data != nullptr || sd_img_gen_params->ref_images_count > 0) {
        size_t t1 = ggml_time_ms();
        LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
    }

    sd_image_t* result_images = generate_image_internal(sd_ctx,
                                                        work_ctx,
                                                        init_latent,
                                                        SAFE_STR(sd_img_gen_params->prompt),
                                                        SAFE_STR(sd_img_gen_params->negative_prompt),
                                                        sd_img_gen_params->clip_skip,
                                                        guidance,
                                                        sd_img_gen_params->sample_params.eta,
                                                        sd_img_gen_params->sample_params.shifted_timestep,
                                                        width,
                                                        height,
                                                        sample_method,
                                                        sigmas,
                                                        seed,
                                                        sd_img_gen_params->batch_count,
                                                        sd_img_gen_params->control_image,
                                                        sd_img_gen_params->control_strength,
                                                        sd_img_gen_params->pm_params,
                                                        ref_images,
                                                        ref_latents,
                                                        sd_img_gen_params->increase_ref_index,
                                                        concat_latent,
                                                        denoise_mask,
                                                        &sd_img_gen_params->cache);

    size_t t2 = ggml_time_ms();

    LOG_INFO("generate_image completed in %.2fs", (t2 - t0) * 1.0f / 1000);

    return result_images;
}

SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, int* num_frames_out) {
    if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) {
        return nullptr;
    }
    sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params;

    std::string prompt          = SAFE_STR(sd_vid_gen_params->prompt);
    std::string negative_prompt = SAFE_STR(sd_vid_gen_params->negative_prompt);

    int width        = sd_vid_gen_params->width;
    int height       = sd_vid_gen_params->height;
    int frames       = sd_vid_gen_params->video_frames;
    frames           = (frames - 1) / 4 * 4 + 1;
    int sample_steps = sd_vid_gen_params->sample_params.sample_steps;

    int vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
    int spatial_multiple            = vae_scale_factor * diffusion_model_down_factor;

    int width_offset  = align_up_offset(width, spatial_multiple);
    int height_offset = align_up_offset(height, spatial_multiple);
    if (width_offset > 0 || height_offset > 0) {
        width += width_offset;
        height += height_offset;
        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple);
    }
    LOG_INFO("generate_video %dx%dx%d", width, height, frames);

    sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);

    enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
    if (sample_method == SAMPLE_METHOD_COUNT) {
        sample_method = sd_get_default_sample_method(sd_ctx);
    }
    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);

    int high_noise_sample_steps = 0;
    if (sd_ctx->sd->high_noise_diffusion_model) {
        high_noise_sample_steps = sd_vid_gen_params->high_noise_sample_params.sample_steps;
    }

    int total_steps = sample_steps;

    if (high_noise_sample_steps > 0) {
        total_steps += high_noise_sample_steps;
    }

    std::vector<float> sigmas;
    if (sd_vid_gen_params->sample_params.custom_sigmas_count > 0) {
        sigmas = std::vector<float>(sd_vid_gen_params->sample_params.custom_sigmas,
                                    sd_vid_gen_params->sample_params.custom_sigmas + sd_vid_gen_params->sample_params.custom_sigmas_count);
        if (total_steps != sigmas.size() - 1) {
            total_steps = static_cast<int>(sigmas.size()) - 1;
            LOG_WARN("total_steps != custom_sigmas_count - 1, set total_steps to %d", total_steps);
            if (sample_steps >= total_steps) {
                sample_steps = total_steps;
                LOG_WARN("total_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps);
            }
            if (high_noise_sample_steps > 0) {
                high_noise_sample_steps = total_steps - sample_steps;
                LOG_WARN("total_steps != custom_sigmas_count - 1, set high_noise_sample_steps to %d", high_noise_sample_steps);
            }
        }
    } else {
        scheduler_t scheduler = sd_vid_gen_params->sample_params.scheduler;
        if (scheduler == SCHEDULER_COUNT) {
            scheduler = sd_get_default_scheduler(sd_ctx, sample_method);
        }
        sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps,
                                                  0,
                                                  scheduler,
                                                  sd_ctx->sd->version);
    }

    if (high_noise_sample_steps < 0) {
        // timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
        for (size_t i = 0; i < sigmas.size(); ++i) {
            if (sigmas[i] < sd_vid_gen_params->moe_boundary) {
                high_noise_sample_steps = static_cast<int>(i);
                break;
            }
        }
        LOG_DEBUG("switching from high noise model at step %d", high_noise_sample_steps);
    }

    struct ggml_init_params params;
    params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1G
    params.mem_buffer = nullptr;
    params.no_alloc   = false;
    // LOG_DEBUG("mem_size %u ", params.mem_size);

    struct ggml_context* work_ctx = ggml_init(params);
    if (!work_ctx) {
        LOG_ERROR("ggml_init() failed");
        return nullptr;
    }

    int64_t seed = sd_vid_gen_params->seed;
    if (seed < 0) {
        seed = (int)time(nullptr);
    }

    sd_ctx->sd->rng->manual_seed(seed);
    sd_ctx->sd->sampler_rng->manual_seed(seed);

    int64_t t0 = ggml_time_ms();

    // Apply lora
    sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);

    ggml_tensor* init_latent        = nullptr;
    ggml_tensor* clip_vision_output = nullptr;
    ggml_tensor* concat_latent      = nullptr;
    ggml_tensor* denoise_mask       = nullptr;
    ggml_tensor* vace_context       = nullptr;
    int64_t ref_image_num           = 0;  // for vace
    if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
        sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-I2V-14B" ||
        sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" ||
        sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") {
        LOG_INFO("IMG2VID");

        if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
            sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-I2V-1.3B" ||
            sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") {
            if (sd_vid_gen_params->init_image.data) {
                clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2);
            } else {
                clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->init_image, false, -2, true);
            }

            if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") {
                ggml_tensor* end_image_clip_vision_output = nullptr;
                if (sd_vid_gen_params->end_image.data) {
                    end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2);
                } else {
                    end_image_clip_vision_output = sd_ctx->sd->get_clip_vision_output(work_ctx, sd_vid_gen_params->end_image, false, -2, true);
                }
                clip_vision_output = ggml_ext_tensor_concat(work_ctx, clip_vision_output, end_image_clip_vision_output, 1);
            }

            int64_t t1 = ggml_time_ms();
            LOG_INFO("get_clip_vision_output completed, taking %" PRId64 " ms", t1 - t0);
        }

        int64_t t1         = ggml_time_ms();
        ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3);
        ggml_ext_tensor_iter(image, [&](ggml_tensor* image, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = 0.5f;
            if (i2 == 0 && sd_vid_gen_params->init_image.data) {  // start image
                value = *(sd_vid_gen_params->init_image.data + i1 * width * 3 + i0 * 3 + i3);
                value /= 255.f;
            } else if (i2 == frames - 1 && sd_vid_gen_params->end_image.data) {
                value = *(sd_vid_gen_params->end_image.data + i1 * width * 3 + i0 * 3 + i3);
                value /= 255.f;
            }
            ggml_ext_tensor_set_f32(image, value, i0, i1, i2, i3);
        });

        concat_latent = sd_ctx->sd->encode_first_stage(work_ctx, image);  // [b*c, t, h/vae_scale_factor, w/vae_scale_factor]

        int64_t t2 = ggml_time_ms();
        LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);

        ggml_tensor* concat_mask = ggml_new_tensor_4d(work_ctx,
                                                      GGML_TYPE_F32,
                                                      concat_latent->ne[0],
                                                      concat_latent->ne[1],
                                                      concat_latent->ne[2],
                                                      4);  // [b*4, t, w/vae_scale_factor, h/vae_scale_factor]
        ggml_ext_tensor_iter(concat_mask, [&](ggml_tensor* concat_mask, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = 0.0f;
            if (i2 == 0 && sd_vid_gen_params->init_image.data) {  // start image
                value = 1.0f;
            } else if (i2 == frames - 1 && sd_vid_gen_params->end_image.data && i3 == 3) {
                value = 1.0f;
            }
            ggml_ext_tensor_set_f32(concat_mask, value, i0, i1, i2, i3);
        });

        concat_latent = ggml_ext_tensor_concat(work_ctx, concat_mask, concat_latent, 3);  // [b*(c+4), t, h/vae_scale_factor, w/vae_scale_factor]
    } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.2-TI2V-5B" && sd_vid_gen_params->init_image.data) {
        LOG_INFO("IMG2VID");

        int64_t t1            = ggml_time_ms();
        ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
        sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, init_img);
        init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3);

        auto init_image_latent = sd_ctx->sd->vae_encode(work_ctx, init_img);  // [b*c, 1, h/16, w/16]

        init_latent  = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true);
        denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1);
        ggml_set_f32(denoise_mask, 1.f);

        if (!sd_ctx->sd->use_tiny_autoencoder)
            sd_ctx->sd->process_latent_out(init_latent);

        ggml_ext_tensor_iter(init_image_latent, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3);
            ggml_ext_tensor_set_f32(init_latent, value, i0, i1, i2, i3);
            if (i3 == 0) {
                ggml_ext_tensor_set_f32(denoise_mask, 0.f, i0, i1, i2, i3);
            }
        });

        if (!sd_ctx->sd->use_tiny_autoencoder)
            sd_ctx->sd->process_latent_in(init_latent);

        int64_t t2 = ggml_time_ms();
        LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
    } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" ||
               sd_ctx->sd->diffusion_model->get_desc() == "Wan2.x-VACE-14B") {
        LOG_INFO("VACE");
        int64_t t1                    = ggml_time_ms();
        ggml_tensor* ref_image_latent = nullptr;
        if (sd_vid_gen_params->init_image.data) {
            ggml_tensor* ref_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
            sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, ref_img);
            ref_img = ggml_reshape_4d(work_ctx, ref_img, width, height, 1, 3);

            ref_image_latent = sd_ctx->sd->encode_first_stage(work_ctx, ref_img);  // [b*c, 1, h/16, w/16]
            auto zero_latent = ggml_dup_tensor(work_ctx, ref_image_latent);
            ggml_set_f32(zero_latent, 0.f);
            ref_image_latent = ggml_ext_tensor_concat(work_ctx, ref_image_latent, zero_latent, 3);  // [b*2*c, 1, h/16, w/16]
        }

        ggml_tensor* control_video = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 3);
        ggml_ext_tensor_iter(control_video, [&](ggml_tensor* control_video, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = 0.5f;
            if (i2 < sd_vid_gen_params->control_frames_size) {
                value = sd_image_get_f32(sd_vid_gen_params->control_frames[i2], i0, i1, i3);
            }
            ggml_ext_tensor_set_f32(control_video, value, i0, i1, i2, i3);
        });
        ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, frames, 1);
        ggml_set_f32(mask, 1.0f);
        ggml_tensor* inactive = ggml_dup_tensor(work_ctx, control_video);
        ggml_tensor* reactive = ggml_dup_tensor(work_ctx, control_video);

        ggml_ext_tensor_iter(control_video, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float control_video_value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3) - 0.5f;
            float mask_value          = ggml_ext_tensor_get_f32(mask, i0, i1, i2, 0);
            float inactive_value      = (control_video_value * (1.f - mask_value)) + 0.5f;
            float reactive_value      = (control_video_value * mask_value) + 0.5f;

            ggml_ext_tensor_set_f32(inactive, inactive_value, i0, i1, i2, i3);
            ggml_ext_tensor_set_f32(reactive, reactive_value, i0, i1, i2, i3);
        });

        inactive = sd_ctx->sd->encode_first_stage(work_ctx, inactive);  // [b*c, t, h/vae_scale_factor, w/vae_scale_factor]
        reactive = sd_ctx->sd->encode_first_stage(work_ctx, reactive);  // [b*c, t, h/vae_scale_factor, w/vae_scale_factor]

        int64_t length = inactive->ne[2];
        if (ref_image_latent) {
            length += 1;
            frames        = static_cast<int>((length - 1) * 4 + 1);
            ref_image_num = 1;
        }
        vace_context = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, inactive->ne[0], inactive->ne[1], length, 96);  // [b*96, t, h/vae_scale_factor, w/vae_scale_factor]
        ggml_ext_tensor_iter(vace_context, [&](ggml_tensor* vace_context, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value;
            if (i3 < 32) {
                if (ref_image_latent && i2 == 0) {
                    value = ggml_ext_tensor_get_f32(ref_image_latent, i0, i1, 0, i3);
                } else {
                    if (i3 < 16) {
                        value = ggml_ext_tensor_get_f32(inactive, i0, i1, i2 - ref_image_num, i3);
                    } else {
                        value = ggml_ext_tensor_get_f32(reactive, i0, i1, i2 - ref_image_num, i3 - 16);
                    }
                }
            } else {  // mask
                if (ref_image_latent && i2 == 0) {
                    value = 0.f;
                } else {
                    int64_t vae_stride        = vae_scale_factor;
                    int64_t mask_height_index = i1 * vae_stride + (i3 - 32) / vae_stride;
                    int64_t mask_width_index  = i0 * vae_stride + (i3 - 32) % vae_stride;
                    value                     = ggml_ext_tensor_get_f32(mask, mask_width_index, mask_height_index, i2 - ref_image_num, 0);
                }
            }
            ggml_ext_tensor_set_f32(vace_context, value, i0, i1, i2, i3);
        });
        int64_t t2 = ggml_time_ms();
        LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
    }

    if (init_latent == nullptr) {
        init_latent = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true);
    }

    // Get learned condition
    ConditionerParams condition_params;
    condition_params.clip_skip       = sd_vid_gen_params->clip_skip;
    condition_params.zero_out_masked = true;
    condition_params.text            = prompt;

    int64_t t1       = ggml_time_ms();
    SDCondition cond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
                                                                           sd_ctx->sd->n_threads,
                                                                           condition_params);
    cond.c_concat    = concat_latent;
    cond.c_vector    = clip_vision_output;
    SDCondition uncond;
    if (sd_vid_gen_params->sample_params.guidance.txt_cfg != 1.0 || sd_vid_gen_params->high_noise_sample_params.guidance.txt_cfg != 1.0) {
        condition_params.text = negative_prompt;
        uncond                = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
                                                                                    sd_ctx->sd->n_threads,
                                                                                    condition_params);
        uncond.c_concat       = concat_latent;
        uncond.c_vector       = clip_vision_output;
    }
    int64_t t2 = ggml_time_ms();
    LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1);

    if (sd_ctx->sd->free_params_immediately) {
        sd_ctx->sd->cond_stage_model->free_params_buffer();
    }

    int W = width / vae_scale_factor;
    int H = height / vae_scale_factor;
    int T = static_cast<int>(init_latent->ne[2]);
    int C = sd_ctx->sd->get_latent_channel();

    struct ggml_tensor* final_latent;
    struct ggml_tensor* x_t   = init_latent;
    struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, T, C);
    ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng);
    // High Noise Sample
    if (high_noise_sample_steps > 0) {
        LOG_DEBUG("sample(high noise) %dx%dx%d", W, H, T);
        enum sample_method_t high_noise_sample_method = sd_vid_gen_params->high_noise_sample_params.sample_method;
        if (high_noise_sample_method == SAMPLE_METHOD_COUNT) {
            high_noise_sample_method = sd_get_default_sample_method(sd_ctx);
        }
        LOG_INFO("sampling(high noise) using %s method", sampling_methods_str[high_noise_sample_method]);

        int64_t sampling_start = ggml_time_ms();

        std::vector<float> high_noise_sigmas = std::vector<float>(sigmas.begin(), sigmas.begin() + high_noise_sample_steps + 1);
        sigmas                               = std::vector<float>(sigmas.begin() + high_noise_sample_steps, sigmas.end());

        x_t = sd_ctx->sd->sample(work_ctx,
                                 sd_ctx->sd->high_noise_diffusion_model,
                                 false,
                                 x_t,
                                 noise,
                                 cond,
                                 uncond,
                                 {},
                                 nullptr,
                                 0,
                                 sd_vid_gen_params->high_noise_sample_params.guidance,
                                 sd_vid_gen_params->high_noise_sample_params.eta,
                                 sd_vid_gen_params->high_noise_sample_params.shifted_timestep,
                                 high_noise_sample_method,
                                 high_noise_sigmas,
                                 -1,
                                 {},
                                 {},
                                 false,
                                 denoise_mask,
                                 vace_context,
                                 sd_vid_gen_params->vace_strength,
                                 &sd_vid_gen_params->cache);

        int64_t sampling_end = ggml_time_ms();
        LOG_INFO("sampling(high noise) completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
        if (sd_ctx->sd->free_params_immediately) {
            sd_ctx->sd->high_noise_diffusion_model->free_params_buffer();
        }
        noise = nullptr;
    }

    // Sample
    {
        LOG_DEBUG("sample %dx%dx%d", W, H, T);
        int64_t sampling_start = ggml_time_ms();

        final_latent = sd_ctx->sd->sample(work_ctx,
                                          sd_ctx->sd->diffusion_model,
                                          true,
                                          x_t,
                                          noise,
                                          cond,
                                          uncond,
                                          {},
                                          nullptr,
                                          0,
                                          sd_vid_gen_params->sample_params.guidance,
                                          sd_vid_gen_params->sample_params.eta,
                                          sd_vid_gen_params->sample_params.shifted_timestep,
                                          sample_method,
                                          sigmas,
                                          -1,
                                          {},
                                          {},
                                          false,
                                          denoise_mask,
                                          vace_context,
                                          sd_vid_gen_params->vace_strength,
                                          &sd_vid_gen_params->cache);

        int64_t sampling_end = ggml_time_ms();
        LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
        if (sd_ctx->sd->free_params_immediately) {
            sd_ctx->sd->diffusion_model->free_params_buffer();
        }
    }

    if (ref_image_num > 0) {
        ggml_tensor* trim_latent = ggml_new_tensor_4d(work_ctx,
                                                      GGML_TYPE_F32,
                                                      final_latent->ne[0],
                                                      final_latent->ne[1],
                                                      final_latent->ne[2] - ref_image_num,
                                                      final_latent->ne[3]);
        ggml_ext_tensor_iter(trim_latent, [&](ggml_tensor* trim_latent, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
            float value = ggml_ext_tensor_get_f32(final_latent, i0, i1, i2 + ref_image_num, i3);
            ggml_ext_tensor_set_f32(trim_latent, value, i0, i1, i2, i3);
        });
        final_latent = trim_latent;
    }

    int64_t t4 = ggml_time_ms();
    LOG_INFO("generating latent video completed, taking %.2fs", (t4 - t2) * 1.0f / 1000);
    struct ggml_tensor* vid = sd_ctx->sd->decode_first_stage(work_ctx, final_latent, true);
    int64_t t5              = ggml_time_ms();
    LOG_INFO("decode_first_stage completed, taking %.2fs", (t5 - t4) * 1.0f / 1000);
    if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) {
        sd_ctx->sd->first_stage_model->free_params_buffer();
    }

    sd_ctx->sd->lora_stat();

    sd_image_t* result_images = (sd_image_t*)calloc(vid->ne[2], sizeof(sd_image_t));
    if (result_images == nullptr) {
        ggml_free(work_ctx);
        return nullptr;
    }
    *num_frames_out = static_cast<int>(vid->ne[2]);

    for (int64_t i = 0; i < vid->ne[2]; i++) {
        result_images[i].width   = static_cast<uint32_t>(vid->ne[0]);
        result_images[i].height  = static_cast<uint32_t>(vid->ne[1]);
        result_images[i].channel = 3;
        result_images[i].data    = ggml_tensor_to_sd_image(vid, static_cast<int>(i), true);
    }
    ggml_free(work_ctx);

    LOG_INFO("generate_video completed in %.2fs", (t5 - t0) * 1.0f / 1000);

    return result_images;
}

//added for kcpp
void SetCircularAxesAll(sd_ctx_t* sd_ctx, bool circular_x, bool circular_y)
{
    sd_ctx->sd->SetCircularAxesAll(circular_x, circular_y);
}
//end added for kcpp