sd: sync to master-383-20eb674

2026-05-09 19:46:11 +00:00 · 2025-11-30 07:08:55 -03:00 · 2025-11-30 07:08:55 -03:00 · 3db48a1536
commit 3db48a1536
parent 438eae7105
18 changed files with 490825 additions and 278 deletions
--- a/otherarch/sdcpp/conditioner.hpp
+++ b/otherarch/sdcpp/conditioner.hpp
@ -2,7 +2,7 @@
 #define __CONDITIONER_HPP__

 #include "clip.hpp"
-#include "qwenvl.hpp"
+#include "llm.hpp"
 #include "t5.hpp"

 struct SDCondition {
@ -1623,61 +1623,72 @@ struct T5CLIPEmbedder : public Conditioner {
    }
 };

-struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
-    Qwen::Qwen2Tokenizer tokenizer;
-    std::shared_ptr<Qwen::Qwen2_5_VLRunner> qwenvl;
+struct LLMEmbedder : public Conditioner {
+    SDVersion version;
+    std::shared_ptr<LLM::BPETokenizer> tokenizer;
+    std::shared_ptr<LLM::LLMRunner> llm;

-    Qwen2_5_VLCLIPEmbedder(ggml_backend_t backend,
-                           bool offload_params_to_cpu,
-                           const String2TensorStorage& tensor_storage_map = {},
-                           const std::string prefix                       = "",
-                           bool enable_vision                             = false) {
-        qwenvl = std::make_shared<Qwen::Qwen2_5_VLRunner>(backend,
-                                                          offload_params_to_cpu,
-                                                          tensor_storage_map,
-                                                          "text_encoders.qwen2vl",
-                                                          enable_vision);
+    LLMEmbedder(ggml_backend_t backend,
+                bool offload_params_to_cpu,
+                const String2TensorStorage& tensor_storage_map = {},
+                SDVersion version                              = VERSION_QWEN_IMAGE,
+                const std::string prefix                       = "",
+                bool enable_vision                             = false)
+        : version(version) {
+        LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
+        if (sd_version_is_flux2(version)) {
+            arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
+        }
+        if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
+            tokenizer = std::make_shared<LLM::MistralTokenizer>();
+        } else {
+            tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
+        }
+        llm = std::make_shared<LLM::LLMRunner>(arch,
+                                               backend,
+                                               offload_params_to_cpu,
+                                               tensor_storage_map,
+                                               "text_encoders.llm",
+                                               enable_vision);
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
-        qwenvl->get_param_tensors(tensors, "text_encoders.qwen2vl");
+        llm->get_param_tensors(tensors, "text_encoders.llm");
    }

    void alloc_params_buffer() override {
-        qwenvl->alloc_params_buffer();
+        llm->alloc_params_buffer();
    }

    void free_params_buffer() override {
-        qwenvl->free_params_buffer();
+        llm->free_params_buffer();
    }

    size_t get_params_buffer_size() override {
        size_t buffer_size = 0;
-        buffer_size += qwenvl->get_params_buffer_size();
+        buffer_size += llm->get_params_buffer_size();
        return buffer_size;
    }

    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
-        if (qwenvl) {
-            qwenvl->set_weight_adapter(adapter);
+        if (llm) {
+            llm->set_weight_adapter(adapter);
        }
    }

    std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
-                                                              size_t max_length           = 0,
-                                                              size_t system_prompt_length = 0,
-                                                              bool padding                = false) {
+                                                              std::pair<int, int> attn_range,
+                                                              size_t max_length = 0,
+                                                              bool padding      = false) {
        std::vector<std::pair<std::string, float>> parsed_attention;
-        if (system_prompt_length > 0) {
-            parsed_attention.emplace_back(text.substr(0, system_prompt_length), 1.f);
-            auto new_parsed_attention = parse_prompt_attention(text.substr(system_prompt_length, text.size() - system_prompt_length));
+        parsed_attention.emplace_back(text.substr(0, attn_range.first), 1.f);
+        if (attn_range.second - attn_range.first > 0) {
+            auto new_parsed_attention = parse_prompt_attention(text.substr(attn_range.first, attn_range.second - attn_range.first));
            parsed_attention.insert(parsed_attention.end(),
                                    new_parsed_attention.begin(),
                                    new_parsed_attention.end());
-        } else {
-            parsed_attention = parse_prompt_attention(text);
        }
-
+        parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
        {
            std::stringstream ss;
            ss << "[";
@ -1693,12 +1704,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
        for (const auto& item : parsed_attention) {
            const std::string& curr_text = item.first;
            float curr_weight            = item.second;
-            std::vector<int> curr_tokens = tokenizer.tokenize(curr_text, nullptr);
+            std::vector<int> curr_tokens = tokenizer->tokenize(curr_text, nullptr);
            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
        }

-        tokenizer.pad_tokens(tokens, weights, max_length, padding);
+        tokenizer->pad_tokens(tokens, weights, max_length, padding);

        // for (int i = 0; i < tokens.size(); i++) {
        //     std::cout << tokens[i] << ":" << weights[i] << ", " << i << std::endl;
@ -1713,9 +1724,10 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
                                      const ConditionerParams& conditioner_params) override {
        std::string prompt;
        std::vector<std::pair<int, ggml_tensor*>> image_embeds;
-        size_t system_prompt_length          = 0;
+        std::pair<int, int> prompt_attn_range;
        int prompt_template_encode_start_idx = 34;
-        if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
+        std::set<int> out_layers;
+        if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
            LOG_INFO("QwenImageEditPlusPipeline");
            prompt_template_encode_start_idx = 64;
            int image_embed_idx              = 64 + 6;
@ -1727,7 +1739,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {

            for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
                sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
-                double factor        = qwenvl->params.vision.patch_size * qwenvl->params.vision.spatial_merge_size;
+                double factor        = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
                int height           = image.height;
                int width            = image.width;
                int h_bar            = static_cast<int>(std::round(height / factor)) * factor;
@ -1757,7 +1769,7 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
                resized_image.data = nullptr;

                ggml_tensor* image_embed = nullptr;
-                qwenvl->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
+                llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
                image_embeds.emplace_back(image_embed_idx, image_embed);
                image_embed_idx += 1 + image_embed->ne[1] + 6;

@ -1771,17 +1783,37 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
            }

            prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
-
-            system_prompt_length = prompt.size();
-
            prompt += img_prompt;
+
+            prompt_attn_range.first = prompt.size();
            prompt += conditioner_params.text;
+            prompt_attn_range.second = prompt.size();
+
            prompt += "<|im_end|>\n<|im_start|>assistant\n";
+        } else if (sd_version_is_flux2(version)) {
+            prompt_template_encode_start_idx = 0;
+            out_layers                       = {10, 20, 30};
+
+            prompt = "[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]";
+
+            prompt_attn_range.first = prompt.size();
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = prompt.size();
+
+            prompt += "[/INST]";
        } else {
-            prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n";
+            prompt_template_encode_start_idx = 34;
+
+            prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n";
+
+            prompt_attn_range.first = prompt.size();
+            prompt += conditioner_params.text;
+            prompt_attn_range.second = prompt.size();
+
+            prompt += "<|im_end|>\n<|im_start|>assistant\n";
        }

-        auto tokens_and_weights = tokenize(prompt, 0, system_prompt_length, false);
+        auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false);
        auto& tokens            = std::get<0>(tokens_and_weights);
        auto& weights           = std::get<1>(tokens_and_weights);

@ -1790,11 +1822,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {

        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);

-        qwenvl->compute(n_threads,
-                        input_ids,
-                        image_embeds,
-                        &hidden_states,
-                        work_ctx);
+        llm->compute(n_threads,
+                     input_ids,
+                     image_embeds,
+                     out_layers,
+                     &hidden_states,
+                     work_ctx);
        {
            auto tensor         = hidden_states;
            float original_mean = ggml_ext_tensor_mean(tensor);
@ -1813,14 +1846,25 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {

        GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);

+        int64_t zero_pad_len = 0;
+        if (sd_version_is_flux2(version)) {
+            int64_t min_length = 512;
+            if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
+                zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx;
+            }
+        }
+
        ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
                                                            GGML_TYPE_F32,
                                                            hidden_states->ne[0],
-                                                            hidden_states->ne[1] - prompt_template_encode_start_idx,
+                                                            hidden_states->ne[1] - prompt_template_encode_start_idx + zero_pad_len,
                                                            hidden_states->ne[2]);

        ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
-            float value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
+            float value = 0.f;
+            if (i1 + prompt_template_encode_start_idx < hidden_states->ne[1]) {
+                value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
+            }
            ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
        });

--- a/otherarch/sdcpp/denoiser.hpp
+++ b/otherarch/sdcpp/denoiser.hpp
@ -356,7 +356,7 @@ struct Denoiser {
    virtual ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) = 0;
    virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent)             = 0;

-    virtual std::vector<float> get_sigmas(uint32_t n, scheduler_t scheduler_type, SDVersion version) {
+    virtual std::vector<float> get_sigmas(uint32_t n, int /*image_seq_len*/, scheduler_t scheduler_type, SDVersion version) {
        auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1);
        std::shared_ptr<SigmaScheduler> scheduler;
        switch (scheduler_type) {
@ -582,10 +582,14 @@ struct FluxFlowDenoiser : public Denoiser {
        set_parameters(shift);
    }

-    void set_parameters(float shift = 1.15f) {
+    void set_shift(float shift) {
        this->shift = shift;
-        for (int i = 1; i < TIMESTEPS + 1; i++) {
-            sigmas[i - 1] = t_to_sigma(i / TIMESTEPS * TIMESTEPS);
+    }
+
+    void set_parameters(float shift) {
+        set_shift(shift);
+        for (int i = 0; i < TIMESTEPS; i++) {
+            sigmas[i] = t_to_sigma(i);
        }
    }

@ -627,6 +631,38 @@ struct FluxFlowDenoiser : public Denoiser {
    }
 };

+struct Flux2FlowDenoiser : public FluxFlowDenoiser {
+    Flux2FlowDenoiser() = default;
+
+    float compute_empirical_mu(uint32_t n, int image_seq_len) {
+        const float a1 = 8.73809524e-05f;
+        const float b1 = 1.89833333f;
+        const float a2 = 0.00016927f;
+        const float b2 = 0.45666666f;
+
+        if (image_seq_len > 4300) {
+            float mu = a2 * image_seq_len + b2;
+            return mu;
+        }
+
+        float m_200 = a2 * image_seq_len + b2;
+        float m_10  = a1 * image_seq_len + b1;
+
+        float a  = (m_200 - m_10) / 190.0f;
+        float b  = m_200 - 200.0f * a;
+        float mu = a * n + b;
+
+        return mu;
+    }
+
+    std::vector<float> get_sigmas(uint32_t n, int image_seq_len, scheduler_t scheduler_type, SDVersion version) override {
+        float mu = compute_empirical_mu(n, image_seq_len);
+        LOG_DEBUG("Flux2FlowDenoiser: set shift to %.3f", mu);
+        set_shift(mu);
+        return Denoiser::get_sigmas(n, image_seq_len, scheduler_type, version);
+    }
+};
+
 typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;

 // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
--- a/otherarch/sdcpp/flux.hpp
+++ b/otherarch/sdcpp/flux.hpp
@ -14,9 +14,9 @@ namespace Flux {

    struct MLPEmbedder : public UnaryBlock {
    public:
-        MLPEmbedder(int64_t in_dim, int64_t hidden_dim) {
-            blocks["in_layer"]  = std::shared_ptr<GGMLBlock>(new Linear(in_dim, hidden_dim, true));
-            blocks["out_layer"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, hidden_dim, true));
+        MLPEmbedder(int64_t in_dim, int64_t hidden_dim, bool bias = true) {
+            blocks["in_layer"]  = std::shared_ptr<GGMLBlock>(new Linear(in_dim, hidden_dim, bias));
+            blocks["out_layer"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, hidden_dim, bias));
        }

        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
@ -89,12 +89,13 @@ namespace Flux {
    public:
        SelfAttention(int64_t dim,
                      int64_t num_heads = 8,
-                      bool qkv_bias     = false)
+                      bool qkv_bias     = false,
+                      bool proj_bias    = true)
            : num_heads(num_heads) {
            int64_t head_dim = dim / num_heads;
            blocks["qkv"]    = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
            blocks["norm"]   = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
-            blocks["proj"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
+            blocks["proj"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, dim, proj_bias));
        }

        std::vector<struct ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
@ -155,10 +156,10 @@ namespace Flux {
        int multiplier;

    public:
-        Modulation(int64_t dim, bool is_double)
+        Modulation(int64_t dim, bool is_double, bool bias = true)
            : is_double(is_double) {
            multiplier    = is_double ? 6 : 3;
-            blocks["lin"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * multiplier));
+            blocks["lin"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * multiplier, bias));
        }

        std::vector<ModulationOut> forward(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
@ -198,37 +199,43 @@ namespace Flux {
    struct DoubleStreamBlock : public GGMLBlock {
        bool prune_mod;
        int idx = 0;
+        bool use_mlp_silu_act;

    public:
        DoubleStreamBlock(int64_t hidden_size,
                          int64_t num_heads,
                          float mlp_ratio,
-                          int idx        = 0,
-                          bool qkv_bias  = false,
-                          bool prune_mod = false)
-            : idx(idx), prune_mod(prune_mod) {
-            int64_t mlp_hidden_dim = hidden_size * mlp_ratio;
-            if (!prune_mod) {
+                          int idx               = 0,
+                          bool qkv_bias         = false,
+                          bool prune_mod        = false,
+                          bool share_modulation = false,
+                          bool mlp_proj_bias    = true,
+                          bool use_mlp_silu_act = false)
+            : idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) {
+            int64_t mlp_hidden_dim  = hidden_size * mlp_ratio;
+            int64_t mlp_mult_factor = use_mlp_silu_act ? 2 : 1;
+
+            if (!prune_mod && !share_modulation) {
                blocks["img_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
            }
            blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias));
+            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));

            blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["img_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
+            blocks["img_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
            // img_mlp.1 is nn.GELU(approximate="tanh")
-            blocks["img_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size));
+            blocks["img_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size, mlp_proj_bias));

-            if (!prune_mod) {
+            if (!prune_mod && !share_modulation) {
                blocks["txt_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
            }
            blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias));
+            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));

            blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["txt_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
+            blocks["txt_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
            // img_mlp.1 is nn.GELU(approximate="tanh")
-            blocks["txt_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size));
+            blocks["txt_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size, mlp_proj_bias));
        }

        std::vector<ModulationOut> get_distil_img_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
@ -254,7 +261,9 @@ namespace Flux {
                                                                    struct ggml_tensor* txt,
                                                                    struct ggml_tensor* vec,
                                                                    struct ggml_tensor* pe,
-                                                                    struct ggml_tensor* mask = nullptr) {
+                                                                    struct ggml_tensor* mask            = nullptr,
+                                                                    std::vector<ModulationOut> img_mods = {},
+                                                                    std::vector<ModulationOut> txt_mods = {}) {
            // img: [N, n_img_token, hidden_size]
            // txt: [N, n_txt_token, hidden_size]
            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
@ -273,21 +282,23 @@ namespace Flux {
            auto txt_mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.0"]);
            auto txt_mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.2"]);

-            std::vector<ModulationOut> img_mods;
-            if (prune_mod) {
-                img_mods = get_distil_img_mod(ctx, vec);
-            } else {
-                auto img_mod = std::dynamic_pointer_cast<Modulation>(blocks["img_mod"]);
-                img_mods     = img_mod->forward(ctx, vec);
+            if (img_mods.empty()) {
+                if (prune_mod) {
+                    img_mods = get_distil_img_mod(ctx, vec);
+                } else {
+                    auto img_mod = std::dynamic_pointer_cast<Modulation>(blocks["img_mod"]);
+                    img_mods     = img_mod->forward(ctx, vec);
+                }
            }
            ModulationOut img_mod1 = img_mods[0];
            ModulationOut img_mod2 = img_mods[1];
-            std::vector<ModulationOut> txt_mods;
-            if (prune_mod) {
-                txt_mods = get_distil_txt_mod(ctx, vec);
-            } else {
-                auto txt_mod = std::dynamic_pointer_cast<Modulation>(blocks["txt_mod"]);
-                txt_mods     = txt_mod->forward(ctx, vec);
+            if (txt_mods.empty()) {
+                if (prune_mod) {
+                    txt_mods = get_distil_txt_mod(ctx, vec);
+                } else {
+                    auto txt_mod = std::dynamic_pointer_cast<Modulation>(blocks["txt_mod"]);
+                    txt_mods     = txt_mod->forward(ctx, vec);
+                }
            }
            ModulationOut txt_mod1 = txt_mods[0];
            ModulationOut txt_mod2 = txt_mods[1];
@ -338,8 +349,12 @@ namespace Flux {
            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn->post_attention(ctx, img_attn_out), img_mod1.gate));

            auto img_mlp_out = img_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale));
-            img_mlp_out      = ggml_gelu_inplace(ctx->ggml_ctx, img_mlp_out);
-            img_mlp_out      = img_mlp_2->forward(ctx, img_mlp_out);
+            if (use_mlp_silu_act) {
+                img_mlp_out = ggml_ext_silu_act(ctx->ggml_ctx, img_mlp_out);
+            } else {
+                img_mlp_out = ggml_gelu_inplace(ctx->ggml_ctx, img_mlp_out);
+            }
+            img_mlp_out = img_mlp_2->forward(ctx, img_mlp_out);

            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_mod2.gate));

@ -347,10 +362,13 @@ namespace Flux {
            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn->post_attention(ctx, txt_attn_out), txt_mod1.gate));

            auto txt_mlp_out = txt_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale));
-            txt_mlp_out      = ggml_gelu_inplace(ctx->ggml_ctx, txt_mlp_out);
-            txt_mlp_out      = txt_mlp_2->forward(ctx, txt_mlp_out);
-
-            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate));
+            if (use_mlp_silu_act) {
+                txt_mlp_out = ggml_ext_silu_act(ctx->ggml_ctx, txt_mlp_out);
+            } else {
+                txt_mlp_out = ggml_gelu_inplace(ctx->ggml_ctx, txt_mlp_out);
+            }
+            txt_mlp_out = txt_mlp_2->forward(ctx, txt_mlp_out);
+            txt         = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate));

            return {img, txt};
        }
@ -363,28 +381,37 @@ namespace Flux {
        int64_t mlp_hidden_dim;
        bool prune_mod;
        int idx = 0;
+        bool use_mlp_silu_act;
+        int64_t mlp_mult_factor;

    public:
        SingleStreamBlock(int64_t hidden_size,
                          int64_t num_heads,
-                          float mlp_ratio = 4.0f,
-                          int idx         = 0,
-                          float qk_scale  = 0.f,
-                          bool prune_mod  = false)
-            : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod) {
+                          float mlp_ratio       = 4.0f,
+                          int idx               = 0,
+                          float qk_scale        = 0.f,
+                          bool prune_mod        = false,
+                          bool share_modulation = false,
+                          bool mlp_proj_bias    = true,
+                          bool use_mlp_silu_act = false)
+            : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) {
            int64_t head_dim = hidden_size / num_heads;
            float scale      = qk_scale;
            if (scale <= 0.f) {
                scale = 1 / sqrt((float)head_dim);
            }
-            mlp_hidden_dim = hidden_size * mlp_ratio;
+            mlp_hidden_dim  = hidden_size * mlp_ratio;
+            mlp_mult_factor = 1;
+            if (use_mlp_silu_act) {
+                mlp_mult_factor = 2;
+            }

-            blocks["linear1"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim));
-            blocks["linear2"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size));
+            blocks["linear1"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
+            blocks["linear2"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size, mlp_proj_bias));
            blocks["norm"]     = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
            blocks["pre_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
            // mlp_act is nn.GELU(approximate="tanh")
-            if (!prune_mod) {
+            if (!prune_mod && !share_modulation) {
                blocks["modulation"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, false));
            }
        }
@ -398,7 +425,8 @@ namespace Flux {
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* vec,
                                    struct ggml_tensor* pe,
-                                    struct ggml_tensor* mask = nullptr) {
+                                    struct ggml_tensor* mask        = nullptr,
+                                    std::vector<ModulationOut> mods = {}) {
            // x: [N, n_token, hidden_size]
            // pe: [n_token, d_head/2, 2, 2]
            // return: [N, n_token, hidden_size]
@ -407,14 +435,20 @@ namespace Flux {
            auto linear2  = std::dynamic_pointer_cast<Linear>(blocks["linear2"]);
            auto norm     = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);
            auto pre_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_norm"]);
-            ModulationOut mod;
-            if (prune_mod) {
-                mod = get_distil_mod(ctx, vec);
-            } else {
-                auto modulation = std::dynamic_pointer_cast<Modulation>(blocks["modulation"]);

-                mod = modulation->forward(ctx, vec)[0];
+            ModulationOut mod;
+            if (!mods.empty()) {
+                mod = mods[0];
+            } else {
+                if (prune_mod) {
+                    mod = get_distil_mod(ctx, vec);
+                } else {
+                    auto modulation = std::dynamic_pointer_cast<Modulation>(blocks["modulation"]);
+
+                    mod = modulation->forward(ctx, vec)[0];
+                }
            }
+
            auto x_mod   = Flux::modulate(ctx->ggml_ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
            auto qkv_mlp = linear1->forward(ctx, x_mod);                                                // [N, n_token, hidden_size * 3 + mlp_hidden_dim]
            qkv_mlp      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, qkv_mlp, 2, 0, 1, 3));  // [hidden_size * 3 + mlp_hidden_dim, N, n_token]
@ -432,11 +466,11 @@ namespace Flux {
                                    qkv_mlp,
                                    qkv_mlp->ne[0],
                                    qkv_mlp->ne[1],
-                                    mlp_hidden_dim,
+                                    mlp_hidden_dim * mlp_mult_factor,
                                    qkv_mlp->nb[1],
                                    qkv_mlp->nb[2],
-                                    qkv_mlp->nb[2] * hidden_size * 3);                          // [mlp_hidden_dim , N, n_token]
-            mlp      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, mlp, 1, 2, 0, 3));  // [N, n_token, mlp_hidden_dim]
+                                    qkv_mlp->nb[2] * hidden_size * 3);                          // [mlp_hidden_dim*mlp_mult_factor , N, n_token]
+            mlp      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, mlp, 1, 2, 0, 3));  // [N, n_token, mlp_hidden_dim*mlp_mult_factor]

            auto qkv_vec     = split_qkv(ctx->ggml_ctx, qkv);  // q,k,v: [N, n_token, hidden_size]
            int64_t head_dim = hidden_size / num_heads;
@ -447,8 +481,13 @@ namespace Flux {
            k                = norm->key_norm(ctx, k);
            auto attn        = Rope::attention(ctx, q, k, v, pe, mask);  // [N, n_token, hidden_size]

-            auto attn_mlp = ggml_concat(ctx->ggml_ctx, attn, ggml_gelu_inplace(ctx->ggml_ctx, mlp), 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
-            auto output   = linear2->forward(ctx, attn_mlp);                                             // [N, n_token, hidden_size]
+            if (use_mlp_silu_act) {
+                mlp = ggml_ext_silu_act(ctx->ggml_ctx, mlp);
+            } else {
+                mlp = ggml_gelu_inplace(ctx->ggml_ctx, mlp);
+            }
+            auto attn_mlp = ggml_concat(ctx->ggml_ctx, attn, mlp, 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
+            auto output   = linear2->forward(ctx, attn_mlp);           // [N, n_token, hidden_size]

            output = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, output, mod.gate));
            return output;
@ -462,12 +501,13 @@ namespace Flux {
        LastLayer(int64_t hidden_size,
                  int64_t patch_size,
                  int64_t out_channels,
-                  bool prune_mod = false)
+                  bool prune_mod = false,
+                  bool bias      = true)
            : prune_mod(prune_mod) {
            blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
-            blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
+            blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels, bias));
            if (!prune_mod) {
-                blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
+                blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size, bias));
            }
        }

@ -684,6 +724,10 @@ namespace Flux {
        bool qkv_bias               = true;
        bool guidance_embed         = true;
        int64_t in_dim              = 64;
+        bool disable_bias           = false;
+        bool share_modulation       = false;
+        bool use_mlp_silu_act       = false;
+        float ref_index_scale       = 1.f;
        ChromaRadianceParams chroma_radiance_params;
    };

@ -702,18 +746,20 @@ namespace Flux {
                                                                  kernel_size,
                                                                  stride);
            } else {
-                blocks["img_in"] = std::make_shared<Linear>(params.in_channels, params.hidden_size, true);
+                blocks["img_in"] = std::make_shared<Linear>(params.in_channels, params.hidden_size, !params.disable_bias);
            }
            if (params.is_chroma) {
                blocks["distilled_guidance_layer"] = std::make_shared<ChromaApproximator>(params.in_dim, params.hidden_size);
            } else {
-                blocks["time_in"]   = std::make_shared<MLPEmbedder>(256, params.hidden_size);
-                blocks["vector_in"] = std::make_shared<MLPEmbedder>(params.vec_in_dim, params.hidden_size);
+                blocks["time_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
+                if (params.vec_in_dim > 0) {
+                    blocks["vector_in"] = std::make_shared<MLPEmbedder>(params.vec_in_dim, params.hidden_size, !params.disable_bias);
+                }
                if (params.guidance_embed) {
-                    blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size);
+                    blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
                }
            }
-            blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, true);
+            blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, !params.disable_bias);

            for (int i = 0; i < params.depth; i++) {
                blocks["double_blocks." + std::to_string(i)] = std::make_shared<DoubleStreamBlock>(params.hidden_size,
@ -721,7 +767,10 @@ namespace Flux {
                                                                                                   params.mlp_ratio,
                                                                                                   i,
                                                                                                   params.qkv_bias,
-                                                                                                   params.is_chroma);
+                                                                                                   params.is_chroma,
+                                                                                                   params.share_modulation,
+                                                                                                   !params.disable_bias,
+                                                                                                   params.use_mlp_silu_act);
            }

            for (int i = 0; i < params.depth_single_blocks; i++) {
@ -730,7 +779,10 @@ namespace Flux {
                                                                                                   params.mlp_ratio,
                                                                                                   i,
                                                                                                   0.f,
-                                                                                                   params.is_chroma);
+                                                                                                   params.is_chroma,
+                                                                                                   params.share_modulation,
+                                                                                                   !params.disable_bias,
+                                                                                                   params.use_mlp_silu_act);
            }

            if (params.version == VERSION_CHROMA_RADIANCE) {
@ -748,7 +800,13 @@ namespace Flux {
                                                                                       params.in_channels);

            } else {
-                blocks["final_layer"] = std::make_shared<LastLayer>(params.hidden_size, 1, params.out_channels, params.is_chroma);
+                blocks["final_layer"] = std::make_shared<LastLayer>(params.hidden_size, 1, params.out_channels, params.is_chroma, !params.disable_bias);
+            }
+
+            if (params.share_modulation) {
+                blocks["double_stream_modulation_img"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
+                blocks["double_stream_modulation_txt"] = std::make_shared<Modulation>(params.hidden_size, true, !params.disable_bias);
+                blocks["single_stream_modulation"]     = std::make_shared<Modulation>(params.hidden_size, false, !params.disable_bias);
            }
        }

@ -861,9 +919,8 @@ namespace Flux {
                    txt_img_mask = ggml_pad(ctx->ggml_ctx, y, img->ne[1], 0, 0, 0);
                }
            } else {
-                auto time_in   = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
-                auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
-                vec            = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
+                auto time_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
+                vec          = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
                if (params.guidance_embed) {
                    GGML_ASSERT(guidance != nullptr);
                    auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
@ -872,7 +929,23 @@ namespace Flux {
                    vec       = ggml_add(ctx->ggml_ctx, vec, guidance_in->forward(ctx, g_in));
                }

-                vec = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y));
+                if (params.vec_in_dim > 0) {
+                    auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
+                    vec            = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y));
+                }
+            }
+
+            std::vector<ModulationOut> ds_img_mods;
+            std::vector<ModulationOut> ds_txt_mods;
+            std::vector<ModulationOut> ss_mods;
+            if (params.share_modulation) {
+                auto double_stream_modulation_img = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_img"]);
+                auto double_stream_modulation_txt = std::dynamic_pointer_cast<Modulation>(blocks["double_stream_modulation_txt"]);
+                auto single_stream_modulation     = std::dynamic_pointer_cast<Modulation>(blocks["single_stream_modulation"]);
+
+                ds_img_mods = double_stream_modulation_img->forward(ctx, vec);
+                ds_txt_mods = double_stream_modulation_txt->forward(ctx, vec);
+                ss_mods     = single_stream_modulation->forward(ctx, vec);
            }

            txt = txt_in->forward(ctx, txt);
@ -884,7 +957,7 @@ namespace Flux {

                auto block = std::dynamic_pointer_cast<DoubleStreamBlock>(blocks["double_blocks." + std::to_string(i)]);

-                auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask);
+                auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask, ds_img_mods, ds_txt_mods);
                img          = img_txt.first;   // [N, n_img_token, hidden_size]
                txt          = img_txt.second;  // [N, n_txt_token, hidden_size]
            }
@ -896,7 +969,7 @@ namespace Flux {
                }
                auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);

-                txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask);
+                txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask, ss_mods);
            }

            txt_img = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_img, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
@ -1133,6 +1206,22 @@ namespace Flux {
            } else if (version == VERSION_CHROMA_RADIANCE) {
                flux_params.in_channels = 3;
                flux_params.patch_size  = 16;
+            } else if (sd_version_is_flux2(version)) {
+                flux_params.context_in_dim   = 15360;
+                flux_params.in_channels      = 128;
+                flux_params.hidden_size      = 6144;
+                flux_params.num_heads        = 48;
+                flux_params.patch_size       = 1;
+                flux_params.out_channels     = 128;
+                flux_params.mlp_ratio        = 3.f;
+                flux_params.theta            = 2000;
+                flux_params.axes_dim         = {32, 32, 32, 32};
+                flux_params.vec_in_dim       = 0;
+                flux_params.qkv_bias         = false;
+                flux_params.disable_bias     = true;
+                flux_params.share_modulation = true;
+                flux_params.ref_index_scale  = 10.f;
+                flux_params.use_mlp_silu_act = true;
            }
            for (auto pair : tensor_storage_map) {
                std::string tensor_name = pair.first;
@ -1281,7 +1370,8 @@ namespace Flux {
                                            x->ne[3],
                                            context->ne[1],
                                            ref_latents,
-                                            increase_ref_index,
+                                       sd_version_is_flux2(version) ? true : increase_ref_index,
+                                            flux_params.ref_index_scale,
                                            flux_params.theta,
                                            flux_params.axes_dim);
            int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
@ -1360,9 +1450,9 @@ namespace Flux {
                // cpu f16:
                // cuda f16: nan
                // cuda q8_0: pass
-                // auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
+                auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 128, 1);
                // ggml_set_f32(x, 0.01f);
-                auto x = load_tensor_from_file(work_ctx, "chroma_x.bin");
+                // auto x = load_tensor_from_file(work_ctx, "chroma_x.bin");
                // print_ggml_tensor(x);

                std::vector<float> timesteps_vec(1, 1.f);
@ -1371,9 +1461,9 @@ namespace Flux {
                std::vector<float> guidance_vec(1, 0.f);
                auto guidance = vector_to_ggml_tensor(work_ctx, guidance_vec);

-                // auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 256, 1);
+                auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 15360, 256, 1);
                // ggml_set_f32(context, 0.01f);
-                auto context = load_tensor_from_file(work_ctx, "chroma_context.bin");
+                // auto context = load_tensor_from_file(work_ctx, "chroma_context.bin");
                // print_ggml_tensor(context);

                // auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, 1);
@ -1395,7 +1485,7 @@ namespace Flux {
        static void load_from_file_and_test(const std::string& file_path) {
            // ggml_backend_t backend = ggml_backend_cuda_init(0);
            ggml_backend_t backend    = ggml_backend_cpu_init();
-            ggml_type model_data_type = GGML_TYPE_Q8_0;
+            ggml_type model_data_type = GGML_TYPE_COUNT;

            ModelLoader model_loader;
            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
@ -1404,9 +1494,11 @@ namespace Flux {
            }

            auto& tensor_storage_map = model_loader.get_tensor_storage_map();
-            for (auto& [name, tensor_storage] : tensor_storage_map) {
-                if (ends_with(name, "weight")) {
-                    tensor_storage.expected_type = model_data_type;
+            if (model_data_type != GGML_TYPE_COUNT) {
+                for (auto& [name, tensor_storage] : tensor_storage_map) {
+                    if (ends_with(name, "weight")) {
+                        tensor_storage.expected_type = model_data_type;
+                    }
                }
            }

@ -1414,7 +1506,7 @@ namespace Flux {
                                                                            false,
                                                                            tensor_storage_map,
                                                                            "model.diffusion_model",
-                                                                            VERSION_CHROMA_RADIANCE,
+                                                                            VERSION_FLUX2,
                                                                            false);

            flux->alloc_params_buffer();
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@ -766,6 +766,21 @@ __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_ext_chunk(struct ggml_co
    return chunks;
 }

+__STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor* x) {
+    // x: [ne3, ne2, ne1, ne0]
+    // return: [ne3, ne2, ne1, ne0/2]
+
+    auto x_vec = ggml_ext_chunk(ctx, x, 2, 0);
+    auto x1    = x_vec[0];  // [ne3, ne2, ne1, ne0/2]
+    auto x2    = x_vec[1];  // [ne3, ne2, ne1, ne0/2]
+
+    x1 = ggml_gelu_inplace(ctx, x1);
+
+    x = ggml_mul(ctx, x1, x2);  // [ne3, ne2, ne1, ne0/2]
+
+    return x;
+}
+
 typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;

 __STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
@ -1132,6 +1147,14 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
 }

 __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) {
+#ifdef SD_USE_VULKAN
+    auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
+    auto out        = ggml_reshape_1d(ctx, a, ggml_nelements(a));
+    out             = ggml_get_rows(ctx, out, zero_index);
+    out             = ggml_reshape(ctx, out, a);
+    // auto out = ggml_cast(ctx, a, GGML_TYPE_F32);
+    return out;
+#else
    auto out         = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a));
    ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1);  // [1,]
    if (ggml_is_transposed(out)) {
@ -1139,7 +1162,8 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor*
    } else {
        out = ggml_mul_mat(ctx, out, one);
    }
-    out = ggml_reshape(ctx, out, a);
+    out                    = ggml_reshape(ctx, out, a);
+#endif
    return out;
 }

@ -1547,6 +1571,9 @@ protected:
    std::vector<float> one_vec = {1.f};
    ggml_tensor* one_tensor    = nullptr;

+    std::vector<int> zero_int_vec = {0};
+    ggml_tensor* zero_int_tensor  = nullptr;
+
    std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
    std::map<std::string, struct ggml_tensor*> cache_tensor_map;  // name -> tensor
    const std::string final_result_name = "ggml_runner_final_result_tensor";
@ -1617,10 +1644,15 @@ protected:
        one_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, 1);
        ggml_set_name(one_tensor, "ggml_runner_build_in_tensor:one");
        set_backend_tensor_data(one_tensor, one_vec.data());
+
+        zero_int_tensor = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
+        ggml_set_name(zero_int_tensor, "ggml_runner_build_in_tensor:zero_int");
+        set_backend_tensor_data(zero_int_tensor, zero_int_vec.data());
    }

    void prepare_build_in_tensor_after(struct ggml_cgraph* gf) {
        ggml_build_forward_expand(gf, one_tensor);
+        ggml_build_forward_expand(gf, zero_int_tensor);
    }

    struct ggml_cgraph* new_graph_custom(size_t graph_size) {
--- a/otherarch/sdcpp/llm.hpp
+++ b/otherarch/sdcpp/llm.hpp
--- a/otherarch/sdcpp/main.cpp
+++ b/otherarch/sdcpp/main.cpp
@ -70,8 +70,8 @@ struct SDParams {
    std::string clip_g_path;
    std::string clip_vision_path;
    std::string t5xxl_path;
-    std::string qwen2vl_path;
-    std::string qwen2vl_vision_path;
+    std::string llm_path;
+    std::string llm_vision_path;
    std::string diffusion_model_path;
    std::string high_noise_diffusion_model_path;
    std::string vae_path;
@ -151,6 +151,7 @@ struct SDParams {
    preview_t preview_method = PREVIEW_NONE;
    int preview_interval     = 1;
    std::string preview_path = "preview.png";
+    float preview_fps        = 16;
    bool taesd_preview       = false;
    bool preview_noisy       = false;

@ -174,8 +175,8 @@ void print_params(SDParams params) {
    printf("    clip_g_path:                       %s\n", params.clip_g_path.c_str());
    printf("    clip_vision_path:                  %s\n", params.clip_vision_path.c_str());
    printf("    t5xxl_path:                        %s\n", params.t5xxl_path.c_str());
-    printf("    qwen2vl_path:                      %s\n", params.qwen2vl_path.c_str());
-    printf("    qwen2vl_vision_path:               %s\n", params.qwen2vl_vision_path.c_str());
+    printf("    llm_path:                          %s\n", params.llm_path.c_str());
+    printf("    llm_vision_path:                   %s\n", params.llm_vision_path.c_str());
    printf("    diffusion_model_path:              %s\n", params.diffusion_model_path.c_str());
    printf("    high_noise_diffusion_model_path:   %s\n", params.high_noise_diffusion_model_path.c_str());
    printf("    vae_path:                          %s\n", params.vae_path.c_str());
@ -532,14 +533,22 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         "--t5xxl",
         "path to the t5xxl text encoder",
         &params.t5xxl_path},
+        {"",
+         "--llm",
+         "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
+         &params.llm_path},
+        {"",
+         "--llm_vision",
+         "path to the llm vit",
+         &params.llm_vision_path},
        {"",
         "--qwen2vl",
-         "path to the qwen2vl text encoder",
-         &params.qwen2vl_path},
+         "alias of --llm. Deprecated.",
+         &params.llm_path},
        {"",
         "--qwen2vl_vision",
-         "path to the qwen2vl vit",
-         &params.qwen2vl_vision_path},
+         "alias of --llm_vision. Deprecated.",
+         &params.llm_vision_path},
        {"",
         "--diffusion-model",
         "path to the standalone diffusion model",
@ -1185,7 +1194,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         on_sample_method_arg},
        {"",
         "--prediction",
-         "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]",
+         "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]",
         on_prediction_arg},
        {"",
         "--lora-apply-mode",
@ -1230,7 +1239,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         on_relative_tile_size_arg},
        {"",
         "--preview",
-         std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+         std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")",
         on_preview_arg},
        {"",
         "--easycache",
@ -1428,7 +1437,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
        parameter_string += " " + std::string(sd_scheduler_name(params.sample_params.scheduler));
    }
    parameter_string += ", ";
-    for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.qwen2vl_path, params.qwen2vl_vision_path}) {
+    for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path, params.llm_path, params.llm_vision_path}) {
        if (!te.empty()) {
            parameter_string += "TE: " + sd_basename(te) + ", ";
        }
@ -1630,25 +1639,22 @@ bool load_images_from_dir(const std::string dir,
    return true;
 }

-std::string preview_path;
-float preview_fps;
-
-void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy) {
+void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy, void* data) {
    (void)step;
    (void)is_noisy;
+    SDParams* params = (SDParams*)data;
    // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents
    // unused in this app, it will either be always noisy or always denoised here
    if (frame_count == 1) {
-        stbi_write_png(preview_path.c_str(), image->width, image->height, image->channel, image->data, 0);
+        stbi_write_png(params->preview_path.c_str(), image->width, image->height, image->channel, image->data, 0);
    } else {
-        create_mjpg_avi_from_sd_images(preview_path.c_str(), image, frame_count, preview_fps);
+        create_mjpg_avi_from_sd_images(params->preview_path.c_str(), image, frame_count, params->preview_fps);
    }
 }

 int main(int argc, const char* argv[]) {
    SDParams params;
    parse_args(argc, argv, params);
-    preview_path = params.preview_path;
    if (params.video_frames > 4) {
        size_t last_dot_pos   = params.preview_path.find_last_of(".");
        std::string base_path = params.preview_path;
@ -1659,12 +1665,12 @@ int main(int argc, const char* argv[]) {
            std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower);
        }
        if (file_ext == ".png") {
-            preview_path = base_path + ".avi";
+            params.preview_path = base_path + ".avi";
        }
    }
-    preview_fps = params.fps;
+    params.preview_fps = params.fps;
    if (params.preview_method == PREVIEW_PROJ)
-        preview_fps /= 4.0f;
+        params.preview_fps /= 4.0f;

    params.sample_params.guidance.slg.layers                 = params.skip_layers.data();
    params.sample_params.guidance.slg.layer_count            = params.skip_layers.size();
@ -1672,7 +1678,7 @@ int main(int argc, const char* argv[]) {
    params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size();

    sd_set_log_callback(sd_log_cb, (void*)&params);
-    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy);
+    sd_set_preview_callback(step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy, (void*)&params);

    if (params.verbose) {
        print_params(params);
@ -1845,8 +1851,8 @@ int main(int argc, const char* argv[]) {
        params.clip_g_path.c_str(),
        params.clip_vision_path.c_str(),
        params.t5xxl_path.c_str(),
-        params.qwen2vl_path.c_str(),
-        params.qwen2vl_vision_path.c_str(),
+        params.llm_path.c_str(),
+        params.llm_vision_path.c_str(),
        params.diffusion_model_path.c_str(),
        params.high_noise_diffusion_model_path.c_str(),
        params.vae_path.c_str(),
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
@ -19,6 +19,7 @@
 #include "util.h"
 #ifndef KCPP_NO_BAKE_SD_VOCAB
 #include "vocab.hpp"
+#include "vocab_mistral.hpp"
 #include "vocab_qwen.hpp"
 #include "vocab_umt5.hpp"
 #endif
@ -120,8 +121,9 @@ const char* unused_tensors[] = {
    "denoiser.sigmas",
    "edm_vpred.sigma_max",
    "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
-    "text_encoders.qwen2vl.output.weight",
-    "text_encoders.qwen2vl.lm_head.",
+    "text_encoders.llm.output.weight",
+    "text_encoders.llm.lm_head.",
+    "first_stage_model.bn.",
 };

 bool is_unused_tensor(std::string name) {
@ -1120,6 +1122,9 @@ SDVersion ModelLoader::get_sd_version() {
            if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
                return VERSION_QWEN_IMAGE;
            }
+            if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
+                return VERSION_FLUX2;
+            }
            if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) {
                is_wan = true;
            }
@ -1386,6 +1391,24 @@ std::string ModelLoader::load_qwen2_merges() {
 #endif
 }

+std::string ModelLoader::load_mistral_merges() {
+#ifndef KCPP_NO_BAKE_SD_VOCAB
+    std::string merges_utf8_str(reinterpret_cast<const char*>(mistral_merges_utf8_c_str), sizeof(mistral_merges_utf8_c_str));
+    return merges_utf8_str;
+#else
+    return sd_load_mistral_merges();
+#endif
+}
+
+std::string ModelLoader::load_mistral_vocab_json() {
+#ifndef KCPP_NO_BAKE_SD_VOCAB
+    std::string json_str(reinterpret_cast<const char*>(mistral_vocab_json_utf8_c_str), sizeof(mistral_vocab_json_utf8_c_str));
+    return json_str;
+#else
+    return sd_load_mistral_vocab_json();
+#endif
+}
+
 std::string ModelLoader::load_t5_tokenizer_json() {
 #ifndef KCPP_NO_BAKE_SD_VOCAB
    std::string json_str(reinterpret_cast<const char*>(t5_tokenizer_json_str), sizeof(t5_tokenizer_json_str));
--- a/otherarch/sdcpp/model.h
+++ b/otherarch/sdcpp/model.h
@ -43,6 +43,7 @@ enum SDVersion {
    VERSION_WAN2_2_I2V,
    VERSION_WAN2_2_TI2V,
    VERSION_QWEN_IMAGE,
+    VERSION_FLUX2,
    VERSION_COUNT,
 };

@ -94,6 +95,13 @@ static inline bool sd_version_is_flux(SDVersion version) {
    return false;
 }

+static inline bool sd_version_is_flux2(SDVersion version) {
+    if (version == VERSION_FLUX2) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_wan(SDVersion version) {
    if (version == VERSION_WAN2 || version == VERSION_WAN2_2_I2V || version == VERSION_WAN2_2_TI2V) {
        return true;
@ -121,6 +129,7 @@ static inline bool sd_version_is_inpaint(SDVersion version) {

 static inline bool sd_version_is_dit(SDVersion version) {
    if (sd_version_is_flux(version) ||
+        sd_version_is_flux2(version) ||
        sd_version_is_sd3(version) ||
        sd_version_is_wan(version) ||
        sd_version_is_qwen_image(version)) {
@ -314,6 +323,8 @@ public:

    static std::string load_merges();
    static std::string load_qwen2_merges();
+    static std::string load_mistral_merges();
+    static std::string load_mistral_vocab_json();
    static std::string load_t5_tokenizer_json();
    static std::string load_umt5_tokenizer_json();
 };
--- a/otherarch/sdcpp/name_conversion.cpp
+++ b/otherarch/sdcpp/name_conversion.cpp
@ -127,7 +127,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
        {"token_embd.", "shared."},
    };

-    static const std::vector<std::pair<std::string, std::string>> qwenvl_name_map{
+    static const std::vector<std::pair<std::string, std::string>> llm_name_map{
        {"token_embd.", "model.embed_tokens."},
        {"blk.", "model.layers."},
        {"attn_q.", "self_attn.q_proj."},
@ -142,7 +142,7 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
        {"output_norm.", "model.norm."},
    };

-    static const std::vector<std::pair<std::string, std::string>> qwenvl_vision_name_map{
+    static const std::vector<std::pair<std::string, std::string>> llm_vision_name_map{
        {"mm.", "merger.mlp."},
        {"v.post_ln.", "merger.ln_q."},
        {"v.patch_embd.weight", "patch_embed.proj.0.weight"},
@ -161,11 +161,11 @@ std::string convert_cond_stage_model_name(std::string name, std::string prefix)
    };
    if (contains(name, "t5xxl")) {
        replace_with_name_map(name, t5_name_map);
-    } else if (contains(name, "qwen2vl")) {
-        if (contains(name, "qwen2vl.visual")) {
-            replace_with_name_map(name, qwenvl_vision_name_map);
+    } else if (contains(name, "llm")) {
+        if (contains(name, "llm.visual")) {
+            replace_with_name_map(name, llm_vision_name_map);
        } else {
-            replace_with_name_map(name, qwenvl_name_map);
+            replace_with_name_map(name, llm_name_map);
        }
    } else {
        name = convert_open_clip_to_hf_clip_name(name);
@ -620,7 +620,7 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
        name = convert_diffusers_unet_to_original_sdxl(name);
    } else if (sd_version_is_sd3(version)) {
        name = convert_diffusers_dit_to_original_sd3(name);
-    } else if (sd_version_is_flux(version)) {
+    } else if (sd_version_is_flux(version) || sd_version_is_flux2(version)) {
        name = convert_diffusers_dit_to_original_flux(name);
    }
    return name;
@ -722,6 +722,11 @@ std::string convert_diffusers_vae_to_original_sd1(std::string name) {
 }

 std::string convert_first_stage_model_name(std::string name, std::string prefix) {
+    static std::unordered_map<std::string, std::string> vae_name_map = {
+        {"decoder.post_quant_conv.", "post_quant_conv."},
+        {"encoder.quant_conv.", "quant_conv."},
+    };
+    replace_with_prefix_map(name, vae_name_map);
    name = convert_diffusers_vae_to_original_sd1(name);
    return name;
 }
--- a/otherarch/sdcpp/rope.hpp
+++ b/otherarch/sdcpp/rope.hpp
@ -72,15 +72,28 @@ namespace Rope {
    }

    // Generate IDs for image patches and text
-    __STATIC_INLINE__ std::vector<std::vector<float>> gen_txt_ids(int bs, int context_len) {
-        return std::vector<std::vector<float>>(bs * context_len, std::vector<float>(3, 0.0));
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num) {
+        auto txt_ids = std::vector<std::vector<float>>(bs * context_len, std::vector<float>(axes_dim_num, 0.0f));
+        if (axes_dim_num == 4) {
+            for (int i = 0; i < bs * context_len; i++) {
+                txt_ids[i][3] = (i % context_len);
+            }
+        }
+        return txt_ids;
    }

-    __STATIC_INLINE__ std::vector<std::vector<float>> gen_img_ids(int h, int w, int patch_size, int bs, int index = 0, int h_offset = 0, int w_offset = 0) {
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_img_ids(int h,
+                                                                       int w,
+                                                                       int patch_size,
+                                                                       int bs,
+                                                                       int axes_dim_num,
+                                                                       int index    = 0,
+                                                                       int h_offset = 0,
+                                                                       int w_offset = 0) {
        int h_len = (h + (patch_size / 2)) / patch_size;
        int w_len = (w + (patch_size / 2)) / patch_size;

-        std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, 0.0));
+        std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(axes_dim_num, 0.0));

        std::vector<float> row_ids = linspace<float>(h_offset, h_len - 1 + h_offset, h_len);
        std::vector<float> col_ids = linspace<float>(w_offset, w_len - 1 + w_offset, w_len);
@ -153,8 +166,10 @@ namespace Rope {

    __STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
                                                                   int bs,
+                                                                   int axes_dim_num,
                                                                   const std::vector<ggml_tensor*>& ref_latents,
-                                                                   bool increase_ref_index) {
+                                                                   bool increase_ref_index,
+                                                                   float ref_index_scale) {
        std::vector<std::vector<float>> ids;
        uint64_t curr_h_offset = 0;
        uint64_t curr_w_offset = 0;
@ -170,7 +185,14 @@ namespace Rope {
                }
            }

-            auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, index, h_offset, w_offset);
+            auto ref_ids = gen_flux_img_ids(ref->ne[1],
+                                            ref->ne[0],
+                                            patch_size,
+                                            bs,
+                                            axes_dim_num,
+                                            static_cast<int>(index * ref_index_scale),
+                                            h_offset,
+                                            w_offset);
            ids          = concat_ids(ids, ref_ids, bs);

            if (increase_ref_index) {
@ -187,15 +209,17 @@ namespace Rope {
                                                                   int w,
                                                                   int patch_size,
                                                                   int bs,
+                                                                   int axes_dim_num,
                                                                   int context_len,
                                                                   const std::vector<ggml_tensor*>& ref_latents,
-                                                                   bool increase_ref_index) {
-        auto txt_ids = gen_txt_ids(bs, context_len);
-        auto img_ids = gen_img_ids(h, w, patch_size, bs);
+                                                                   bool increase_ref_index,
+                                                                   float ref_index_scale) {
+        auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num);
+        auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);

        auto ids = concat_ids(txt_ids, img_ids, bs);
        if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, ref_latents, increase_ref_index);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, ref_index_scale);
            ids           = concat_ids(ids, refs_ids, bs);
        }
        return ids;
@ -209,9 +233,18 @@ namespace Rope {
                                                     int context_len,
                                                     const std::vector<ggml_tensor*>& ref_latents,
                                                     bool increase_ref_index,
+                                                     float ref_index_scale,
                                                     int theta,
                                                     const std::vector<int>& axes_dim) {
-        std::vector<std::vector<float>> ids = gen_flux_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
+        std::vector<std::vector<float>> ids = gen_flux_ids(h,
+                                                           w,
+                                                           patch_size,
+                                                           bs,
+                                                           static_cast<int>(axes_dim.size()),
+                                                           context_len,
+                                                           ref_latents,
+                                                           increase_ref_index,
+                                                           ref_index_scale);
        return embed_nd(ids, bs, theta, axes_dim);
    }

@ -232,10 +265,11 @@ namespace Rope {
                txt_ids_repeated[i * txt_ids.size() + j] = {txt_ids[j], txt_ids[j], txt_ids[j]};
            }
        }
-        auto img_ids = gen_img_ids(h, w, patch_size, bs);
-        auto ids     = concat_ids(txt_ids_repeated, img_ids, bs);
+        int axes_dim_num = 3;
+        auto img_ids     = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
+        auto ids         = concat_ids(txt_ids_repeated, img_ids, bs);
        if (ref_latents.size() > 0) {
-            auto refs_ids = gen_refs_ids(patch_size, bs, ref_latents, increase_ref_index);
+            auto refs_ids = gen_refs_ids(patch_size, bs, axes_dim_num, ref_latents, increase_ref_index, 1.f);
            ids           = concat_ids(ids, refs_ids, bs);
        }
        return ids;
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@ -21,6 +21,8 @@ std::string sd_load_merges();
 std::string sd_load_t5();
 std::string sd_load_umt5();
 std::string sd_load_qwen2_merges();
+std::string sd_load_mistral_merges();
+std::string sd_load_mistral_vocab_json();

 #include "flux.hpp"
 #include "stable-diffusion.cpp"
@ -159,6 +161,26 @@ std::string sd_load_qwen2_merges()
    qwenmergesstr = read_str_from_disk(filepath);
    return qwenmergesstr;
 }
+std::string sd_load_mistral_merges()
+{
+    static std::string mistralmergesstr;  // cached string
+    if (!mistralmergesstr.empty()) {
+        return mistralmergesstr;  // already loaded
+    }
+    std::string filepath = executable_path + "embd_res/mistral2_merges_utf8_c_str.embd";
+    mistralmergesstr = read_str_from_disk(filepath);
+    return mistralmergesstr;
+}
+std::string sd_load_mistral_vocab_json()
+{
+    static std::string mistralvocabstr;  // cached string
+    if (!mistralvocabstr.empty()) {
+        return mistralvocabstr;  // already loaded
+    }
+    std::string filepath = executable_path + "embd_res/mistral2_vocab_utf8_c_str.embd";
+    mistralvocabstr = read_str_from_disk(filepath);
+    return mistralvocabstr;
+}
 std::string sd_load_t5()
 {
    static std::string t5str = "";
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@ -46,6 +46,7 @@ const char* model_version_to_str[] = {
    "Wan 2.2 I2V",
    "Wan 2.2 TI2V",
    "Qwen Image",
+    "Flux.2",
 };

 const char* sampling_methods_str[] = {
@ -341,17 +342,17 @@ public:
            }
        }

-        if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_path)) > 0) {
-            LOG_INFO("loading qwen2vl from '%s'", sd_ctx_params->qwen2vl_path);
-            if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_path, "text_encoders.qwen2vl.")) {
-                LOG_WARN("loading qwen2vl from '%s' failed", sd_ctx_params->qwen2vl_path);
+        if (strlen(SAFE_STR(sd_ctx_params->llm_path)) > 0) {
+            LOG_INFO("loading llm from '%s'", sd_ctx_params->llm_path);
+            if (!model_loader.init_from_file(sd_ctx_params->llm_path, "text_encoders.llm.")) {
+                LOG_WARN("loading llm from '%s' failed", sd_ctx_params->llm_path);
            }
        }

-        if (strlen(SAFE_STR(sd_ctx_params->qwen2vl_vision_path)) > 0) {
-            LOG_INFO("loading qwen2vl vision from '%s'", sd_ctx_params->qwen2vl_vision_path);
-            if (!model_loader.init_from_file(sd_ctx_params->qwen2vl_vision_path, "text_encoders.qwen2vl.visual.")) {
-                LOG_WARN("loading qwen2vl vision from '%s' failed", sd_ctx_params->qwen2vl_vision_path);
+        if (strlen(SAFE_STR(sd_ctx_params->llm_vision_path)) > 0) {
+            LOG_INFO("loading llm vision from '%s'", sd_ctx_params->llm_vision_path);
+            if (!model_loader.init_from_file(sd_ctx_params->llm_vision_path, "text_encoders.llm.visual.")) {
+                LOG_WARN("loading llm vision from '%s' failed", sd_ctx_params->llm_vision_path);
            }
        }

@ -390,7 +391,7 @@ public:

        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
        for (auto& [name, tensor_storage] : tensor_storage_map) {
-            if (contains(name, "qwen2vl") &&
+            if (contains(name, "llm") &&
                ends_with(name, "weight") &&
                (tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
                tensor_storage.expected_type = GGML_TYPE_F16;
@ -500,8 +501,11 @@ public:
        } else if (sd_version_is_flux(version)) {
            scale_factor = 0.3611f;
            shift_factor = 0.1159f;
-        } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
+        } else if (sd_version_is_wan(version) ||
+                   sd_version_is_qwen_image(version) ||
+                   sd_version_is_flux2(version)) {
            scale_factor = 1.0f;
+            shift_factor = 0.f;
        }

        if (sd_version_is_control(version)) {
@ -557,6 +561,17 @@ public:
                                                              tensor_storage_map,
                                                              version,
                                                              sd_ctx_params->chroma_use_dit_mask);
+            } else if (sd_version_is_flux2(version)) {
+                bool is_chroma   = false;
+                cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
+                                                                 offload_params_to_cpu,
+                                                                 tensor_storage_map,
+                                                                 version);
+                diffusion_model  = std::make_shared<FluxModel>(backend,
+                                                              offload_params_to_cpu,
+                                                              tensor_storage_map,
+                                                              version,
+                                                              sd_ctx_params->chroma_use_dit_mask);
            } else if (sd_version_is_wan(version)) {
                cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                    offload_params_to_cpu,
@ -590,11 +605,12 @@ public:
                if (!vae_decode_only) {
                    enable_vision = true;
                }
-                cond_stage_model = std::make_shared<Qwen2_5_VLCLIPEmbedder>(clip_backend,
-                                                                            offload_params_to_cpu,
-                                                                            tensor_storage_map,
-                                                                            "",
-                                                                            enable_vision);
+                cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
+                                                                 offload_params_to_cpu,
+                                                                 tensor_storage_map,
+                                                                 version,
+                                                                 "",
+                                                                 enable_vision);
                diffusion_model  = std::make_shared<QwenImageModel>(backend,
                                                                   offload_params_to_cpu,
                                                                   tensor_storage_map,
@ -793,7 +809,7 @@ public:
            ignore_tensors.insert("first_stage_model.encoder");
            ignore_tensors.insert("first_stage_model.conv1");
            ignore_tensors.insert("first_stage_model.quant");
-            ignore_tensors.insert("text_encoders.qwen2vl.visual.");
+            ignore_tensors.insert("text_encoders.llm.visual.");
        }
        if (version == VERSION_SVD) {
            ignore_tensors.insert("conditioner.embedders.3");
@ -911,6 +927,11 @@ public:
                    denoiser = std::make_shared<FluxFlowDenoiser>(shift);
                    break;
                }
+                case FLUX2_FLOW_PRED: {
+                    LOG_INFO("running in Flux2 FLOW mode");
+                    denoiser = std::make_shared<Flux2FlowDenoiser>();
+                    break;
+                }
                default: {
                    LOG_ERROR("Unknown parametrization %i", sd_ctx_params->prediction);
                    return false;
@ -955,6 +976,9 @@ public:
                    }
                }
                denoiser = std::make_shared<FluxFlowDenoiser>(shift);
+            } else if (sd_version_is_flux2(version)) {
+                LOG_INFO("running in Flux2 FLOW mode");
+                denoiser = std::make_shared<Flux2FlowDenoiser>();
            } else if (sd_version_is_wan(version)) {
                LOG_INFO("running in FLOW mode");
                float shift = sd_ctx_params->flow_shift;
@ -1131,6 +1155,9 @@ public:
            int64_t t0 = ggml_time_ms();

            auto lora = load_lora_model_from_file(kv.first, kv.second, backend);
+            if (!lora || lora->lora_tensors.empty()) {
+                continue;
+            }
            lora->apply(tensors, version, n_threads);
            lora->free_params_buffer();

@ -1454,7 +1481,8 @@ public:
                       enum SDVersion version,
                       preview_t preview_mode,
                       ggml_tensor* result,
-                       std::function<void(int, int, sd_image_t*, bool)> step_callback,
+                       std::function<void(int, int, sd_image_t*, bool, void*)> step_callback,
+                       void* step_callback_data,
                       bool is_noisy) {
        const uint32_t channel = 3;
        uint32_t width         = latents->ne[0];
@ -1525,7 +1553,7 @@ public:
            for (int i = 0; i < frames; i++) {
                images[i] = {width, height, channel, data + i * width * height * channel};
            }
-            step_callback(step, frames, images, is_noisy);
+            step_callback(step, frames, images, is_noisy, step_callback_data);
            free(data);
            free(images);
        } else {
@ -1579,7 +1607,7 @@ public:
                images[i].data    = ggml_tensor_to_sd_image(result, i, ggml_n_dims(latents) == 4);
            }

-            step_callback(step, frames, images, is_noisy);
+            step_callback(step, frames, images, is_noisy, step_callback_data);

            ggml_ext_tensor_scale_inplace(result, 0);
            for (int i = 0; i < frames; i++) {
@ -1728,8 +1756,9 @@ public:
        }

        auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
-            auto sd_preview_cb   = sd_get_preview_callback();
-            auto sd_preview_mode = sd_get_preview_mode();
+            auto sd_preview_cb      = sd_get_preview_callback();
+            auto sd_preview_cb_data = sd_get_preview_callback_data();
+            auto sd_preview_mode    = sd_get_preview_mode();
            if (step == 1 || step == -1) {
                pretty_progress(0, (int)steps, 0);
            }
@ -1798,7 +1827,7 @@ public:
            }
            if (sd_preview_cb != nullptr && sd_should_preview_noisy()) {
                if (step % sd_get_preview_interval() == 0) {
-                    preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, true);
+                    preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, true);
                }
            }

@ -1946,7 +1975,7 @@ public:

            if (sd_preview_cb != nullptr && sd_should_preview_denoised()) {
                if (step % sd_get_preview_interval() == 0) {
-                    preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, false);
+                    preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, sd_preview_cb_data, false);
                }
            }

@ -1997,6 +2026,8 @@ public:
        int vae_scale_factor = 8;
        if (version == VERSION_WAN2_2_TI2V) {
            vae_scale_factor = 16;
+        } else if (sd_version_is_flux2(version)) {
+            vae_scale_factor = 16;
        } else if (version == VERSION_CHROMA_RADIANCE) {
            vae_scale_factor = 1;
        }
@ -2010,6 +2041,8 @@ public:
                latent_channel = 48;
            } else if (version == VERSION_CHROMA_RADIANCE) {
                latent_channel = 3;
+            } else if (sd_version_is_flux2(version)) {
+                latent_channel = 128;
            } else {
                latent_channel = 16;
            }
@ -2017,6 +2050,11 @@ public:
        return latent_channel;
    }

+    int get_image_seq_len(int h, int w) {
+        int vae_scale_factor = get_vae_scale_factor();
+        return (h / vae_scale_factor) * (w / vae_scale_factor);
+    }
+
    ggml_tensor* generate_init_latent(ggml_context* work_ctx,
                                      int width,
                                      int height,
@ -2040,32 +2078,84 @@ public:
        return init_latent;
    }

+    void get_latents_mean_std_vec(ggml_tensor* latent, int channel_dim, std::vector<float>& latents_mean_vec, std::vector<float>& latents_std_vec) {
+        GGML_ASSERT(latent->ne[channel_dim] == 16 || latent->ne[channel_dim] == 48 || latent->ne[channel_dim] == 128);
+        if (latent->ne[channel_dim] == 16) {
+            latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f,
+                                0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f};
+            latents_std_vec  = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f,
+                                3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f};
+        } else if (latent->ne[channel_dim] == 48) {
+            latents_mean_vec = {-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f,
+                                -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f,
+                                -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f,
+                                -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f,
+                                -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
+                                0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
+            latents_std_vec  = {
+                 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
+                 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
+                 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
+                 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
+                 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
+                 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
+        } else if (latent->ne[channel_dim] == 128) {
+            // flux2
+            latents_mean_vec = {-0.0676f, -0.0715f, -0.0753f, -0.0745f, 0.0223f, 0.0180f, 0.0142f, 0.0184f,
+                                -0.0001f, -0.0063f, -0.0002f, -0.0031f, -0.0272f, -0.0281f, -0.0276f, -0.0290f,
+                                -0.0769f, -0.0672f, -0.0902f, -0.0892f, 0.0168f, 0.0152f, 0.0079f, 0.0086f,
+                                0.0083f, 0.0015f, 0.0003f, -0.0043f, -0.0439f, -0.0419f, -0.0438f, -0.0431f,
+                                -0.0102f, -0.0132f, -0.0066f, -0.0048f, -0.0311f, -0.0306f, -0.0279f, -0.0180f,
+                                0.0030f, 0.0015f, 0.0126f, 0.0145f, 0.0347f, 0.0338f, 0.0337f, 0.0283f,
+                                0.0020f, 0.0047f, 0.0047f, 0.0050f, 0.0123f, 0.0081f, 0.0081f, 0.0146f,
+                                0.0681f, 0.0679f, 0.0767f, 0.0732f, -0.0462f, -0.0474f, -0.0392f, -0.0511f,
+                                -0.0528f, -0.0477f, -0.0470f, -0.0517f, -0.0317f, -0.0316f, -0.0345f, -0.0283f,
+                                0.0510f, 0.0445f, 0.0578f, 0.0458f, -0.0412f, -0.0458f, -0.0487f, -0.0467f,
+                                -0.0088f, -0.0106f, -0.0088f, -0.0046f, -0.0376f, -0.0432f, -0.0436f, -0.0499f,
+                                0.0118f, 0.0166f, 0.0203f, 0.0279f, 0.0113f, 0.0129f, 0.0016f, 0.0072f,
+                                -0.0118f, -0.0018f, -0.0141f, -0.0054f, -0.0091f, -0.0138f, -0.0145f, -0.0187f,
+                                0.0323f, 0.0305f, 0.0259f, 0.0300f, 0.0540f, 0.0614f, 0.0495f, 0.0590f,
+                                -0.0511f, -0.0603f, -0.0478f, -0.0524f, -0.0227f, -0.0274f, -0.0154f, -0.0255f,
+                                -0.0572f, -0.0565f, -0.0518f, -0.0496f, 0.0116f, 0.0054f, 0.0163f, 0.0104f};
+            latents_std_vec  = {
+                 1.8029f, 1.7786f, 1.7868f, 1.7837f, 1.7717f, 1.7590f, 1.7610f, 1.7479f,
+                 1.7336f, 1.7373f, 1.7340f, 1.7343f, 1.8626f, 1.8527f, 1.8629f, 1.8589f,
+                 1.7593f, 1.7526f, 1.7556f, 1.7583f, 1.7363f, 1.7400f, 1.7355f, 1.7394f,
+                 1.7342f, 1.7246f, 1.7392f, 1.7304f, 1.7551f, 1.7513f, 1.7559f, 1.7488f,
+                 1.8449f, 1.8454f, 1.8550f, 1.8535f, 1.8240f, 1.7813f, 1.7854f, 1.7945f,
+                 1.8047f, 1.7876f, 1.7695f, 1.7676f, 1.7782f, 1.7667f, 1.7925f, 1.7848f,
+                 1.7579f, 1.7407f, 1.7483f, 1.7368f, 1.7961f, 1.7998f, 1.7920f, 1.7925f,
+                 1.7780f, 1.7747f, 1.7727f, 1.7749f, 1.7526f, 1.7447f, 1.7657f, 1.7495f,
+                 1.7775f, 1.7720f, 1.7813f, 1.7813f, 1.8162f, 1.8013f, 1.8023f, 1.8033f,
+                 1.7527f, 1.7331f, 1.7563f, 1.7482f, 1.7610f, 1.7507f, 1.7681f, 1.7613f,
+                 1.7665f, 1.7545f, 1.7828f, 1.7726f, 1.7896f, 1.7999f, 1.7864f, 1.7760f,
+                 1.7613f, 1.7625f, 1.7560f, 1.7577f, 1.7783f, 1.7671f, 1.7810f, 1.7799f,
+                 1.7201f, 1.7068f, 1.7265f, 1.7091f, 1.7793f, 1.7578f, 1.7502f, 1.7455f,
+                 1.7587f, 1.7500f, 1.7525f, 1.7362f, 1.7616f, 1.7572f, 1.7444f, 1.7430f,
+                 1.7509f, 1.7610f, 1.7634f, 1.7612f, 1.7254f, 1.7135f, 1.7321f, 1.7226f,
+                 1.7664f, 1.7624f, 1.7718f, 1.7664f, 1.7457f, 1.7441f, 1.7569f, 1.7530f};
+        }
+    }
+
    void process_latent_in(ggml_tensor* latent) {
-        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
-            GGML_ASSERT(latent->ne[3] == 16 || latent->ne[3] == 48);
-            std::vector<float> latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f,
-                                                   0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f};
-            std::vector<float> latents_std_vec  = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f,
-                                                   3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f};
-            if (latent->ne[3] == 48) {
-                latents_mean_vec = {-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f,
-                                    -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f,
-                                    -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f,
-                                    -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f,
-                                    -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
-                                    0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
-                latents_std_vec  = {
-                     0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                     0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                     0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                     0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                     0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                     0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
-            }
+        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) {
+            int channel_dim = sd_version_is_flux2(version) ? 2 : 3;
+            std::vector<float> latents_mean_vec;
+            std::vector<float> latents_std_vec;
+            get_latents_mean_std_vec(latent, channel_dim, latents_mean_vec, latents_std_vec);
+
+            float mean;
+            float std_;
            for (int i = 0; i < latent->ne[3]; i++) {
-                float mean = latents_mean_vec[i];
-                float std_ = latents_std_vec[i];
+                if (channel_dim == 3) {
+                    mean = latents_mean_vec[i];
+                    std_ = latents_std_vec[i];
+                }
                for (int j = 0; j < latent->ne[2]; j++) {
+                    if (channel_dim == 2) {
+                        mean = latents_mean_vec[i];
+                        std_ = latents_std_vec[i];
+                    }
                    for (int k = 0; k < latent->ne[1]; k++) {
                        for (int l = 0; l < latent->ne[0]; l++) {
                            float value = ggml_ext_tensor_get_f32(latent, l, k, j, i);
@ -2087,31 +2177,24 @@ public:
    }

    void process_latent_out(ggml_tensor* latent) {
-        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
-            GGML_ASSERT(latent->ne[3] == 16 || latent->ne[3] == 48);
-            std::vector<float> latents_mean_vec = {-0.7571f, -0.7089f, -0.9113f, 0.1075f, -0.1745f, 0.9653f, -0.1517f, 1.5508f,
-                                                   0.4134f, -0.0715f, 0.5517f, -0.3632f, -0.1922f, -0.9497f, 0.2503f, -0.2921f};
-            std::vector<float> latents_std_vec  = {2.8184f, 1.4541f, 2.3275f, 2.6558f, 1.2196f, 1.7708f, 2.6052f, 2.0743f,
-                                                   3.2687f, 2.1526f, 2.8652f, 1.5579f, 1.6382f, 1.1253f, 2.8251f, 1.9160f};
-            if (latent->ne[3] == 48) {
-                latents_mean_vec = {-0.2289f, -0.0052f, -0.1323f, -0.2339f, -0.2799f, 0.0174f, 0.1838f, 0.1557f,
-                                    -0.1382f, 0.0542f, 0.2813f, 0.0891f, 0.1570f, -0.0098f, 0.0375f, -0.1825f,
-                                    -0.2246f, -0.1207f, -0.0698f, 0.5109f, 0.2665f, -0.2108f, -0.2158f, 0.2502f,
-                                    -0.2055f, -0.0322f, 0.1109f, 0.1567f, -0.0729f, 0.0899f, -0.2799f, -0.1230f,
-                                    -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
-                                    0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
-                latents_std_vec  = {
-                     0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                     0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                     0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                     0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                     0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                     0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
-            }
+        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) {
+            int channel_dim = sd_version_is_flux2(version) ? 2 : 3;
+            std::vector<float> latents_mean_vec;
+            std::vector<float> latents_std_vec;
+            get_latents_mean_std_vec(latent, channel_dim, latents_mean_vec, latents_std_vec);
+
+            float mean;
+            float std_;
            for (int i = 0; i < latent->ne[3]; i++) {
-                float mean = latents_mean_vec[i];
-                float std_ = latents_std_vec[i];
+                if (channel_dim == 3) {
+                    mean = latents_mean_vec[i];
+                    std_ = latents_std_vec[i];
+                }
                for (int j = 0; j < latent->ne[2]; j++) {
+                    if (channel_dim == 2) {
+                        mean = latents_mean_vec[i];
+                        std_ = latents_std_vec[i];
+                    }
                    for (int k = 0; k < latent->ne[1]; k++) {
                        for (int l = 0; l < latent->ne[0]; l++) {
                            float value = ggml_ext_tensor_get_f32(latent, l, k, j, i);
@ -2258,6 +2341,7 @@ public:
        if (use_tiny_autoencoder ||
            sd_version_is_qwen_image(version) ||
            sd_version_is_wan(version) ||
+            sd_version_is_flux2(version) ||
            version == VERSION_CHROMA_RADIANCE) {
            latent = vae_output;
        } else if (version == VERSION_SD1_PIX2PIX) {
@ -2463,6 +2547,7 @@ const char* prediction_to_str[] = {
    "edm_v",
    "sd3_flow",
    "flux_flow",
+    "flux2_flow",
 };

 const char* sd_prediction_name(enum prediction_t prediction) {
@ -2567,8 +2652,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
             "clip_g_path: %s\n"
             "clip_vision_path: %s\n"
             "t5xxl_path: %s\n"
-             "qwen2vl_path: %s\n"
-             "qwen2vl_vision_path: %s\n"
+             "llm_path: %s\n"
+             "llm_vision_path: %s\n"
             "diffusion_model_path: %s\n"
             "high_noise_diffusion_model_path: %s\n"
             "vae_path: %s\n"
@ -2598,8 +2683,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
             SAFE_STR(sd_ctx_params->clip_g_path),
             SAFE_STR(sd_ctx_params->clip_vision_path),
             SAFE_STR(sd_ctx_params->t5xxl_path),
-             SAFE_STR(sd_ctx_params->qwen2vl_path),
-             SAFE_STR(sd_ctx_params->qwen2vl_vision_path),
+             SAFE_STR(sd_ctx_params->llm_path),
+             SAFE_STR(sd_ctx_params->llm_vision_path),
             SAFE_STR(sd_ctx_params->diffusion_model_path),
             SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path),
             SAFE_STR(sd_ctx_params->vae_path),
@ -3233,7 +3318,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
    LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);

    int sample_steps          = sd_img_gen_params->sample_params.sample_steps;
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps, sd_img_gen_params->sample_params.scheduler, sd_ctx->sd->version);
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps,
+                                                                 sd_ctx->sd->get_image_seq_len(height, width),
+                                                                 sd_img_gen_params->sample_params.scheduler,
+                                                                 sd_ctx->sd->version);

    ggml_tensor* init_latent   = nullptr;
    ggml_tensor* concat_latent = nullptr;
@ -3486,7 +3574,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
    if (high_noise_sample_steps > 0) {
        total_steps += high_noise_sample_steps;
    }
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, sd_vid_gen_params->sample_params.scheduler, sd_ctx->sd->version);
+    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, 0, sd_vid_gen_params->sample_params.scheduler, sd_ctx->sd->version);

    if (high_noise_sample_steps < 0) {
        // timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@ -71,6 +71,7 @@ enum prediction_t {
    EDM_V_PRED,
    SD3_FLOW_PRED,
    FLUX_FLOW_PRED,
+    FLUX2_FLOW_PRED,
    PREDICTION_COUNT
 };

@ -156,8 +157,8 @@ typedef struct {
    const char* clip_g_path;
    const char* clip_vision_path;
    const char* t5xxl_path;
-    const char* qwen2vl_path;
-    const char* qwen2vl_vision_path;
+    const char* llm_path;
+    const char* llm_vision_path;
    const char* diffusion_model_path;
    const char* high_noise_diffusion_model_path;
    const char* vae_path;
@ -282,11 +283,11 @@ typedef struct sd_ctx_t sd_ctx_t;

 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
-typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy);
+typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy, void* data);

 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
-SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy);
+SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy, void* data);
 SD_API int32_t sd_get_num_physical_cores();
 SD_API const char* sd_get_system_info();

--- a/otherarch/sdcpp/tokenize_util.cpp
+++ b/otherarch/sdcpp/tokenize_util.cpp
@ -811,6 +811,8 @@ bool starts_with(const std::vector<char32_t>& text,
    return std::equal(prefix.begin(), prefix.end(), text.begin() + index);
 }

+// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
+// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
 std::vector<std::string> token_split(const std::string& text) {
    std::vector<std::string> tokens;
    auto cps = utf8_to_codepoints(text);
--- a/otherarch/sdcpp/util.cpp
+++ b/otherarch/sdcpp/util.cpp
@ -177,6 +177,7 @@ static sd_progress_cb_t sd_progress_cb = nullptr;
 void* sd_progress_cb_data              = nullptr;

 static sd_preview_cb_t sd_preview_cb = nullptr;
+static void* sd_preview_cb_data      = nullptr;
 preview_t sd_preview_mode            = PREVIEW_NONE;
 int sd_preview_interval              = 1;
 bool sd_preview_denoised             = true;
@ -270,13 +271,16 @@ void pretty_progress(int step, int steps, float time) {
        }
    }
    progress += "|";
-    printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K",
-           progress.c_str(), step, steps,
-           time > 1.0f || time == 0 ? time : (1.0f / time));
-    fflush(stdout);  // for linux
-    if (step == steps) {
-        printf("\n");
+
+    const char* lf   = (step == steps ? "\n" : "");
+    const char* unit = "s/it";
+    float speed      = time;
+    if (speed < 1.0f && speed > 0.f) {
+        speed = 1.0f / speed;
+        unit  = "it/s";
    }
+    printf("\r%s %i/%i - %.2f%s\033[K%s", progress.c_str(), step, steps, speed, unit, lf);
+    fflush(stdout);  // for linux
 }

 std::string ltrim(const std::string& s) {
@ -355,8 +359,9 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
    sd_progress_cb      = cb;
    sd_progress_cb_data = data;
 }
-void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1, bool denoised = true, bool noisy = false) {
+void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy, void* data) {
    sd_preview_cb       = cb;
+    sd_preview_cb_data  = data;
    sd_preview_mode     = mode;
    sd_preview_interval = interval;
    sd_preview_denoised = denoised;
@ -366,6 +371,9 @@ void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ,
 sd_preview_cb_t sd_get_preview_callback() {
    return sd_preview_cb;
 }
+void* sd_get_preview_callback_data() {
+    return sd_preview_cb_data;
+}

 preview_t sd_get_preview_mode() {
    return sd_preview_mode;
--- a/otherarch/sdcpp/util.h
+++ b/otherarch/sdcpp/util.h
@ -56,6 +56,7 @@ sd_progress_cb_t sd_get_progress_callback();
 void* sd_get_progress_callback_data();

 sd_preview_cb_t sd_get_preview_callback();
+void* sd_get_preview_callback_data();
 preview_t sd_get_preview_mode();
 int sd_get_preview_interval();
 bool sd_should_preview_denoised();
--- a/otherarch/sdcpp/vae.hpp
+++ b/otherarch/sdcpp/vae.hpp
@ -487,6 +487,7 @@ public:
 // ldm.models.autoencoder.AutoencoderKL
 class AutoencodingEngine : public GGMLBlock {
 protected:
+    SDVersion version;
    bool decode_only       = true;
    bool use_video_decoder = false;
    bool use_quant         = true;
@ -507,10 +508,15 @@ public:
                       bool decode_only           = true,
                       bool use_linear_projection = false,
                       bool use_video_decoder     = false)
-        : decode_only(decode_only), use_video_decoder(use_video_decoder) {
+        : version(version), decode_only(decode_only), use_video_decoder(use_video_decoder) {
        if (sd_version_is_dit(version)) {
-            dd_config.z_channels = 16;
-            use_quant            = false;
+            if (sd_version_is_flux2(version)) {
+                dd_config.z_channels = 32;
+                embed_dim            = 32;
+            } else {
+                use_quant            = false;
+                dd_config.z_channels = 16;
+            }
        }
        if (use_video_decoder) {
            use_quant = false;
@ -547,6 +553,24 @@ public:

    struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
        // z: [N, z_channels, h, w]
+        if (sd_version_is_flux2(version)) {
+            // [N, C*p*p, h, w] -> [N, C, h*p, w*p]
+            int64_t p = 2;
+
+            int64_t N = z->ne[3];
+            int64_t C = z->ne[2] / p / p;
+            int64_t h = z->ne[1];
+            int64_t w = z->ne[0];
+            int64_t H = h * p;
+            int64_t W = w * p;
+
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, w * h, p * p, C, N);                           // [N, C, p*p, h*w]
+            z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 1, 0, 2, 3));  // [N, C, h*w, p*p]
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, p, p, w, h * C * N);                           // [N*C*h, w, p, p]
+            z = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, z, 0, 2, 1, 3));  // [N*C*h, p, w, p]
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, W, H, C, N);                                   // [N, C, h*p, w*p]
+        }
+
        if (use_quant) {
            auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
            z                    = post_quant_conv->forward(ctx, z);  // [N, z_channels, h, w]
@ -563,12 +587,30 @@ public:
        // x: [N, in_channels, h, w]
        auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);

-        auto h = encoder->forward(ctx, x);  // [N, 2*z_channels, h/8, w/8]
+        auto z = encoder->forward(ctx, x);  // [N, 2*z_channels, h/8, w/8]
        if (use_quant) {
            auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
-            h               = quant_conv->forward(ctx, h);  // [N, 2*embed_dim, h/8, w/8]
+            z               = quant_conv->forward(ctx, z);  // [N, 2*embed_dim, h/8, w/8]
        }
-        return h;
+        if (sd_version_is_flux2(version)) {
+            z = ggml_ext_chunk(ctx->ggml_ctx, z, 2, 2)[0];
+
+            // [N, C, H, W] -> [N, C*p*p, H/p, W/p]
+            int64_t p = 2;
+            int64_t N = z->ne[3];
+            int64_t C = z->ne[2];
+            int64_t H = z->ne[1];
+            int64_t W = z->ne[0];
+            int64_t h = H / p;
+            int64_t w = W / p;
+
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, p, w, p, h * C * N);                 // [N*C*h, p, w, p]
+            z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 0, 2, 1, 3));  // [N*C*h, w, p, p]
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, p * p, w * h, C, N);                 // [N, C, h*w, p*p]
+            z = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, z, 1, 0, 2, 3));  // [N, C, p*p, h*w]
+            z = ggml_reshape_4d(ctx->ggml_ctx, z, w, h, p * p * C, N);                 // [N, C*p*p, h*w]
+        }
+        return z;
    }
 };

--- a/otherarch/sdcpp/vocab_mistral.hpp
+++ b/otherarch/sdcpp/vocab_mistral.hpp