resync and updated sdcpp for flux and sd3 support

2025-09-10 09:04:36 +00:00 · 2024-11-03 22:03:16 +08:00 · 2024-11-03 22:03:16 +08:00 · f32a874966
commit f32a874966
parent 33721615b5
30 changed files with 2434248 additions and 1729 deletions
--- a/otherarch/sdcpp/clip.hpp
+++ b/otherarch/sdcpp/clip.hpp
@ -31,16 +31,6 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
    return std::make_pair(filename2multiplier, text);
 }

-const std::string UNK_TOKEN = "<|endoftext|>";
-const std::string BOS_TOKEN = "<|startoftext|>";
-const std::string EOS_TOKEN = "<|endoftext|>";
-const std::string PAD_TOEKN = "<|endoftext|>";
-
-const int UNK_TOKEN_ID = 49407;
-const int BOS_TOKEN_ID = 49406;
-const int EOS_TOKEN_ID = 49407;
-const int PAD_TOKEN_ID = 49407;
-
 std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
    std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
    std::set<int> byte_set;
@ -73,12 +63,27 @@ typedef std::function<bool(std::string&, std::vector<int32_t>&)> on_new_token_cb

 class CLIPTokenizer {
 private:
-    SDVersion version = VERSION_1_x;
    std::map<int, std::u32string> byte_encoder;
+    std::map<std::u32string, int> byte_decoder;
    std::map<std::u32string, int> encoder;
+    std::map<int, std::u32string> decoder;
    std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;
    std::regex pat;
+    int encoder_len;
+    int bpe_len;

+public:
+    const std::string UNK_TOKEN = "<|endoftext|>";
+    const std::string BOS_TOKEN = "<|startoftext|>";
+    const std::string EOS_TOKEN = "<|endoftext|>";
+    const std::string PAD_TOKEN = "<|endoftext|>";
+
+    const int UNK_TOKEN_ID = 49407;
+    const int BOS_TOKEN_ID = 49406;
+    const int EOS_TOKEN_ID = 49407;
+    const int PAD_TOKEN_ID = 49407;
+
+private:
    static std::string strip(const std::string& str) {
        std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
        std::string::size_type end   = str.find_last_not_of(" \t\n\r\v\f");
@ -113,12 +118,22 @@ private:
    }

 public:
-    CLIPTokenizer(SDVersion version = VERSION_1_x)
-        : version(version) {}
+    // Builds the tokenizer: records the padding token id, then loads the BPE
+    // vocabulary from `merges_utf8_str`. When no merges text is supplied the
+    // text returned by ModelLoader::load_merges() is used instead.
+    CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
+        : PAD_TOKEN_ID(pad_token_id) {
+        if (merges_utf8_str.empty()) {
+            load_from_merges(ModelLoader::load_merges());
+            return;
+        }
+        load_from_merges(merges_utf8_str);
+    }

    void load_from_merges(const std::string& merges_utf8_str) {
        auto byte_unicode_pairs = bytes_to_unicode();
-        byte_encoder            = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
+        // printf("byte_unicode_pairs have %lu pairs \n", byte_unicode_pairs.size());
+        byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
+        for (auto& pair : byte_unicode_pairs) {
+            byte_decoder[pair.second] = pair.first;
+        }
        // for (auto & pair: byte_unicode_pairs) {
        //     std::cout << pair.first << ": " << pair.second << std::endl;
        // }
@ -138,6 +153,8 @@ public:
            size_t space_pos = merge.find(' ');
            merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
            // LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
+            // printf("%s :: %s | %s \n", utf32_to_utf8(merge).c_str(), utf32_to_utf8(merge.substr(0, space_pos)).c_str(),
+            //                     utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
        }
        std::vector<std::u32string> vocab;
        for (const auto& pair : byte_unicode_pairs) {
@ -154,15 +171,36 @@ public:
        LOG_DEBUG("vocab size: %llu", vocab.size());
        int i = 0;
        for (const auto& token : vocab) {
-            encoder[token] = i++;
+            encoder[token] = i;
+            decoder[i]     = token;
+            i++;
+        }
+        encoder_len = i;
+
+        auto it = encoder.find(utf8_to_utf32("img</w>"));
+        if (it != encoder.end()) {
+            LOG_DEBUG(" trigger word img already in vocab");
+        } else {
+            LOG_DEBUG(" trigger word img not in vocab yet");
        }

        int rank = 0;
        for (const auto& merge : merge_pairs) {
            bpe_ranks[merge] = rank++;
        }
+        bpe_len = rank;
    };

+    // Registers `text` as a new vocabulary entry.
+    //
+    // The token is converted to UTF-32, given the next free id (`encoder_len`)
+    // and recorded in both `encoder` (token -> id) and `decoder` (id -> token).
+    // A token that is already in the vocabulary keeps its existing id and
+    // nothing is changed.
+    void add_token(const std::string& text) {
+        std::u32string token = utf8_to_utf32(text);
+        // Insert only when the token is missing. The earlier `!=` test was
+        // inverted: it re-numbered tokens that already existed and silently
+        // ignored genuinely new ones.
+        if (encoder.find(token) == encoder.end()) {
+            encoder[token]       = encoder_len;
+            decoder[encoder_len] = token;
+            encoder_len++;
+        }
+    }
+
    std::u32string bpe(const std::u32string& token) {
        std::vector<std::u32string> word;

@ -243,6 +281,7 @@ public:
                              size_t max_length = 0,
                              bool padding      = false) {
        std::vector<int32_t> tokens = encode(text, on_new_token_cb);
+
        tokens.insert(tokens.begin(), BOS_TOKEN_ID);
        if (max_length > 0) {
            if (tokens.size() > max_length - 1) {
@ -251,17 +290,83 @@ public:
            } else {
                tokens.push_back(EOS_TOKEN_ID);
                if (padding) {
-                    int pad_token_id = PAD_TOKEN_ID;
-                    if (version == VERSION_2_x) {
-                        pad_token_id = 0;
-                    }
-                    tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
+                    tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID);
                }
            }
        }
+
        return tokens;
    }

+    // Re-chunks `tokens` / `weights` in place into n windows of `max_length`
+    // entries each, where n = ceil(tokens.size() / (max_length - 2)), at least 1.
+    //
+    // Every window starts with BOS_TOKEN_ID; a window that is completely filled
+    // ends with EOS_TOKEN_ID; one EOS_TOKEN_ID follows the last prompt token and
+    // the rest of the final window is filled with PAD_TOKEN_ID. All inserted
+    // BOS/EOS/PAD entries get weight 1.0.
+    //
+    // The vectors are left untouched unless `padding` is true and `max_length`
+    // leaves room for at least one prompt token between BOS and EOS
+    // (max_length > 2).
+    void pad_tokens(std::vector<int>& tokens,
+                    std::vector<float>& weights,
+                    size_t max_length = 0,
+                    bool padding      = false) {
+        // max_length of 1 or 2 would make the per-window payload zero (division
+        // by zero) or wrap around as an unsigned value.
+        if (!padding || max_length <= 2) {
+            return;
+        }
+
+        const size_t payload = max_length - 2;  // prompt tokens per window
+        size_t n             = (tokens.size() + payload - 1) / payload;
+        if (n == 0) {
+            n = 1;  // an empty prompt still yields one padded window
+        }
+        const size_t length = max_length * n;
+        LOG_DEBUG("token length: %llu", static_cast<unsigned long long>(length));
+
+        std::vector<int> new_tokens;
+        std::vector<float> new_weights;
+        new_tokens.reserve(length);
+        new_weights.reserve(length);
+        new_tokens.push_back(BOS_TOKEN_ID);
+        new_weights.push_back(1.0f);
+
+        size_t token_idx = 0;
+        for (size_t i = 1; i < length && token_idx < tokens.size(); i++) {
+            const size_t slot = i % max_length;
+            if (slot == 0) {
+                // first slot of a follow-up window
+                new_tokens.push_back(BOS_TOKEN_ID);
+                new_weights.push_back(1.0f);
+            } else if (slot == max_length - 1) {
+                // last slot of a full window
+                new_tokens.push_back(EOS_TOKEN_ID);
+                new_weights.push_back(1.0f);
+            } else {
+                new_tokens.push_back(tokens[token_idx]);
+                // Fall back to a neutral weight if `weights` is shorter than `tokens`.
+                new_weights.push_back(token_idx < weights.size() ? weights[token_idx] : 1.0f);
+                token_idx++;
+            }
+        }
+
+        // Terminate the prompt, then pad the last window up to `length`.
+        new_tokens.push_back(EOS_TOKEN_ID);
+        new_weights.push_back(1.0f);
+        new_tokens.resize(length, PAD_TOKEN_ID);
+        new_weights.resize(length, 1.0f);
+
+        tokens.swap(new_tokens);
+        weights.swap(new_weights);
+    }
+
+    // Converts token ids back into a space-separated string.
+    //
+    // BOS/EOS ids are skipped. Every other id is looked up in `decoder`,
+    // converted to UTF-8, stripped of a trailing "</w>" marker and appended
+    // after a single space; ids that are not in `decoder` contribute an empty
+    // piece. The result is passed through trim() before being returned.
+    //
+    // NOTE(review): `byte_decoder` is not applied here, so the output appears
+    // to stay in the tokenizer's byte-level alphabet rather than raw bytes --
+    // confirm whether callers rely on that.
+    std::string decode(const std::vector<int>& tokens) {
+        std::string text = "";
+        for (int t : tokens) {
+            if (t == BOS_TOKEN_ID || t == EOS_TOKEN_ID)
+                continue;
+            // find() rather than operator[] so that unknown ids do not insert
+            // empty entries into `decoder`.
+            auto it = decoder.find(t);
+            std::string s;
+            if (it != decoder.end()) {
+                s = utf32_to_utf8(it->second);
+            }
+            if (s.length() >= 4 && ends_with(s, "</w>")) {
+                // Drop exactly the 4-character end-of-word marker. The earlier
+                // replace() call passed an end index as the count and left a
+                // stray ">" when the token was exactly "</w>".
+                s.erase(s.length() - 4);
+            }
+            text += " " + s;
+        }
+        return trim(text);
+    }
+
    std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
        std::string original_text = text;
        std::vector<int32_t> bpe_tokens;
@ -283,7 +388,7 @@ public:
                std::string token_str = token.str();
                std::u32string utf32_token;
                for (int i = 0; i < token_str.length(); i++) {
-                    char b = token_str[i];
+                    unsigned char b = token_str[i];
                    utf32_token += byte_encoder[b];
                }
                auto bpe_strs = bpe(utf32_token);
@ -308,118 +413,12 @@ public:
            ss << "\"" << token << "\", ";
        }
        ss << "]";
-        LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
+        // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
+        // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
        return bpe_tokens;
    }
 };

-// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
-//
-// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
-// Accepted tokens are:
-//   (abc) - increases attention to abc by a multiplier of 1.1
-//   (abc:3.12) - increases attention to abc by a multiplier of 3.12
-//   [abc] - decreases attention to abc by a multiplier of 1.1
-//   \( - literal character '('
-//   \[ - literal character '['
-//   \) - literal character ')'
-//   \] - literal character ']'
-//   \\ - literal character '\'
-//   anything else - just text
-//
-// >>> parse_prompt_attention('normal text')
-// [['normal text', 1.0]]
-// >>> parse_prompt_attention('an (important) word')
-// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
-// >>> parse_prompt_attention('(unbalanced')
-// [['unbalanced', 1.1]]
-// >>> parse_prompt_attention('\(literal\]')
-// [['(literal]', 1.0]]
-// >>> parse_prompt_attention('(unnecessary)(parens)')
-// [['unnecessaryparens', 1.1]]
-// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
-// [['a ', 1.0],
-//  ['house', 1.5730000000000004],
-//  [' ', 1.1],
-//  ['on', 1.0],
-//  [' a ', 1.1],
-//  ['hill', 0.55],
-//  [', sun, ', 1.1],
-//  ['sky', 1.4641000000000006],
-//  ['.', 1.1]]
-std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
-    std::vector<std::pair<std::string, float>> res;
-    std::vector<int> round_brackets;
-    std::vector<int> square_brackets;
-
-    float round_bracket_multiplier  = 1.1f;
-    float square_bracket_multiplier = 1 / 1.1f;
-
-    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
-    std::regex re_break(R"(\s*\bBREAK\b\s*)");
-
-    auto multiply_range = [&](int start_position, float multiplier) {
-        for (int p = start_position; p < res.size(); ++p) {
-            res[p].second *= multiplier;
-        }
-    };
-
-    std::smatch m;
-    std::string remaining_text = text;
-
-    while (std::regex_search(remaining_text, m, re_attention)) {
-        std::string text   = m[0];
-        std::string weight = m[1];
-
-        if (text == "(") {
-            round_brackets.push_back((int)res.size());
-        } else if (text == "[") {
-            square_brackets.push_back((int)res.size());
-        } else if (!weight.empty()) {
-            if (!round_brackets.empty()) {
-                multiply_range(round_brackets.back(), std::stof(weight));
-                round_brackets.pop_back();
-            }
-        } else if (text == ")" && !round_brackets.empty()) {
-            multiply_range(round_brackets.back(), round_bracket_multiplier);
-            round_brackets.pop_back();
-        } else if (text == "]" && !square_brackets.empty()) {
-            multiply_range(square_brackets.back(), square_bracket_multiplier);
-            square_brackets.pop_back();
-        } else if (text == "\\(") {
-            res.push_back({text.substr(1), 1.0f});
-        } else {
-            res.push_back({text, 1.0f});
-        }
-
-        remaining_text = m.suffix();
-    }
-
-    for (int pos : round_brackets) {
-        multiply_range(pos, round_bracket_multiplier);
-    }
-
-    for (int pos : square_brackets) {
-        multiply_range(pos, square_bracket_multiplier);
-    }
-
-    if (res.empty()) {
-        res.push_back({"", 1.0f});
-    }
-
-    int i = 0;
-    while (i + 1 < res.size()) {
-        if (res[i].second == res[i + 1].second) {
-            res[i].first += res[i + 1].first;
-            res.erase(res.begin() + i + 1);
-        } else {
-            ++i;
-        }
-    }
-
-    return res;
-}
-
 /*================================================ FrozenCLIPEmbedder ================================================*/

 // Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
@ -469,7 +468,8 @@ public:
        : d_model(d_model),
          n_head(n_head),
          intermediate_size(intermediate_size) {
-        blocks["self_attn"]   = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head));
+        blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true));
+
        blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
        blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));

@ -508,7 +508,7 @@ public:
    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, int clip_skip = -1, bool mask = true) {
        // x: [N, n_token, d_model]
        int layer_idx = n_layer - 1;
-        LOG_DEBUG("clip_skip %d", clip_skip);
+        // LOG_DEBUG("clip_skip %d", clip_skip);
        if (clip_skip > 0) {
            layer_idx = n_layer - clip_skip;
        }
@ -520,7 +520,7 @@ public:
            }
            std::string name = "layers." + std::to_string(i);
            auto layer       = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
-            x                = layer->forward(ctx, x);  // [N, n_token, d_model]
+            x                = layer->forward(ctx, x, mask);  // [N, n_token, d_model]
            // LOG_DEBUG("layer %d", i);
        }
        return x;
@ -558,11 +558,14 @@ public:
        auto token_embed_weight    = params["token_embedding.weight"];
        auto position_embed_weight = params["position_embedding.weight"];

-        GGML_ASSERT(input_ids->ne[0] <= position_embed_weight->ne[0]);
+        GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
+        input_ids            = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
+        auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids);
+        token_embedding      = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);

        // token_embedding + position_embedding
        auto x = ggml_add(ctx,
-                          ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids),
+                          token_embedding,
                          position_embed_weight);  // [N, n_token, embed_dim]
        return x;
    }
@ -617,8 +620,8 @@ public:
        class_embedding                     = ggml_repeat(ctx, class_embed_weight, class_embedding);      // [N, embed_dim]
        class_embedding                     = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]

-        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2);    // [N, num_positions, embed_dim, 1]
-        x                     = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);  // [N, num_positions, embed_dim]
+        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
+        x                     = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);   // [N, num_positions, embed_dim]
        x                     = ggml_add(ctx, x, position_embed_weight);
        return x;  // [N, num_positions, embed_dim]
    }
@ -717,11 +720,6 @@ public:
 };

 class CLIPVisionModel : public GGMLBlock {
-protected:
-    void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        params["visual_projection"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, projection_dim, hidden_size);
-    }
-
 public:
    // network hparams
    int32_t num_channels      = 3;
@ -732,16 +730,14 @@ public:
    int32_t intermediate_size = 4096;
    int32_t n_head            = 16;
    int32_t n_layer           = 24;
-    int32_t projection_dim    = 768;

 public:
-    CLIPVisionModel(CLIPVersion version = OPEN_CLIP_VIT_H_14) {
+    CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size       = 1280;
            intermediate_size = 5120;
            n_head            = 16;
            n_layer           = 32;
-            projection_dim    = 1024;
        } else if (version == OPEN_CLIP_VIT_BIGG_14) {
            hidden_size       = 1664;
            intermediate_size = 8192;
@ -755,9 +751,8 @@ public:
        blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
        // pixel_values: [N, num_channels, image_size, image_size]
-        // return: // [N, projection_dim]
        auto embeddings     = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
        auto pre_layernorm  = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
        auto encoder        = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@ -765,26 +760,60 @@ public:

        auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
        x      = pre_layernorm->forward(ctx, x);
-        x      = encoder->forward(ctx, x, -1, true);
+        x      = encoder->forward(ctx, x, -1, false);
        x      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]

-        GGML_ASSERT(x->ne[2] == 1);
-        int64_t max_token_idx  = 0;
-        ggml_tensor* pooled    = ggml_view_1d(ctx, x, x->ne[0], x->nb[1] * max_token_idx);  // assert N == 1
-        auto visual_projection = params["visual_projection"];
-        pooled                 = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, visual_projection)), pooled);
-        return pooled;  // [N, projection_dim]
+        GGML_ASSERT(x->ne[3] == 1);
+        if (return_pooled) {
+            ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
+            return pooled;  // [N, hidden_size]
+        } else {
+            return x;  // [N, n_token, hidden_size]
+        }
+    }
+};
+
+// Bias-free projection block: forward() applies ggml_nn_linear with the
+// block's "weight" parameter and a NULL bias.
+//
+// With `transpose_weight` set, the parameter is allocated with dimensions
+// (out_features, in_features) instead of (in_features, out_features) and is
+// transposed (and made contiguous) at graph-build time before use.
+class CLIPProjection : public UnaryBlock {
+protected:
+    int64_t in_features;
+    int64_t out_features;
+    bool transpose_weight;
+
+    // Allocates the "weight" tensor of type `wtype` in `ctx`; the dimension
+    // order depends on `transpose_weight`.
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        if (transpose_weight) {
+            // A transposed layout is a supported configuration, not a failure,
+            // so it is reported at debug level rather than as an error.
+            LOG_DEBUG("transpose_weight");
+            params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
+        } else {
+            params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
+        }
+    }
+
+public:
+    // in_features / out_features: dimensions used to allocate the weight.
+    // transpose_weight: allocate the weight in swapped dimension order (see above).
+    CLIPProjection(int64_t in_features,
+                   int64_t out_features,
+                   bool transpose_weight = false)
+        : in_features(in_features),
+          out_features(out_features),
+          transpose_weight(transpose_weight) {}
+
+    // Adds the projection of `x` to the graph in `ctx` and returns the result tensor.
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* w = params["weight"];
+        if (transpose_weight) {
+            // ggml_transpose only swaps strides; ggml_cont materialises the layout.
+            w = ggml_cont(ctx, ggml_transpose(ctx, w));
+        }
+        return ggml_nn_linear(ctx, x, w, NULL);
    }
 };

 class CLIPVisionModelProjection : public GGMLBlock {
 public:
    int32_t hidden_size    = 1024;
-    int32_t projection_dim = 1024;
+    int32_t projection_dim = 768;
    int32_t image_size     = 224;

 public:
-    CLIPVisionModelProjection(CLIPVersion version = OPEN_CLIP_VIT_H_14) {
+    CLIPVisionModelProjection(CLIPVersion version   = OPENAI_CLIP_VIT_L_14,
+                              bool transpose_proj_w = false) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size    = 1280;
            projection_dim = 1024;
@ -792,199 +821,86 @@ public:
            hidden_size = 1664;
        }

-        blocks["visual_model"]      = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version));
-        blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, projection_dim, false));
+        blocks["vision_model"]      = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version));
+        blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
        // pixel_values: [N, num_channels, image_size, image_size]
-        // return: [N, num_positions, projection_dim]
-        auto visual_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["visual_model"]);
-        auto visual_projection = std::dynamic_pointer_cast<Linear>(blocks["visual_projection"]);
+        // return: [N, projection_dim]
+        auto vision_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
+        auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);

-        auto x = visual_model->forward(ctx, pixel_values);  // [N, embed_dim]
+        auto x = vision_model->forward(ctx, pixel_values);  // [N, hidden_size]
        x      = visual_projection->forward(ctx, x);        // [N, projection_dim]

        return x;  // [N, projection_dim]
    }
 };

-// ldm.modules.encoders.modules.FrozenCLIPEmbedder
-// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
-struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
-    SDVersion version = VERSION_1_x;
-    CLIPTokenizer tokenizer;
-    CLIPTextModel text_model;
-    CLIPTextModel text_model2;
+struct CLIPTextModelRunner : public GGMLRunner {
+    CLIPTextModel model;

-    std::string embd_dir;
-    int32_t num_custom_embeddings = 0;
-    std::vector<uint8_t> token_embed_custom;
-    std::vector<std::string> readed_embeddings;
-
-    FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
-                                      ggml_type wtype,
-                                      SDVersion version = VERSION_1_x,
-                                      int clip_skip     = -1)
-        : GGMLModule(backend, wtype), version(version), tokenizer(version) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (version == VERSION_2_x || version == VERSION_XL) {
-                clip_skip = 2;
-            }
-        }
-        if (version == VERSION_1_x) {
-            text_model = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip);
-            text_model.init(params_ctx, wtype);
-        } else if (version == VERSION_2_x) {
-            text_model = CLIPTextModel(OPEN_CLIP_VIT_H_14, clip_skip);
-            text_model.init(params_ctx, wtype);
-        } else if (version == VERSION_XL) {
-            text_model  = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip, false);
-            text_model2 = CLIPTextModel(OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-            text_model.init(params_ctx, wtype);
-            text_model2.init(params_ctx, wtype);
-        }
+    CLIPTextModelRunner(ggml_backend_t backend,
+                        ggml_type wtype,
+                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
+                        int clip_skip_value = 1,
+                        bool with_final_ln  = true)
+        : GGMLRunner(backend, wtype), model(version, clip_skip_value, with_final_ln) {
+        model.init(params_ctx, wtype);
    }

    std::string get_desc() {
        return "clip";
    }

-    size_t get_params_mem_size() {
-        size_t params_mem_size = text_model.get_params_mem_size();
-        if (version == VERSION_XL) {
-            params_mem_size += text_model2.get_params_mem_size();
-        }
-        return params_mem_size;
-    }
-
-    size_t get_params_num() {
-        size_t params_num = text_model.get_params_num();
-        if (version == VERSION_XL) {
-            params_num += text_model2.get_params_num();
-        }
-        return params_num;
-    }
-
    void set_clip_skip(int clip_skip) {
-        text_model.set_clip_skip(clip_skip);
-        if (version == VERSION_XL) {
-            text_model2.set_clip_skip(clip_skip);
-        }
+        model.set_clip_skip(clip_skip);
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        text_model.get_param_tensors(tensors, prefix + "transformer.text_model");
-        if (version == VERSION_XL) {
-            text_model2.get_param_tensors(tensors, prefix + "1.transformer.text_model");
-        }
-    }
-
-    bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
-        // the order matters
-        ModelLoader model_loader;
-        if (!model_loader.init_from_file(embd_path)) {
-            LOG_ERROR("embedding '%s' failed", embd_name.c_str());
-            return false;
-        }
-        struct ggml_init_params params;
-        params.mem_size               = 32 * 1024;  // max for custom embeddings 32 KB
-        params.mem_buffer             = NULL;
-        params.no_alloc               = false;
-        struct ggml_context* embd_ctx = ggml_init(params);
-        struct ggml_tensor* embd      = NULL;
-        auto on_load                  = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
-            if (tensor_storage.ne[0] != text_model.hidden_size) {
-                LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], text_model.hidden_size);
-                return false;
-            }
-            embd        = ggml_new_tensor_2d(embd_ctx, wtype, text_model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
-            *dst_tensor = embd;
-            return true;
-        };
-        model_loader.load_tensors(on_load, NULL);
-        readed_embeddings.push_back(embd_name);
-        token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
-        memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * text_model.hidden_size * ggml_type_size(wtype)),
-               embd->data,
-               ggml_nbytes(embd));
-        for (int i = 0; i < embd->ne[1]; i++) {
-            bpe_tokens.push_back(text_model.vocab_size + num_custom_embeddings);
-            // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
-            num_custom_embeddings++;
-        }
-        LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
-        return true;
+        model.get_param_tensors(tensors, prefix);
    }

    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* input_ids,
-                                struct ggml_tensor* input_ids2,
                                struct ggml_tensor* embeddings,
                                size_t max_token_idx = 0,
                                bool return_pooled   = false) {
-        if (return_pooled) {
-            return text_model2.forward(ctx, input_ids2, NULL, max_token_idx, return_pooled);
+        size_t N       = input_ids->ne[1];
+        size_t n_token = input_ids->ne[0];
+        if (input_ids->ne[0] > model.n_token) {
+            GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
+            input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
        }
-        auto hidden_states = text_model.forward(ctx, input_ids, embeddings);  // [N, n_token, hidden_size]
-        // LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
-        if (version == VERSION_XL) {
-            hidden_states = ggml_reshape_4d(ctx,
-                                            hidden_states,
-                                            hidden_states->ne[0],
-                                            hidden_states->ne[1],
-                                            hidden_states->ne[2],
-                                            hidden_states->ne[3]);
-            hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 2, 0, 1, 3));

-            auto hidden_states2 = text_model2.forward(ctx, input_ids2, NULL);  // [N, n_token, hidden_size2]
-            // LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
-            hidden_states2 = ggml_reshape_4d(ctx,
-                                             hidden_states2,
-                                             hidden_states2->ne[0],
-                                             hidden_states2->ne[1],
-                                             hidden_states2->ne[2],
-                                             hidden_states2->ne[3]);
-            hidden_states2 = ggml_cont(ctx, ggml_permute(ctx, hidden_states2, 2, 0, 1, 3));
-
-            hidden_states = ggml_concat(ctx, hidden_states, hidden_states2, 2);  // [N, n_token, hidden_size + hidden_size2]
-
-            hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 1, 2, 0, 3));
-        }
-        // LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
-        return hidden_states;
+        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
    }

    struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
-                                    struct ggml_tensor* input_ids2 = NULL,
-                                    size_t max_token_idx           = 0,
-                                    bool return_pooled             = false) {
+                                    int num_custom_embeddings    = 0,
+                                    void* custom_embeddings_data = NULL,
+                                    size_t max_token_idx         = 0,
+                                    bool return_pooled           = false) {
        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

-        input_ids2 = to_backend(input_ids2);
-        if (!return_pooled) {
-            input_ids = to_backend(input_ids);
-        }
+        input_ids = to_backend(input_ids);

        struct ggml_tensor* embeddings = NULL;

-        if (num_custom_embeddings > 0 && version != VERSION_XL) {
-            auto custom_embeddings = ggml_new_tensor_3d(compute_ctx,
+        if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
+            auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
                                                        wtype,
-                                                        text_model.hidden_size,
-                                                        1,
+                                                        model.hidden_size,
                                                        num_custom_embeddings);
-            set_backend_tensor_data(custom_embeddings, token_embed_custom.data());
+            set_backend_tensor_data(custom_embeddings, custom_embeddings_data);

-            auto token_embed_weight = text_model.get_token_embed_weight();
-            token_embed_weight      = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
+            auto token_embed_weight = model.get_token_embed_weight();
            // concatenate custom embeddings
-            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 2);
-            embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
+            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
        }

-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, input_ids2, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);

        ggml_build_forward_expand(gf, hidden_states);

@ -993,147 +909,17 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {

    void compute(const int n_threads,
                 struct ggml_tensor* input_ids,
-                 struct ggml_tensor* input_ids2,
+                 int num_custom_embeddings,
+                 void* custom_embeddings_data,
                 size_t max_token_idx,
                 bool return_pooled,
                 ggml_tensor** output,
                 ggml_context* output_ctx = NULL) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, input_ids2, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
        };
-        GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
-    }
-
-    std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
-                                                             bool padding = false) {
-        return tokenize(text, text_model.n_token, padding);
-    }
-
-    std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
-                                                             size_t max_length = 0,
-                                                             bool padding      = false) {
-        auto parsed_attention = parse_prompt_attention(text);
-
-        {
-            std::stringstream ss;
-            ss << "[";
-            for (const auto& item : parsed_attention) {
-                ss << "['" << item.first << "', " << item.second << "], ";
-            }
-            ss << "]";
-            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
-        }
-
-        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
-            }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
-            }
-            if (embd_path.size() > 0) {
-                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
-                    return true;
-                }
-            }
-            return false;
-        };
-
-        std::vector<int> tokens;
-        std::vector<float> weights;
-        for (const auto& item : parsed_attention) {
-            const std::string& curr_text = item.first;
-            float curr_weight            = item.second;
-            std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
-            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
-            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
-        }
-        tokens.insert(tokens.begin(), BOS_TOKEN_ID);
-        weights.insert(weights.begin(), 1.0);
-
-        if (max_length > 0) {
-            if (tokens.size() > max_length - 1) {
-                tokens.resize(max_length - 1);
-                weights.resize(max_length - 1);
-                tokens.push_back(EOS_TOKEN_ID);
-                weights.push_back(1.0);
-            } else {
-                tokens.push_back(EOS_TOKEN_ID);
-                weights.push_back(1.0);
-                if (padding) {
-                    int pad_token_id = PAD_TOKEN_ID;
-                    if (version == VERSION_2_x) {
-                        pad_token_id = 0;
-                    }
-                    tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
-                    weights.insert(weights.end(), max_length - weights.size(), 1.0);
-                }
-            }
-        }
-
-        // for (int i = 0; i < tokens.size(); i++) {
-        //     std::cout << tokens[i] << ":" << weights[i] << ", ";
-        // }
-        // std::cout << std::endl;
-
-        return {tokens, weights};
+        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }
 };

-struct FrozenCLIPVisionEmbedder : public GGMLModule {
-    CLIPVisionModel vision_model;
-
-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, ggml_type wtype)
-        : GGMLModule(backend, wtype) {
-        vision_model.init(params_ctx, wtype);
-    }
-
-    std::string get_desc() {
-        return "clip_vision";
-    }
-
-    size_t get_params_mem_size() {
-        return vision_model.get_params_mem_size();
-    }
-
-    size_t get_params_num() {
-        return vision_model.get_params_num();
-    }
-
-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        vision_model.get_param_tensors(tensors, prefix + "transformer.visual_model");
-    }
-
-    struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
-        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
-
-        pixel_values = to_backend(pixel_values);
-
-        struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, pixel_values);
-
-        ggml_build_forward_expand(gf, hidden_states);
-
-        return gf;
-    }
-
-    void compute(const int n_threads,
-                 ggml_tensor* pixel_values,
-                 ggml_tensor** output,
-                 ggml_context* output_ctx) {
-        auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(pixel_values);
-        };
-        GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
-    }
-};
-
-#endif  // __CLIP_HPP__
+#endif  // __CLIP_HPP__
--- a/otherarch/sdcpp/common.hpp
+++ b/otherarch/sdcpp/common.hpp
@ -279,26 +279,11 @@ public:
        int64_t n_context = context->ne[1];
        int64_t inner_dim = d_head * n_head;

-        auto q = to_q->forward(ctx, x);                                 // [N, n_token, inner_dim]
-        q      = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, n);   // [N, n_token, n_head, d_head]
-        q      = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));      // [N, n_head, n_token, d_head]
-        q      = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * n);  // [N * n_head, n_token, d_head]
+        auto q = to_q->forward(ctx, x);        // [N, n_token, inner_dim]
+        auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
+        auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]

-        auto k = to_k->forward(ctx, context);                             // [N, n_context, inner_dim]
-        k      = ggml_reshape_4d(ctx, k, d_head, n_head, n_context, n);   // [N, n_context, n_head, d_head]
-        k      = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));        // [N, n_head, n_context, d_head]
-        k      = ggml_reshape_3d(ctx, k, d_head, n_context, n_head * n);  // [N * n_head, n_context, d_head]
-
-        auto v = to_v->forward(ctx, context);                             // [N, n_context, inner_dim]
-        v      = ggml_reshape_4d(ctx, v, d_head, n_head, n_context, n);   // [N, n_context, n_head, d_head]
-        v      = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));        // [N, n_head, d_head, n_context]
-        v      = ggml_reshape_3d(ctx, v, n_context, d_head, n_head * n);  // [N * n_head, d_head, n_context]
-
-        auto kqv = ggml_nn_attention(ctx, q, k, v, false);  // [N * n_head, n_token, d_head]
-        kqv      = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, n);
-        kqv      = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, n_token, n_head, d_head]
-
-        x = ggml_reshape_3d(ctx, kqv, d_head * n_head, n_token, n);  // [N, n_token, inner_dim]
+        x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false);  // [N, n_token, inner_dim]

        x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
        return x;
@ -382,7 +367,7 @@ protected:
    int64_t n_head;
    int64_t d_head;
    int64_t depth       = 1;    // 1
-    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_2_x
+    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2

 public:
    SpatialTransformer(int64_t in_channels,
--- a/otherarch/sdcpp/conditioner.hpp
+++ b/otherarch/sdcpp/conditioner.hpp
--- a/otherarch/sdcpp/control.hpp
+++ b/otherarch/sdcpp/control.hpp
@ -14,7 +14,7 @@
 */
 class ControlNetBlock : public GGMLBlock {
 protected:
-    SDVersion version = VERSION_1_x;
+    SDVersion version = VERSION_SD1;
    // network hparams
    int in_channels                        = 4;
    int out_channels                       = 4;
@ -26,19 +26,19 @@ protected:
    int time_embed_dim                     = 1280;  // model_channels*4
    int num_heads                          = 8;
    int num_head_channels                  = -1;   // channels // num_heads
-    int context_dim                        = 768;  // 1024 for VERSION_2_x, 2048 for VERSION_XL
+    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL

 public:
    int model_channels  = 320;
-    int adm_in_channels = 2816;  // only for VERSION_XL
+    int adm_in_channels = 2816;  // only for VERSION_SDXL

-    ControlNetBlock(SDVersion version = VERSION_1_x)
+    ControlNetBlock(SDVersion version = VERSION_SD1)
        : version(version) {
-        if (version == VERSION_2_x) {
+        if (version == VERSION_SD2) {
            context_dim       = 1024;
            num_head_channels = 64;
            num_heads         = -1;
-        } else if (version == VERSION_XL) {
+        } else if (version == VERSION_SDXL) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
            channel_mult          = {1, 2, 4};
@ -58,7 +58,7 @@ public:
        // time_embed_1 is nn.SiLU()
        blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

-        if (version == VERSION_XL || version == VERSION_SVD) {
+        if (version == VERSION_SDXL || version == VERSION_SVD) {
            blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
            // label_emb_1 is nn.SiLU()
            blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
@ -306,8 +306,8 @@ public:
    }
 };

-struct ControlNet : public GGMLModule {
-    SDVersion version = VERSION_1_x;
+struct ControlNet : public GGMLRunner {
+    SDVersion version = VERSION_SD1;
    ControlNetBlock control_net;

    ggml_backend_buffer_t control_buffer = NULL;  // keep control output tensors in backend memory
@ -318,8 +318,8 @@ struct ControlNet : public GGMLModule {

    ControlNet(ggml_backend_t backend,
               ggml_type wtype,
-               SDVersion version = VERSION_1_x)
-        : GGMLModule(backend, wtype), control_net(version) {
+               SDVersion version = VERSION_SD1)
+        : GGMLRunner(backend, wtype), control_net(version) {
        control_net.init(params_ctx, wtype);
    }

@ -369,14 +369,6 @@ struct ControlNet : public GGMLModule {
        return "control_net";
    }

-    size_t get_params_mem_size() {
-        return control_net.get_params_mem_size();
-    }
-
-    size_t get_params_num() {
-        return control_net.get_params_num();
-    }
-
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        control_net.get_param_tensors(tensors, prefix);
    }
@ -434,7 +426,7 @@ struct ControlNet : public GGMLModule {
            return build_graph(x, hint, timesteps, context, y);
        };

-        GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
+        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
        guided_hint_cached = true;
    }

--- a/otherarch/sdcpp/denoiser.hpp
+++ b/otherarch/sdcpp/denoiser.hpp
--- a/otherarch/sdcpp/diffusion_model.hpp
+++ b/otherarch/sdcpp/diffusion_model.hpp
@ -0,0 +1,176 @@
+#ifndef __DIFFUSION_MODEL_H__
+#define __DIFFUSION_MODEL_H__
+
+#include "flux.hpp"
+#include "mmdit.hpp"
+#include "unet.hpp"
+
+// Abstract interface shared by the diffusion backbones declared in this header.
+// Concrete adapters wrap a runner object and forward each call to it; an adapter
+// may ignore arguments of compute() that its runner does not take.
+struct DiffusionModel {
+    // Virtual destructor so that destroying a derived adapter through a
+    // DiffusionModel pointer is well defined.
+    virtual ~DiffusionModel() = default;
+
+    // Runs one evaluation of the wrapped model.
+    // The result is delivered through `output` / `output_ctx`, which are handed
+    // unchanged to the wrapped runner's compute().
+    virtual void compute(int n_threads,
+                         struct ggml_tensor* x,
+                         struct ggml_tensor* timesteps,
+                         struct ggml_tensor* context,
+                         struct ggml_tensor* c_concat,
+                         struct ggml_tensor* y,
+                         struct ggml_tensor* guidance,
+                         int num_video_frames                      = -1,
+                         std::vector<struct ggml_tensor*> controls = {},
+                         float control_strength                    = 0.f,
+                         struct ggml_tensor** output               = NULL,
+                         struct ggml_context* output_ctx           = NULL)                        = 0;
+    // Buffer management, forwarded to the wrapped runner.
+    virtual void alloc_params_buffer()                                                  = 0;
+    virtual void free_params_buffer()                                                   = 0;
+    virtual void free_compute_buffer()                                                  = 0;
+    // Adds the model's parameter tensors to `tensors`.
+    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
+    virtual size_t get_params_buffer_size()                                             = 0;
+    // Channel count reported for the `y` conditioning input.
+    // NOTE(review): meaning inferred from the name "adm_in_channels" — confirm against callers.
+    virtual int64_t get_adm_in_channels()                                               = 0;
+};
+
+// DiffusionModel adapter that forwards every call to a UNetModelRunner.
+struct UNetModel : public DiffusionModel {
+    UNetModelRunner unet;
+
+    UNetModel(ggml_backend_t backend,
+              ggml_type wtype,
+              SDVersion version = VERSION_SD1)
+        : unet(backend, wtype, version) {
+    }
+
+    void alloc_params_buffer() override {
+        unet.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        unet.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        unet.free_compute_buffer();
+    }
+
+    // Collects the runner's tensors under the "model.diffusion_model" prefix.
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+        unet.get_param_tensors(tensors, "model.diffusion_model");
+    }
+
+    size_t get_params_buffer_size() override {
+        return unet.get_params_buffer_size();
+    }
+
+    // Reads the value straight from the wrapped UNet block.
+    int64_t get_adm_in_channels() override {
+        return unet.unet.adm_in_channels;
+    }
+
+    // Forwards to the runner. `guidance` is accepted only to satisfy the
+    // DiffusionModel interface and is not passed on.
+    void compute(int n_threads,
+                 struct ggml_tensor* x,
+                 struct ggml_tensor* timesteps,
+                 struct ggml_tensor* context,
+                 struct ggml_tensor* c_concat,
+                 struct ggml_tensor* y,
+                 struct ggml_tensor* guidance,
+                 int num_video_frames                      = -1,
+                 std::vector<struct ggml_tensor*> controls = {},
+                 float control_strength                    = 0.f,
+                 struct ggml_tensor** output               = NULL,
+                 struct ggml_context* output_ctx           = NULL) override {
+        unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
+    }
+};
+
+// DiffusionModel adapter that forwards every call to an MMDiTRunner.
+struct MMDiTModel : public DiffusionModel {
+    MMDiTRunner mmdit;
+
+    MMDiTModel(ggml_backend_t backend,
+               ggml_type wtype,
+               SDVersion version = VERSION_SD3_2B)
+        : mmdit(backend, wtype, version) {
+    }
+
+    void alloc_params_buffer() override {
+        mmdit.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        mmdit.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        mmdit.free_compute_buffer();
+    }
+
+    // Collects the runner's tensors under the "model.diffusion_model" prefix.
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+        mmdit.get_param_tensors(tensors, "model.diffusion_model");
+    }
+
+    size_t get_params_buffer_size() override {
+        return mmdit.get_params_buffer_size();
+    }
+
+    // Fixed value 768 + 1280.
+    // NOTE(review): presumably the widths of two pooled text-encoder outputs — confirm.
+    int64_t get_adm_in_channels() override {
+        return 768 + 1280;
+    }
+
+    // Forwards to the runner. `c_concat`, `guidance`, `num_video_frames`,
+    // `controls` and `control_strength` are accepted only to satisfy the
+    // DiffusionModel interface and are not passed on.
+    void compute(int n_threads,
+                 struct ggml_tensor* x,
+                 struct ggml_tensor* timesteps,
+                 struct ggml_tensor* context,
+                 struct ggml_tensor* c_concat,
+                 struct ggml_tensor* y,
+                 struct ggml_tensor* guidance,
+                 int num_video_frames                      = -1,
+                 std::vector<struct ggml_tensor*> controls = {},
+                 float control_strength                    = 0.f,
+                 struct ggml_tensor** output               = NULL,
+                 struct ggml_context* output_ctx           = NULL) override {
+        mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx);
+    }
+};
+
+// DiffusionModel adapter that forwards every call to a Flux::FluxRunner.
+struct FluxModel : public DiffusionModel {
+    Flux::FluxRunner flux;
+
+    FluxModel(ggml_backend_t backend,
+              ggml_type wtype,
+              SDVersion version = VERSION_FLUX_DEV)
+        : flux(backend, wtype, version) {
+    }
+
+    void alloc_params_buffer() override {
+        flux.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        flux.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        flux.free_compute_buffer();
+    }
+
+    // Collects the runner's tensors under the "model.diffusion_model" prefix.
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+        flux.get_param_tensors(tensors, "model.diffusion_model");
+    }
+
+    size_t get_params_buffer_size() override {
+        return flux.get_params_buffer_size();
+    }
+
+    // Fixed value 768.
+    // NOTE(review): presumably the width of a pooled text-encoder output — confirm.
+    int64_t get_adm_in_channels() override {
+        return 768;
+    }
+
+    // Forwards to the runner. `c_concat`, `num_video_frames`, `controls` and
+    // `control_strength` are accepted only to satisfy the DiffusionModel
+    // interface and are not passed on.
+    void compute(int n_threads,
+                 struct ggml_tensor* x,
+                 struct ggml_tensor* timesteps,
+                 struct ggml_tensor* context,
+                 struct ggml_tensor* c_concat,
+                 struct ggml_tensor* y,
+                 struct ggml_tensor* guidance,
+                 int num_video_frames                      = -1,
+                 std::vector<struct ggml_tensor*> controls = {},
+                 float control_strength                    = 0.f,
+                 struct ggml_tensor** output               = NULL,
+                 struct ggml_context* output_ctx           = NULL) override {
+        flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx);
+    }
+};
+
+#endif
--- a/otherarch/sdcpp/esrgan.hpp
+++ b/otherarch/sdcpp/esrgan.hpp
@ -137,14 +137,14 @@ public:
    }
 };

-struct ESRGAN : public GGMLModule {
+struct ESRGAN : public GGMLRunner {
    RRDBNet rrdb_net;
    int scale     = 4;
    int tile_size = 128;  // avoid cuda OOM for 4gb VRAM

    ESRGAN(ggml_backend_t backend,
           ggml_type wtype)
-        : GGMLModule(backend, wtype) {
+        : GGMLRunner(backend, wtype) {
        rrdb_net.init(params_ctx, wtype);
    }

@ -152,14 +152,6 @@ struct ESRGAN : public GGMLModule {
        return "esrgan";
    }

-    size_t get_params_mem_size() {
-        return rrdb_net.get_params_mem_size();
-    }
-
-    size_t get_params_num() {
-        return rrdb_net.get_params_num();
-    }
-
    bool load_from_file(const std::string& file_path) {
        LOG_INFO("loading esrgan from '%s'", file_path.c_str());

@ -199,7 +191,7 @@ struct ESRGAN : public GGMLModule {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
-        GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
+        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }
 };

--- a/otherarch/sdcpp/flux.hpp
+++ b/otherarch/sdcpp/flux.hpp
@ -0,0 +1,961 @@
+#ifndef __FLUX_HPP__
+#define __FLUX_HPP__
+
+#include <vector>
+
+#include "ggml_extend.hpp"
+#include "model.h"
+
+#define FLUX_GRAPH_SIZE 10240
+
+namespace Flux {
+
+    // Small MLP: "in_layer" (in_dim -> hidden_dim), SiLU, "out_layer" (hidden_dim -> hidden_dim).
+    struct MLPEmbedder : public UnaryBlock {
+    public:
+        MLPEmbedder(int64_t in_dim, int64_t hidden_dim) {
+            blocks["in_layer"]  = std::shared_ptr<GGMLBlock>(new Linear(in_dim, hidden_dim, true));
+            blocks["out_layer"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, hidden_dim, true));
+        }
+
+        // x:      [..., in_dim]
+        // return: [..., hidden_dim]
+        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+            auto first_proj  = std::dynamic_pointer_cast<Linear>(blocks["in_layer"]);
+            auto second_proj = std::dynamic_pointer_cast<Linear>(blocks["out_layer"]);
+
+            struct ggml_tensor* hidden = first_proj->forward(ctx, x);
+            // the activation is applied in place on the first projection's output
+            hidden = ggml_silu_inplace(ctx, hidden);
+            return second_proj->forward(ctx, hidden);
+        }
+    };
+
+    // RMS normalisation followed by an element-wise multiply with the block's "scale" parameter.
+    class RMSNorm : public UnaryBlock {
+    protected:
+        int64_t hidden_size;
+        float eps;
+
+        // Creates "scale" as an F32 vector of hidden_size elements; `wtype` is not consulted.
+        void init_params(struct ggml_context* ctx, ggml_type wtype) {
+            params["scale"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+        }
+
+    public:
+        RMSNorm(int64_t hidden_size,
+                float eps = 1e-06f)
+            : hidden_size(hidden_size),
+              eps(eps) {}
+
+        // Returns rms_norm(x, eps) * scale.
+        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+            struct ggml_tensor* scale  = params["scale"];
+            struct ggml_tensor* normed = ggml_rms_norm(ctx, x, eps);
+            return ggml_mul(ctx, normed, scale);
+        }
+    };
+
+    // Holds two independent RMSNorm blocks, one for queries and one for keys.
+    struct QKNorm : public GGMLBlock {
+    public:
+        QKNorm(int64_t dim) {
+            blocks["query_norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(dim));
+            blocks["key_norm"]   = std::shared_ptr<GGMLBlock>(new RMSNorm(dim));
+        }
+
+        // x:      [..., dim]
+        // return: [..., dim]
+        struct ggml_tensor* query_norm(struct ggml_context* ctx, struct ggml_tensor* x) {
+            return run_norm(ctx, x, "query_norm");
+        }
+
+        // x:      [..., dim]
+        // return: [..., dim]
+        struct ggml_tensor* key_norm(struct ggml_context* ctx, struct ggml_tensor* x) {
+            return run_norm(ctx, x, "key_norm");
+        }
+
+    private:
+        // Looks up the RMSNorm registered under `block_name` and applies it to x.
+        struct ggml_tensor* run_norm(struct ggml_context* ctx, struct ggml_tensor* x, const char* block_name) {
+            auto rms = std::dynamic_pointer_cast<RMSNorm>(blocks[block_name]);
+            return rms->forward(ctx, x);
+        }
+    };
+
+    // Applies the rotary position embedding `pe` to `x`.
+    //
+    // Shape comments list dimensions outermost-first, i.e. in the reverse of
+    // ggml's ne[] order (ne[0] is the last entry of each bracket).
+    //
+    // Outline:
+    //   1. Move the head axis outward and view each d_head vector as d_head/2
+    //      pairs, then bring the size-2 pair axis to the outermost position.
+    //   2. Take the two slices along that axis (x_0, x_1) and repeat each one
+    //      to width 2 along the innermost axis.
+    //   3. Slice `pe` the same way (pe_0, pe_1) and combine:
+    //      x_out = x_0 * pe_0 + x_1 * pe_1.
+    //   4. Flatten back to [N*n_head, L, d_head].
+    __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx,
+                                                     struct ggml_tensor* x,
+                                                     struct ggml_tensor* pe) {
+        // x: [N, L, n_head, d_head]
+        // pe: [L, d_head/2, 2, 2]
+        int64_t d_head = x->ne[0];
+        int64_t n_head = x->ne[1];
+        int64_t L      = x->ne[2];
+        int64_t N      = x->ne[3];
+        x              = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));       // [N, n_head, L, d_head]
+        x              = ggml_reshape_4d(ctx, x, 2, d_head / 2, L, n_head * N);  // [N * n_head, L, d_head/2, 2]
+        x              = ggml_cont(ctx, ggml_permute(ctx, x, 3, 0, 1, 2));       // [2, N * n_head, L, d_head/2]
+
+        // byte size of one slice along the outermost (size-2) axis of the contiguous tensor
+        int64_t offset = x->nb[2] * x->ne[2];
+        auto x_0       = ggml_view_3d(ctx, x, x->ne[0], x->ne[1], x->ne[2], x->nb[1], x->nb[2], offset * 0);  // [N * n_head, L, d_head/2]
+        auto x_1       = ggml_view_3d(ctx, x, x->ne[0], x->ne[1], x->ne[2], x->nb[1], x->nb[2], offset * 1);  // [N * n_head, L, d_head/2]
+        x_0            = ggml_reshape_4d(ctx, x_0, 1, x_0->ne[0], x_0->ne[1], x_0->ne[2]);                    // [N * n_head, L, d_head/2, 1]
+        x_1            = ggml_reshape_4d(ctx, x_1, 1, x_1->ne[0], x_1->ne[1], x_1->ne[2]);                    // [N * n_head, L, d_head/2, 1]
+        // temp_x is used only as the target shape for the two ggml_repeat calls below
+        auto temp_x    = ggml_new_tensor_4d(ctx, x_0->type, 2, x_0->ne[1], x_0->ne[2], x_0->ne[3]);
+        x_0            = ggml_repeat(ctx, x_0, temp_x);  // [N * n_head, L, d_head/2, 2]
+        x_1            = ggml_repeat(ctx, x_1, temp_x);  // [N * n_head, L, d_head/2, 2]
+
+        // slice pe along its innermost size-2 axis after moving that axis outermost
+        pe        = ggml_cont(ctx, ggml_permute(ctx, pe, 3, 0, 1, 2));  // [2, L, d_head/2, 2]
+        offset    = pe->nb[2] * pe->ne[2];
+        auto pe_0 = ggml_view_3d(ctx, pe, pe->ne[0], pe->ne[1], pe->ne[2], pe->nb[1], pe->nb[2], offset * 0);  // [L, d_head/2, 2]
+        auto pe_1 = ggml_view_3d(ctx, pe, pe->ne[0], pe->ne[1], pe->ne[2], pe->nb[1], pe->nb[2], offset * 1);  // [L, d_head/2, 2]
+
+        // pe_0 / pe_1 carry no head/batch axis; the multiplies rely on ggml_mul broadcasting them over it
+        auto x_out = ggml_add_inplace(ctx, ggml_mul(ctx, x_0, pe_0), ggml_mul(ctx, x_1, pe_1));  // [N * n_head, L, d_head/2, 2]
+        x_out      = ggml_reshape_3d(ctx, x_out, d_head, L, n_head * N);                         // [N*n_head, L, d_head]
+        return x_out;
+    }
+
+    // Rotary-embedded attention: rotates q and k with `pe`, then runs ggml_nn_attention_ext.
+    //   q, k, v: [N, L, n_head, d_head]
+    //   pe:      [L, d_head/2, 2, 2]
+    //   return:  [N, L, n_head*d_head]
+    __STATIC_INLINE__ struct ggml_tensor* attention(struct ggml_context* ctx,
+                                                    struct ggml_tensor* q,
+                                                    struct ggml_tensor* k,
+                                                    struct ggml_tensor* v,
+                                                    struct ggml_tensor* pe) {
+        struct ggml_tensor* q_rot = apply_rope(ctx, q, pe);  // [N*n_head, L, d_head]
+        struct ggml_tensor* k_rot = apply_rope(ctx, k, pe);  // [N*n_head, L, d_head]
+
+        // the head count is read from v (ne[1]), which still has its 4-d layout
+        const int64_t n_head = v->ne[1];
+        return ggml_nn_attention_ext(ctx, q_rot, k_rot, v, n_head, NULL, false, true);  // [N, L, n_head*d_head]
+    }
+
+    // Self-attention block with a fused "qkv" projection, per-head q/k RMS norm ("norm")
+    // and an output projection ("proj").
+    struct SelfAttention : public GGMLBlock {
+    public:
+        int64_t num_heads;
+
+    public:
+        SelfAttention(int64_t dim,
+                      int64_t num_heads = 8,
+                      bool qkv_bias     = false)
+            : num_heads(num_heads) {
+            const int64_t per_head = dim / num_heads;
+            blocks["qkv"]          = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
+            blocks["norm"]         = std::shared_ptr<GGMLBlock>(new QKNorm(per_head));
+            blocks["proj"]         = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
+        }
+
+        // Projects x to q/k/v, splits each into heads and normalises q and k.
+        // Returns {q, k, v}, each reshaped to [N, n_token, n_head, d_head].
+        std::vector<struct ggml_tensor*> pre_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+            auto qkv_linear = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
+            auto qk_norm    = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);
+
+            auto fused = qkv_linear->forward(ctx, x);
+            auto parts = split_qkv(ctx, fused);
+
+            const int64_t head_dim = parts[0]->ne[0] / num_heads;
+            // splits the innermost axis of t into (num_heads, head_dim)
+            auto to_heads = [&](struct ggml_tensor* t) {
+                return ggml_reshape_4d(ctx, t, head_dim, num_heads, t->ne[1], t->ne[2]);
+            };
+
+            auto q = to_heads(parts[0]);
+            auto k = to_heads(parts[1]);
+            auto v = to_heads(parts[2]);
+            q      = qk_norm->query_norm(ctx, q);
+            k      = qk_norm->key_norm(ctx, k);
+            return {q, k, v};
+        }
+
+        // Applies the output projection; x and the result are [N, n_token, dim].
+        struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+            auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+            return out_proj->forward(ctx, x);
+        }
+
+        // x:      [N, n_token, dim]
+        // pe:     [n_token, d_head/2, 2, 2]
+        // return: [N, n_token, dim]
+        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* pe) {
+            auto heads    = pre_attention(ctx, x);                             // q,k,v: [N, n_token, n_head, d_head]
+            auto attended = attention(ctx, heads[0], heads[1], heads[2], pe);  // [N, n_token, dim]
+            return post_attention(ctx, attended);                              // [N, n_token, dim]
+        }
+    };
+
+    // Plain holder for one shift / scale / gate triple; every pointer defaults to null.
+    struct ModulationOut {
+        ggml_tensor* shift = nullptr;
+        ggml_tensor* scale = nullptr;
+        ggml_tensor* gate  = nullptr;
+
+        ModulationOut(ggml_tensor* shift = nullptr,
+                      ggml_tensor* scale = nullptr,
+                      ggml_tensor* gate  = nullptr)
+            : shift(shift),
+              scale(scale),
+              gate(gate) {}
+    };
+
+    // Turns a conditioning vector into one (is_double == false) or two
+    // (is_double == true) shift/scale/gate triples via SiLU + a "lin" projection.
+    struct Modulation : public GGMLBlock {
+    public:
+        bool is_double;
+        int multiplier;  // number of [N, dim] chunks produced: 6 when is_double, otherwise 3
+
+    public:
+        Modulation(int64_t dim, bool is_double)
+            : is_double(is_double), multiplier(is_double ? 6 : 3) {
+            blocks["lin"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * multiplier));
+        }
+
+        // vec:    [N, dim]
+        // return: two ModulationOut entries; the second holds null pointers when !is_double
+        std::vector<ModulationOut> forward(struct ggml_context* ctx, struct ggml_tensor* vec) {
+            auto lin = std::dynamic_pointer_cast<Linear>(blocks["lin"]);
+
+            auto activated = ggml_silu(ctx, vec);
+            auto projected = lin->forward(ctx, activated);  // [N, multiplier*dim]
+
+            auto m = ggml_reshape_3d(ctx, projected, vec->ne[0], multiplier, vec->ne[1]);  // [N, multiplier, dim]
+            m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));                     // [multiplier, N, dim]
+
+            // byte size of one [N, dim] chunk inside the contiguous tensor m
+            const int64_t chunk_bytes = m->nb[1] * m->ne[1];
+            // 2-d view on the index-th [N, dim] chunk of m
+            auto chunk = [&](int index) {
+                return ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], chunk_bytes * index);
+            };
+
+            auto shift_a = chunk(0);
+            auto scale_a = chunk(1);
+            auto gate_a  = chunk(2);
+            ModulationOut first(shift_a, scale_a, gate_a);
+
+            if (!is_double) {
+                return {first, ModulationOut()};
+            }
+
+            auto shift_b = chunk(3);
+            auto scale_b = chunk(4);
+            auto gate_b  = chunk(5);
+            return {first, ModulationOut(shift_b, scale_b, gate_b)};
+        }
+    };
+
+    // Computes x + x * scale + shift, with scale and shift given a singleton token axis.
+    //   x:     [N, L, C]
+    //   scale: [N, C]
+    //   shift: [N, C]
+    __STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx,
+                                                   struct ggml_tensor* x,
+                                                   struct ggml_tensor* shift,
+                                                   struct ggml_tensor* scale) {
+        // insert a size-1 axis so the [N, C] tensors line up with x as [N, 1, C]
+        struct ggml_tensor* scale_3d = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);
+        struct ggml_tensor* shift_3d = ggml_reshape_3d(ctx, shift, shift->ne[0], 1, shift->ne[1]);
+
+        struct ggml_tensor* scaled = ggml_add(ctx, x, ggml_mul(ctx, x, scale_3d));
+        return ggml_add(ctx, scaled, shift_3d);
+    }
+
+    // Transformer block that keeps separate weights for the image tokens and the text tokens.
+    // Each stream is modulated and projected to q/k/v on its own, one attention call runs over the
+    // concatenated (txt, img) sequence, and the result is split again so that every stream applies
+    // its own output projection and MLP.
+    struct DoubleStreamBlock : public GGMLBlock {
+    public:
+        // hidden_size: token width
+        // num_heads:   forwarded to SelfAttention
+        // mlp_ratio:   MLP inner width as a multiple of hidden_size
+        // qkv_bias:    forwarded to SelfAttention
+        DoubleStreamBlock(int64_t hidden_size,
+                          int64_t num_heads,
+                          float mlp_ratio,
+                          bool qkv_bias = false) {
+            int64_t mlp_hidden_dim = hidden_size * mlp_ratio;
+
+            // Both streams register the same set of sub-blocks; only the name prefix differs.
+            const std::string prefixes[] = {"img", "txt"};
+            for (const std::string& prefix : prefixes) {
+                blocks[prefix + "_mod"]   = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
+                blocks[prefix + "_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
+                blocks[prefix + "_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias));
+
+                blocks[prefix + "_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
+                blocks[prefix + "_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
+                // "<prefix>_mlp.1" is nn.GELU(approximate="tanh"); it is applied inline in forward()
+                blocks[prefix + "_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size));
+            }
+        }
+
+        // Runs the block and returns the updated (img, txt) pair.
+        std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+                                                                    struct ggml_tensor* img,
+                                                                    struct ggml_tensor* txt,
+                                                                    struct ggml_tensor* vec,
+                                                                    struct ggml_tensor* pe) {
+            // img: [N, n_img_token, hidden_size]
+            // txt: [N, n_txt_token, hidden_size]
+            // pe: [n_img_token + n_txt_token, d_head/2, 2, 2]
+            // return: ([N, n_img_token, hidden_size], [N, n_txt_token, hidden_size])
+
+            // image stream sub-blocks
+            auto i_mod   = std::dynamic_pointer_cast<Modulation>(blocks["img_mod"]);
+            auto i_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm1"]);
+            auto i_attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["img_attn"]);
+            auto i_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm2"]);
+            auto i_fc1   = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.0"]);
+            auto i_fc2   = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.2"]);
+
+            // text stream sub-blocks
+            auto t_mod   = std::dynamic_pointer_cast<Modulation>(blocks["txt_mod"]);
+            auto t_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm1"]);
+            auto t_attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["txt_attn"]);
+            auto t_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm2"]);
+            auto t_fc1   = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.0"]);
+            auto t_fc2   = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.2"]);
+
+            // Every stream gets two (shift, scale, gate) triples:
+            // entry 0 is used around attention, entry 1 around the MLP.
+            auto i_mods            = i_mod->forward(ctx, vec);
+            ModulationOut img_mod1 = i_mods[0];
+            ModulationOut img_mod2 = i_mods[1];
+            auto t_mods            = t_mod->forward(ctx, vec);
+            ModulationOut txt_mod1 = t_mods[0];
+            ModulationOut txt_mod2 = t_mods[1];
+
+            // image q/k/v: [N, n_img_token, n_head, d_head]
+            auto img_normed = i_norm1->forward(ctx, img);
+            img_normed      = Flux::modulate(ctx, img_normed, img_mod1.shift, img_mod1.scale);
+            auto img_qkv    = i_attn->pre_attention(ctx, img_normed);
+
+            // text q/k/v: [N, n_txt_token, n_head, d_head]
+            auto txt_normed = t_norm1->forward(ctx, txt);
+            txt_normed      = Flux::modulate(ctx, txt_normed, txt_mod1.shift, txt_mod1.scale);
+            auto txt_qkv    = t_attn->pre_attention(ctx, txt_normed);
+
+            // joint attention over the sequence "text tokens followed by image tokens"
+            auto q = ggml_concat(ctx, txt_qkv[0], img_qkv[0], 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto k = ggml_concat(ctx, txt_qkv[1], img_qkv[1], 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto v = ggml_concat(ctx, txt_qkv[2], img_qkv[2], 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+
+            auto attn = attention(ctx, q, k, v, pe);                          // [N, n_txt_token + n_img_token, n_head*d_head]
+            attn      = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
+
+            // Takes n_token tokens starting at token index `first` out of the token-major attention
+            // result and returns them contiguous as [N, n_token, hidden_size].
+            auto slice_tokens = [&](int64_t n_token, int64_t first) -> struct ggml_tensor* {
+                auto part = ggml_view_3d(ctx,
+                                         attn,
+                                         attn->ne[0],
+                                         attn->ne[1],
+                                         n_token,
+                                         attn->nb[1],
+                                         attn->nb[2],
+                                         attn->nb[2] * first);  // [n_token, N, hidden_size]
+                return ggml_cont(ctx, ggml_permute(ctx, part, 0, 2, 1, 3));
+            };
+            auto txt_attn_out = slice_tokens(txt->ne[1], 0);           // [N, n_txt_token, hidden_size]
+            auto img_attn_out = slice_tokens(img->ne[1], txt->ne[1]);  // [N, n_img_token, hidden_size]
+
+            // residual update: base + gate * branch
+            auto gated_add = [&](struct ggml_tensor* base, struct ggml_tensor* branch, struct ggml_tensor* gate) -> struct ggml_tensor* {
+                return ggml_add(ctx, base, ggml_mul(ctx, branch, gate));
+            };
+
+            // image stream: attention output projection, then MLP
+            img = gated_add(img, i_attn->post_attention(ctx, img_attn_out), img_mod1.gate);
+
+            auto img_hidden = Flux::modulate(ctx, i_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale);
+            img_hidden      = i_fc1->forward(ctx, img_hidden);
+            img_hidden      = ggml_gelu_inplace(ctx, img_hidden);
+            img_hidden      = i_fc2->forward(ctx, img_hidden);
+            img             = gated_add(img, img_hidden, img_mod2.gate);
+
+            // text stream: attention output projection, then MLP
+            txt = gated_add(txt, t_attn->post_attention(ctx, txt_attn_out), txt_mod1.gate);
+
+            auto txt_hidden = Flux::modulate(ctx, t_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale);
+            txt_hidden      = t_fc1->forward(ctx, txt_hidden);
+            txt_hidden      = ggml_gelu_inplace(ctx, txt_hidden);
+            txt_hidden      = t_fc2->forward(ctx, txt_hidden);
+            txt             = gated_add(txt, txt_hidden, txt_mod2.gate);
+
+            return {img, txt};
+        }
+    };
+
+    // Transformer block that processes one token sequence with a single set of weights.
+    // "linear1" produces q/k/v and the MLP hidden activations in one projection; "linear2" consumes
+    // the attention result concatenated with the GELU-activated MLP branch.
+    struct SingleStreamBlock : public GGMLBlock {
+    public:
+        int64_t num_heads;
+        int64_t hidden_size;
+        int64_t mlp_hidden_dim;
+
+    public:
+        // hidden_size: token width
+        // num_heads:   number of attention heads; head_dim = hidden_size / num_heads
+        // mlp_ratio:   width of the MLP branch as a multiple of hidden_size
+        // qk_scale:    not used by this block; kept so existing call sites keep compiling
+        SingleStreamBlock(int64_t hidden_size,
+                          int64_t num_heads,
+                          float mlp_ratio = 4.0f,
+                          float qk_scale  = 0.f)
+            // Members are initialized in declaration order, so the list follows that order.
+            : num_heads(num_heads),
+              hidden_size(hidden_size),
+              mlp_hidden_dim(static_cast<int64_t>(hidden_size * mlp_ratio)) {
+            (void)qk_scale;
+            const int64_t head_dim = hidden_size / num_heads;
+
+            blocks["linear1"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size * 3 + mlp_hidden_dim));
+            blocks["linear2"]  = std::shared_ptr<GGMLBlock>(new Linear(hidden_size + mlp_hidden_dim, hidden_size));
+            blocks["norm"]     = std::shared_ptr<GGMLBlock>(new QKNorm(head_dim));
+            blocks["pre_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
+            // mlp_act is nn.GELU(approximate="tanh"); it is applied inline in forward()
+            blocks["modulation"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, false));
+        }
+
+        // Runs the block on x and returns x + gate * linear2(concat(attention, gelu(mlp))).
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* vec,
+                                    struct ggml_tensor* pe) {
+            // x: [N, n_token, hidden_size]
+            // pe: [n_token, d_head/2, 2, 2]
+            // return: [N, n_token, hidden_size]
+
+            auto linear1    = std::dynamic_pointer_cast<Linear>(blocks["linear1"]);
+            auto linear2    = std::dynamic_pointer_cast<Linear>(blocks["linear2"]);
+            auto norm       = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);
+            auto pre_norm   = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_norm"]);
+            auto modulation = std::dynamic_pointer_cast<Modulation>(blocks["modulation"]);
+
+            // Only the first (shift, scale, gate) triple is used by this block.
+            auto mods         = modulation->forward(ctx, vec);
+            ModulationOut mod = mods[0];
+
+            auto x_mod   = Flux::modulate(ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
+            auto qkv_mlp = linear1->forward(ctx, x_mod);                            // [N, n_token, hidden_size * 3 + mlp_hidden_dim]
+            // Move the feature axis outermost so the qkv / mlp parts become contiguous slabs.
+            qkv_mlp      = ggml_cont(ctx, ggml_permute(ctx, qkv_mlp, 2, 0, 1, 3));  // [hidden_size * 3 + mlp_hidden_dim, N, n_token]
+
+            // First hidden_size * 3 features: fused q/k/v.
+            auto qkv = ggml_view_3d(ctx,
+                                    qkv_mlp,
+                                    qkv_mlp->ne[0],
+                                    qkv_mlp->ne[1],
+                                    hidden_size * 3,
+                                    qkv_mlp->nb[1],
+                                    qkv_mlp->nb[2],
+                                    0);                                     // [hidden_size * 3 , N, n_token]
+            qkv      = ggml_cont(ctx, ggml_permute(ctx, qkv, 1, 2, 0, 3));  // [N, n_token, hidden_size * 3]
+            // Remaining mlp_hidden_dim features: MLP branch input.
+            auto mlp = ggml_view_3d(ctx,
+                                    qkv_mlp,
+                                    qkv_mlp->ne[0],
+                                    qkv_mlp->ne[1],
+                                    mlp_hidden_dim,
+                                    qkv_mlp->nb[1],
+                                    qkv_mlp->nb[2],
+                                    qkv_mlp->nb[2] * hidden_size * 3);      // [mlp_hidden_dim , N, n_token]
+            mlp      = ggml_cont(ctx, ggml_permute(ctx, mlp, 1, 2, 0, 3));  // [N, n_token, mlp_hidden_dim]
+
+            auto qkv_vec     = split_qkv(ctx, qkv);  // q,k,v: [N, n_token, hidden_size]
+            int64_t head_dim = hidden_size / num_heads;
+            auto q           = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);  // [N, n_token, n_head, d_head]
+            auto k           = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
+            auto v           = ggml_reshape_4d(ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);  // [N, n_token, n_head, d_head]
+            q                = norm->query_norm(ctx, q);
+            k                = norm->key_norm(ctx, k);
+            auto attn        = attention(ctx, q, k, v, pe);  // [N, n_token, hidden_size]
+
+            auto attn_mlp = ggml_concat(ctx, attn, ggml_gelu_inplace(ctx, mlp), 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
+            auto output   = linear2->forward(ctx, attn_mlp);                         // [N, n_token, hidden_size]
+
+            output = ggml_add(ctx, x, ggml_mul(ctx, output, mod.gate));
+            return output;
+        }
+    };
+
+    // Output head: modulated LayerNorm followed by a linear projection to
+    // patch_size * patch_size * out_channels values per token.
+    struct LastLayer : public GGMLBlock {
+    public:
+        LastLayer(int64_t hidden_size,
+                  int64_t patch_size,
+                  int64_t out_channels) {
+            const int64_t out_dim = patch_size * patch_size * out_channels;
+
+            blocks["norm_final"]         = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
+            blocks["linear"]             = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, out_dim));
+            blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
+        }
+
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* c) {
+            // x: [N, n_token, hidden_size]
+            // c: [N, hidden_size]
+            // return: [N, n_token, patch_size * patch_size * out_channels]
+            auto mod_proj = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
+            auto norm     = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
+            auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+
+            // Project silu(c) to a (shift, scale) pair and lay the two halves out back to back.
+            auto shift_scale = mod_proj->forward(ctx, ggml_silu(ctx, c));                  // [N, 2 * hidden_size]
+            shift_scale      = ggml_reshape_3d(ctx, shift_scale, c->ne[0], 2, c->ne[1]);   // [N, 2, hidden_size]
+            shift_scale      = ggml_cont(ctx, ggml_permute(ctx, shift_scale, 0, 2, 1, 3));  // [2, N, hidden_size]
+
+            // byte size of one [N, hidden_size] half
+            const int64_t half_bytes = shift_scale->nb[1] * shift_scale->ne[1];
+            auto shift = ggml_view_2d(ctx, shift_scale, shift_scale->ne[0], shift_scale->ne[1], shift_scale->nb[1], 0);           // [N, hidden_size]
+            auto scale = ggml_view_2d(ctx, shift_scale, shift_scale->ne[0], shift_scale->ne[1], shift_scale->nb[1], half_bytes);  // [N, hidden_size]
+
+            auto normed = norm->forward(ctx, x);
+            return out_proj->forward(ctx, Flux::modulate(ctx, normed, shift, scale));
+        }
+    };
+
+    // Hyper-parameters of the Flux network together with their default values.
+    struct FluxParams {
+        int64_t in_channels{64};             // input width of "img_in"; also used as the output channel count
+        int64_t vec_in_dim{768};             // input width of "vector_in"
+        int64_t context_in_dim{4096};        // input width of "txt_in"
+        int64_t hidden_size{3072};           // token width used by every block
+        float mlp_ratio{4.0f};               // MLP inner width as a multiple of hidden_size
+        int64_t num_heads{24};               // attention heads per block
+        int64_t depth{19};                   // number of "double_blocks.<i>"
+        int64_t depth_single_blocks{38};     // number of "single_blocks.<i>"
+        std::vector<int> axes_dim{16, 56, 56};  // per-axis rope dimensions handed to gen_pe
+        int64_t axes_dim_sum{128};           // sum of axes_dim; used to size the pe tensor
+        int theta{10000};                    // rope base handed to gen_pe
+        bool qkv_bias{true};                 // forwarded to DoubleStreamBlock
+        bool guidance_embed{true};           // whether "guidance_in" exists and guidance is consumed
+    };
+
+    struct Flux : public GGMLBlock {
+    public:
+        // Returns `num` evenly spaced values from `start` to `end`, both inclusive.
+        // num <= 0 yields an empty vector and num == 1 yields {start}. Without these guards the
+        // step is a division by zero (which turns the single entry into NaN) and a negative
+        // num is converted to a huge vector size.
+        std::vector<float> linspace(float start, float end, int num) {
+            if (num <= 0) {
+                return {};
+            }
+            std::vector<float> result(num);
+            if (num == 1) {
+                result[0] = start;
+                return result;
+            }
+            float step = (end - start) / (num - 1);
+            for (int i = 0; i < num; ++i) {
+                result[i] = start + i * step;
+            }
+            return result;
+        }
+
+        // Returns the transpose of a row-major matrix stored as a vector of rows.
+        // An empty matrix yields an empty result (indexing mat[0] would be undefined behaviour).
+        // Every row is assumed to have at least as many entries as the first row.
+        std::vector<std::vector<float>> transpose(const std::vector<std::vector<float>>& mat) {
+            if (mat.empty()) {
+                return {};
+            }
+            const size_t rows = mat.size();
+            const size_t cols = mat[0].size();
+            std::vector<std::vector<float>> transposed(cols, std::vector<float>(rows));
+            for (size_t i = 0; i < rows; ++i) {
+                for (size_t j = 0; j < cols; ++j) {
+                    transposed[j][i] = mat[i][j];
+                }
+            }
+            return transposed;
+        }
+
+        // Concatenates all rows of `vec`, in order, into one flat vector.
+        std::vector<float> flatten(const std::vector<std::vector<float>>& vec) {
+            std::vector<float> flat;
+            for (size_t r = 0; r < vec.size(); ++r) {
+                const std::vector<float>& row = vec[r];
+                for (size_t c = 0; c < row.size(); ++c) {
+                    flat.push_back(row[c]);
+                }
+            }
+            return flat;
+        }
+
+        // Builds the rotary table for one axis.
+        // For every position p and every frequency index k < dim / 2 the row holds the four values
+        // [cos(a), -sin(a), sin(a), cos(a)] with a = pos[p] / theta^(2k / dim).
+        // Returns pos.size() rows of dim / 2 * 4 floats; dim must be even.
+        std::vector<std::vector<float>> rope(const std::vector<float>& pos, int dim, int theta) {
+            assert(dim % 2 == 0);
+            const int half_dim = dim / 2;
+
+            // exponents 0, 2/dim, ..., (dim - 2)/dim
+            const std::vector<float> exponent = linspace(0, (dim * 1.0f - 2) / dim, half_dim);
+
+            std::vector<float> omega(half_dim);
+            for (int k = 0; k < half_dim; ++k) {
+                omega[k] = 1.0 / std::pow(theta, exponent[k]);
+            }
+
+            const size_t n_pos = pos.size();
+            std::vector<std::vector<float>> result(n_pos, std::vector<float>(half_dim * 4));
+            for (size_t p = 0; p < n_pos; ++p) {
+                std::vector<float>& row = result[p];
+                for (int k = 0; k < half_dim; ++k) {
+                    const float angle = pos[p] * omega[k];
+                    const float c     = std::cos(angle);
+                    const float s     = std::sin(angle);
+
+                    // 2x2 rotation matrix, row by row
+                    row[4 * k]     = c;
+                    row[4 * k + 1] = -s;
+                    row[4 * k + 2] = s;
+                    row[4 * k + 3] = c;
+                }
+            }
+
+            return result;
+        }
+
+        // Generates the 3-component position ids for every token of every batch entry.
+        // Per batch entry the rows are: context_len text ids, all (0, 0, 0), followed by one id per
+        // image patch in row-major order, (0, patch_row, patch_col).
+        // The patch grid is h_len x w_len with h_len = (h + patch_size / 2) / patch_size (same for w).
+        // Returns bs * (context_len + h_len * w_len) rows.
+        // Row/column ids are written directly as float(i) / float(j), so a grid with a single
+        // row or column gets id 0 instead of a NaN.
+        // h, w, bs and context_len are assumed to be non-negative and patch_size positive.
+        std::vector<std::vector<float>> gen_ids(int h, int w, int patch_size, int bs, int context_len) {
+            const int h_len = (h + (patch_size / 2)) / patch_size;
+            const int w_len = (w + (patch_size / 2)) / patch_size;
+
+            const size_t n_txt        = static_cast<size_t>(context_len);
+            const size_t n_img        = static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
+            const size_t n_per_sample = n_txt + n_img;
+
+            // Everything starts as (0, 0, 0); text ids stay that way.
+            std::vector<std::vector<float>> ids(bs * n_per_sample, std::vector<float>(3, 0.0f));
+            for (int b = 0; b < bs; ++b) {
+                const size_t img_base = static_cast<size_t>(b) * n_per_sample + n_txt;
+                for (int i = 0; i < h_len; ++i) {
+                    for (int j = 0; j < w_len; ++j) {
+                        std::vector<float>& id = ids[img_base + static_cast<size_t>(i) * w_len + j];
+                        id[1]                  = static_cast<float>(i);
+                        id[2]                  = static_cast<float>(j);
+                    }
+                }
+            }
+
+            return ids;
+        }
+
+        // Generates the flattened positional-embedding table.
+        // For every id row produced by gen_ids the rope tables of all axes are written side by side,
+        // giving (sum of axes_dim[i] / 2) * 4 floats per row. Returns an empty vector when there
+        // are no ids. axes_dim must not have more entries than an id has components.
+        // NOTE(review): gen_ids already returns bs * tokens_per_sample rows, so for bs > 1 the table
+        // has bs * ids.size() rows. build_graph currently asserts x->ne[3] == 1 and passes that value
+        // as bs, which makes the two the same; confirm the intended layout before enabling batching.
+        std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, int theta, const std::vector<int>& axes_dim) {
+            std::vector<std::vector<float>> ids = gen_ids(h, w, patch_size, bs, context_len);
+            if (ids.empty()) {
+                // nothing to embed; also avoids indexing into empty tables below
+                return {};
+            }
+            std::vector<std::vector<float>> trans_ids = transpose(ids);
+            const size_t pos_len                      = ids.size();
+            const size_t num_axes                     = axes_dim.size();
+            assert(num_axes <= trans_ids.size());
+
+            int emb_dim = 0;
+            for (int d : axes_dim)
+                emb_dim += d / 2;
+
+            std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0));
+            size_t offset = 0;
+            for (size_t i = 0; i < num_axes; ++i) {
+                const std::vector<std::vector<float>> rope_emb = rope(trans_ids[i], axes_dim[i], theta);  // [pos_len, axes_dim[i]/2 * 2 * 2]
+                const size_t width                             = rope_emb[0].size();
+                for (int b = 0; b < bs; ++b) {
+                    for (size_t j = 0; j < pos_len; ++j) {
+                        for (size_t k = 0; k < width; ++k) {
+                            emb[b * pos_len + j][offset + k] = rope_emb[j][k];
+                        }
+                    }
+                }
+                offset += width;
+            }
+
+            return flatten(emb);
+        }
+
+    public:
+        FluxParams params;
+        // Creates a model with default params and no registered sub-blocks.
+        Flux() {}
+
+        // Registers every sub-block of the network according to `params`:
+        // the input embedders, params.depth double-stream blocks, params.depth_single_blocks
+        // single-stream blocks and the final layer. "guidance_in" only exists when
+        // params.guidance_embed is set.
+        Flux(FluxParams params)
+            : params(params) {
+            const int64_t out_channels = params.in_channels;
+
+            blocks["img_in"]    = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
+            blocks["time_in"]   = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
+            blocks["vector_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(params.vec_in_dim, params.hidden_size));
+            if (params.guidance_embed) {
+                blocks["guidance_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
+            }
+            blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size, true));
+
+            for (int64_t i = 0; i < params.depth; i++) {
+                blocks["double_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new DoubleStreamBlock(params.hidden_size,
+                                                                                                                params.num_heads,
+                                                                                                                params.mlp_ratio,
+                                                                                                                params.qkv_bias));
+            }
+
+            for (int64_t i = 0; i < params.depth_single_blocks; i++) {
+                blocks["single_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new SingleStreamBlock(params.hidden_size,
+                                                                                                                params.num_heads,
+                                                                                                                params.mlp_ratio));
+            }
+
+            // patch_size is 1 here: the tokens handed to the final layer are already patchified
+            blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new LastLayer(params.hidden_size, 1, out_channels));
+        }
+
+        // Cuts the image into non-overlapping patch_size x patch_size patches and turns every
+        // patch into one token. H and W must be multiples of patch_size.
+        struct ggml_tensor* patchify(struct ggml_context* ctx,
+                                     struct ggml_tensor* x,
+                                     int64_t patch_size) {
+            // x: [N, C, H, W]
+            // return: [N, h*w, C * patch_size * patch_size]
+            const int64_t p = patch_size;
+            const int64_t W = x->ne[0];
+            const int64_t H = x->ne[1];
+            const int64_t C = x->ne[2];
+            const int64_t N = x->ne[3];
+            const int64_t w = W / p;
+            const int64_t h = H / p;
+
+            GGML_ASSERT(h * p == H && w * p == W);
+
+            // split H into (h, p) and W into (w, p), then bring the two p axes next to each other
+            auto t = ggml_reshape_4d(ctx, x, p, w, p, h * C * N);  // [N*C*h, p, w, p]
+            t      = ggml_permute(ctx, t, 0, 2, 1, 3);
+            t      = ggml_cont(ctx, t);                            // [N*C*h, w, p, p]
+
+            // merge (h, w) into the token axis and move the channels behind it
+            t = ggml_reshape_4d(ctx, t, p * p, w * h, C, N);  // [N, C, h*w, p*p]
+            t = ggml_permute(ctx, t, 0, 2, 1, 3);
+            t = ggml_cont(ctx, t);                            // [N, h*w, C, p*p]
+
+            return ggml_reshape_3d(ctx, t, p * p * C, w * h, N);  // [N, h*w, C*p*p]
+        }
+
+        // Inverse of patchify: rebuilds an image from h*w tokens of C * patch_size * patch_size values.
+        struct ggml_tensor* unpatchify(struct ggml_context* ctx,
+                                       struct ggml_tensor* x,
+                                       int64_t h,
+                                       int64_t w,
+                                       int64_t patch_size) {
+            // x: [N, h*w, C*patch_size*patch_size]
+            // return: [N, C, H, W]
+            const int64_t p = patch_size;
+            const int64_t N = x->ne[2];
+            const int64_t C = x->ne[0] / patch_size / patch_size;
+            const int64_t H = h * patch_size;
+            const int64_t W = w * patch_size;
+
+            GGML_ASSERT(C * p * p == x->ne[0]);
+
+            // separate the channels from the in-patch pixels and move them in front of the tokens
+            auto t = ggml_reshape_4d(ctx, x, p * p, C, w * h, N);  // [N, h*w, C, p*p]
+            t      = ggml_permute(ctx, t, 0, 2, 1, 3);
+            t      = ggml_cont(ctx, t);                            // [N, C, h*w, p*p]
+
+            // interleave patch rows with patch columns again
+            t = ggml_reshape_4d(ctx, t, p, p, w, h * C * N);  // [N*C*h, w, p, p]
+            t = ggml_permute(ctx, t, 0, 2, 1, 3);
+            t = ggml_cont(ctx, t);                            // [N*C*h, p, w, p]
+
+            return ggml_reshape_4d(ctx, t, W, H, C, N);  // [N, C, h*p, w*p]
+        }
+
+        // Runs the network on already patchified image tokens.
+        // The conditioning vector is time_in(timestep embedding) [+ guidance_in(guidance embedding)]
+        // + vector_in(y). Image and text tokens first pass the double-stream blocks separately, are
+        // then concatenated as (txt, img) for the single-stream blocks, and finally only the image
+        // tokens are handed to the final layer.
+        struct ggml_tensor* forward_orig(struct ggml_context* ctx,
+                                         struct ggml_tensor* img,
+                                         struct ggml_tensor* txt,
+                                         struct ggml_tensor* timesteps,
+                                         struct ggml_tensor* y,
+                                         struct ggml_tensor* guidance,
+                                         struct ggml_tensor* pe) {
+            auto img_proj   = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
+            auto t_embedder = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
+            auto y_embedder = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
+            auto txt_proj   = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
+            auto head       = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);
+
+            img = img_proj->forward(ctx, img);
+
+            auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f);
+            auto vec   = t_embedder->forward(ctx, t_emb);
+
+            if (params.guidance_embed) {
+                GGML_ASSERT(guidance != NULL);
+                auto g_embedder = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
+                // bf16 and fp16 result is different
+                auto g_emb = ggml_nn_timestep_embedding(ctx, guidance, 256, 10000, 1000.f);
+                vec        = ggml_add(ctx, vec, g_embedder->forward(ctx, g_emb));
+            }
+
+            vec = ggml_add(ctx, vec, y_embedder->forward(ctx, y));
+            txt = txt_proj->forward(ctx, txt);
+
+            for (int i = 0; i < params.depth; i++) {
+                const std::string key = "double_blocks." + std::to_string(i);
+                auto block            = std::dynamic_pointer_cast<DoubleStreamBlock>(blocks[key]);
+
+                auto streams = block->forward(ctx, img, txt, vec, pe);
+                img          = streams.first;   // [N, n_img_token, hidden_size]
+                txt          = streams.second;  // [N, n_txt_token, hidden_size]
+            }
+
+            auto tokens = ggml_concat(ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
+            for (int i = 0; i < params.depth_single_blocks; i++) {
+                const std::string key = "single_blocks." + std::to_string(i);
+                auto block            = std::dynamic_pointer_cast<SingleStreamBlock>(blocks[key]);
+
+                tokens = block->forward(ctx, tokens, vec, pe);
+            }
+
+            // Drop the leading text tokens: make the token axis outermost, view the image part, restore the layout.
+            tokens = ggml_cont(ctx, ggml_permute(ctx, tokens, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
+
+            const int64_t n_img_token = img->ne[1];
+            const int64_t n_txt_token = txt->ne[1];
+            img                       = ggml_view_3d(ctx,
+                                                     tokens,
+                                                     tokens->ne[0],
+                                                     tokens->ne[1],
+                                                     n_img_token,
+                                                     tokens->nb[1],
+                                                     tokens->nb[2],
+                                                     tokens->nb[2] * n_txt_token);  // [n_img_token, N, hidden_size]
+            img                       = ggml_cont(ctx, ggml_permute(ctx, img, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]
+
+            return head->forward(ctx, img, vec);  // (N, T, patch_size ** 2 * out_channels)
+        }
+
+        // Full forward pass: pads the input to a multiple of the patch size (2), patchifies it,
+        // runs forward_orig and folds the resulting tokens back into an image.
+        // Only a batch size of 1 is accepted.
+        struct ggml_tensor* forward(struct ggml_context* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* timestep,
+                                    struct ggml_tensor* context,
+                                    struct ggml_tensor* y,
+                                    struct ggml_tensor* guidance,
+                                    struct ggml_tensor* pe) {
+            // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+            // timestep: (N,) tensor of diffusion timesteps
+            // context: (N, L, D)
+            // y: (N, adm_in_channels) tensor of class labels
+            // guidance: (N,)
+            // pe: (L, d_head/2, 2, 2)
+            // return: (N, C, H + pad_h, W + pad_w)
+
+            GGML_ASSERT(x->ne[3] == 1);
+
+            const int64_t patch_size = 2;
+            const int64_t W          = x->ne[0];
+            const int64_t H          = x->ne[1];
+
+            // rows / columns needed to round H / W up to a multiple of patch_size
+            int pad_h = (patch_size - H % patch_size) % patch_size;
+            int pad_w = (patch_size - W % patch_size) % patch_size;
+            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+
+            const int64_t h_len = (H + pad_h) / patch_size;
+            const int64_t w_len = (W + pad_w) / patch_size;
+
+            // rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+            auto tokens = patchify(ctx, x, patch_size);  // [N, h*w, C * patch_size * patch_size]
+
+            auto out = forward_orig(ctx, tokens, context, timestep, y, guidance, pe);  // [N, h*w, C * patch_size * patch_size]
+
+            // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
+            return unpatchify(ctx, out, h_len, w_len, patch_size);  // [N, C, H + pad_h, W + pad_w]
+        }
+
+    struct FluxRunner : public GGMLRunner {
+    public:
+        FluxParams flux_params;
+        Flux flux;
+        std::vector<float> pe_vec;  // for cache
+
+        // Builds the Flux model for `version` and initializes it in params_ctx with weight type wtype.
+        // VERSION_FLUX_SCHNELL is built without the guidance embedder.
+        FluxRunner(ggml_backend_t backend,
+                   ggml_type wtype,
+                   SDVersion version = VERSION_FLUX_DEV)
+            : GGMLRunner(backend, wtype) {
+            const bool is_schnell = (version == VERSION_FLUX_SCHNELL);
+            if (is_schnell) {
+                flux_params.guidance_embed = false;
+            }
+
+            flux = Flux(flux_params);
+            flux.init(params_ctx, wtype);
+        }
+
+        // Short name identifying this runner.
+        std::string get_desc() {
+            return std::string("flux");
+        }
+
+        // Lets the wrapped Flux model add its parameter tensors to `tensors`, passing `prefix` through.
+        void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+            this->flux.get_param_tensors(tensors, prefix);
+        }
+
+        // Builds the compute graph for one forward pass (batch size 1 only).
+        // The positional-embedding table is regenerated into pe_vec, which is a member so that the
+        // buffer registered through set_backend_tensor_data outlives this call.
+        struct ggml_cgraph* build_graph(struct ggml_tensor* x,
+                                        struct ggml_tensor* timesteps,
+                                        struct ggml_tensor* context,
+                                        struct ggml_tensor* y,
+                                        struct ggml_tensor* guidance) {
+            GGML_ASSERT(x->ne[3] == 1);
+            struct ggml_cgraph* graph = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
+
+            x         = to_backend(x);
+            context   = to_backend(context);
+            y         = to_backend(y);
+            timesteps = to_backend(timesteps);
+            if (flux_params.guidance_embed) {
+                guidance = to_backend(guidance);
+            }
+
+            // gen_pe(h, w, patch_size, bs, context_len, theta, axes_dim)
+            pe_vec = flux.gen_pe(x->ne[1], x->ne[0], 2, x->ne[3], context->ne[1], flux_params.theta, flux_params.axes_dim);
+
+            // every position owns axes_dim_sum / 2 entries of 2 x 2 floats
+            int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
+            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
+            set_backend_tensor_data(pe, pe_vec.data());
+
+            struct ggml_tensor* result = flux.forward(compute_ctx, x, timesteps, context, y, guidance, pe);
+
+            ggml_build_forward_expand(graph, result);
+
+            return graph;
+        }
+
+        // Run one forward pass of the model.
+        //
+        // Expected input layouts:
+        //   x:         [N, in_channels, h, w]
+        //   timesteps: [N, ]
+        //   context:   [N, max_position, hidden_size]
+        //   y:         [N, adm_in_channels] or [1, adm_in_channels]
+        //   guidance:  [N, ]
+        // `output` and `output_ctx` are forwarded unchanged to GGMLRunner::compute.
+        void compute(int n_threads,
+                     struct ggml_tensor* x,
+                     struct ggml_tensor* timesteps,
+                     struct ggml_tensor* context,
+                     struct ggml_tensor* y,
+                     struct ggml_tensor* guidance,
+                     struct ggml_tensor** output     = NULL,
+                     struct ggml_context* output_ctx = NULL) {
+            // The base runner asks for the graph through a callback, so wrap build_graph
+            // together with the inputs of this call.
+            auto graph_builder = [&]() -> struct ggml_cgraph* {
+                return build_graph(x, timesteps, context, y, guidance);
+            };
+
+            GGMLRunner::compute(graph_builder, n_threads, false, output, output_ctx);
+        }
+
+        // Smoke test: run a single forward pass on constant-filled inputs, print the
+        // resulting tensor and log the elapsed time.
+        void test() {
+            struct ggml_init_params params;
+            params.mem_size   = static_cast<size_t>(20 * 1024 * 1024);  // 20 MB
+            params.mem_buffer = NULL;
+            params.no_alloc   = false;
+
+            struct ggml_context* work_ctx = ggml_init(params);
+            GGML_ASSERT(work_ctx != NULL);
+
+            {
+                // cpu f16:
+                // cuda f16: nan
+                // cuda q8_0: pass
+                auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 16, 16, 16, 1);
+                ggml_set_f32(x, 0.01f);
+                // print_ggml_tensor(x);
+
+                std::vector<float> timesteps_vec(1, 999.f);
+                auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+
+                std::vector<float> guidance_vec(1, 3.5f);
+                auto guidance = vector_to_ggml_tensor(work_ctx, guidance_vec);
+
+                auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 256, 1);
+                ggml_set_f32(context, 0.01f);
+                // print_ggml_tensor(context);
+
+                auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, 1);
+                ggml_set_f32(y, 0.01f);
+                // print_ggml_tensor(y);
+
+                struct ggml_tensor* out = NULL;
+
+                // Keep the timestamps at full width; only the (small) difference is narrowed for logging.
+                int64_t t0 = ggml_time_ms();
+                compute(8, x, timesteps, context, y, guidance, &out, work_ctx);
+                int64_t t1 = ggml_time_ms();
+
+                print_ggml_tensor(out);
+                LOG_DEBUG("flux test done in %dms", (int)(t1 - t0));
+            }
+
+            // All tensors above, including `out`, were allocated from work_ctx and are no
+            // longer needed once printed, so release the 20 MB scratch context.
+            ggml_free(work_ctx);
+        }
+
+        // Load the diffusion-model weights found in `file_path` into a fresh CPU-backed
+        // FluxRunner (Q8_0 weight type) and run its test(). Any loading failure is
+        // logged and aborts the run before test() is reached.
+        static void load_from_file_and_test(const std::string& file_path) {
+            // ggml_backend_t backend    = ggml_backend_cuda_init(0);
+            ggml_backend_t backend    = ggml_backend_cpu_init();
+            ggml_type model_data_type = GGML_TYPE_Q8_0;
+            std::shared_ptr<FluxRunner> runner(new FluxRunner(backend, model_data_type));
+            {
+                LOG_INFO("loading from '%s'", file_path.c_str());
+
+                // Allocate the parameter storage, then gather the tensors the loader has to fill.
+                runner->alloc_params_buffer();
+                std::map<std::string, ggml_tensor*> param_tensors;
+                runner->get_param_tensors(param_tensors, "model.diffusion_model");
+
+                ModelLoader loader;
+                if (!loader.init_from_file(file_path, "model.diffusion_model.")) {
+                    LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+                    return;
+                }
+
+                if (!loader.load_tensors(param_tensors, backend)) {
+                    LOG_ERROR("load tensors from model loader failed");
+                    return;
+                }
+
+                LOG_INFO("flux model loaded");
+            }
+            runner->test();
+        }
+    };
+
+}  // namespace Flux
+
+#endif  // __FLUX_HPP__
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@ -36,6 +36,10 @@
 #include "ggml-vulkan.h"
 #endif

+#ifdef SD_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
 #include "rng.hpp"
 #include "util.h"

@ -79,13 +83,42 @@ __STATIC_INLINE__ float ggml_tensor_get_f32(const ggml_tensor* tensor, int l, in
    return *(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
 }

+// Read the 32-bit integer element at index (l, k, j, i) of `tensor`.
+// If the tensor is backed by a backend buffer the element is copied out with
+// ggml_backend_tensor_get; otherwise it is read directly from tensor->data.
+__STATIC_INLINE__ int ggml_tensor_get_i32(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
+    if (tensor->buffer != NULL) {
+        // The destination must be an int: copying the raw integer bytes into a float and
+        // converting that float on return would reinterpret the bit pattern and yield garbage.
+        int value;
+        ggml_backend_tensor_get(tensor, &value, i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0], sizeof(int));
+        return value;
+    }
+    GGML_ASSERT(tensor->nb[0] == sizeof(int));
+    return *(int*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
+}
+
 __STATIC_INLINE__ ggml_fp16_t ggml_tensor_get_f16(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
    GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
    return *(ggml_fp16_t*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
 }

-__STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false) {
-    printf("shape(%zu, %zu, %zu, %zu)\n", tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
+// static struct ggml_tensor* get_tensor_from_graph(struct ggml_cgraph* gf, const char* name) {
+//     struct ggml_tensor* res = NULL;
+//     for (int i = 0; i < gf->n_nodes; i++) {
+//         // printf("%d, %s \n", i, gf->nodes[i]->name);
+//         if (strcmp(ggml_get_name(gf->nodes[i]), name) == 0) {
+//             res = gf->nodes[i];
+//             break;
+//         }
+//     }
+//     for (int i = 0; i < gf->n_leafs; i++) {
+//         // printf("%d, %s \n", i, gf->leafs[i]->name);
+//         if (strcmp(ggml_get_name(gf->leafs[i]), name) == 0) {
+//             res = gf->leafs[i];
+//             break;
+//         }
+//     }
+//     return res;
+// }
+
+__STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false, const char* mark = "") {
+    printf("%s (%s): shape(%zu, %zu, %zu, %zu)\n", mark, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    fflush(stdout);
    if (shape_only) {
        return;
@ -111,6 +144,8 @@ __STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_
                        printf("  [%d, %d, %d, %d] = %f\n", i, j, k, l, ggml_tensor_get_f32(tensor, l, k, j, i));
                    } else if (tensor->type == GGML_TYPE_F16) {
                        printf("  [%d, %d, %d, %d] = %i\n", i, j, k, l, ggml_tensor_get_f16(tensor, l, k, j, i));
+                    } else if (tensor->type == GGML_TYPE_I32) {
+                        printf("  [%d, %d, %d, %d] = %i\n", i, j, k, l, ggml_tensor_get_i32(tensor, l, k, j, i));
                    }
                    fflush(stdout);
                }
@ -221,6 +256,23 @@ __STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input) {
    return image_data;
 }

+// Extract image number `idx` (index along ne[3]) from an F32 tensor laid out as
+// ne = [width, height, 3, ...] and return it as interleaved 8-bit data.
+// Each value is multiplied by 255 and cast to uint8_t without clamping.
+// The returned buffer comes from malloc and is not released here.
+__STATIC_INLINE__ uint8_t* sd_tensor_to_mul_image(struct ggml_tensor* input, int idx) {
+    const int64_t width    = input->ne[0];
+    const int64_t height   = input->ne[1];
+    const int64_t channels = input->ne[2];
+    GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32);
+
+    uint8_t* pixels = (uint8_t*)malloc(width * height * channels);
+    for (int row = 0; row < height; row++) {
+        for (int col = 0; col < width; col++) {
+            // Start of the interleaved pixel at (col, row).
+            uint8_t* px = pixels + row * width * channels + col * channels;
+            for (int ch = 0; ch < channels; ch++) {
+                const float sample = ggml_tensor_get_f32(input, col, row, ch, idx);
+                px[ch]             = (uint8_t)(sample * 255.0f);
+            }
+        }
+    }
+    return pixels;
+}
+
 __STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data,
                                          struct ggml_tensor* output,
                                          bool scale = true) {
@ -241,6 +293,28 @@ __STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data,
    }
 }

+// Write one interleaved 8-bit, 3-channel image into slot `idx` (index along ne[3])
+// of the F32 tensor `output`, whose ne = [width, height, 3, ...].
+// Pixels are scaled to [0, 1]; when both `mean` and `std` are given, each channel k
+// is additionally normalized as (value - mean[k]) / std[k].
+__STATIC_INLINE__ void sd_mul_images_to_tensor(const uint8_t* image_data,
+                                               struct ggml_tensor* output,
+                                               int idx,
+                                               float* mean = NULL,
+                                               float* std  = NULL) {
+    const int64_t width    = output->ne[0];
+    const int64_t height   = output->ne[1];
+    const int64_t channels = output->ne[2];
+    GGML_ASSERT(channels == 3 && output->type == GGML_TYPE_F32);
+
+    const bool normalize = (mean != NULL && std != NULL);
+    for (int row = 0; row < height; row++) {
+        for (int col = 0; col < width; col++) {
+            const uint8_t* px = image_data + row * width * channels + col * channels;
+            for (int ch = 0; ch < channels; ch++) {
+                const int raw = px[ch];
+                float scaled  = raw / 255.0f;
+                if (normalize) {
+                    scaled = (scaled - mean[ch]) / std[ch];
+                }
+                ggml_tensor_set_f32(output, scaled, col, row, ch, idx);
+            }
+        }
+    }
+}
+
 __STATIC_INLINE__ void sd_image_f32_to_tensor(const float* image_data,
                                              struct ggml_tensor* output,
                                              bool scale = true) {
@ -251,7 +325,7 @@ __STATIC_INLINE__ void sd_image_f32_to_tensor(const float* image_data,
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
            for (int k = 0; k < channels; k++) {
-                float value = *(image_data + iy * width * channels + ix * channels + k);
+                int value = *(image_data + iy * width * channels + ix * channels + k);
                if (scale) {
                    value /= 255.f;
                }
@ -279,6 +353,12 @@ __STATIC_INLINE__ void ggml_split_tensor_2d(struct ggml_tensor* input,
    }
 }

+// Smootherstep polynomial 6x^5 - 15x^4 + 10x^3.
+// The input is not clamped here: x must already lie in [0, 1], which is asserted.
+__STATIC_INLINE__ float ggml_smootherstep_f32(const float x) {
+    GGML_ASSERT(x >= 0.f && x <= 1.f);
+    const float x_cubed = x * x * x;
+    const float poly    = x * (6.0f * x - 15.0f) + 10.0f;
+    return x_cubed * poly;
+}
+
 __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                                            struct ggml_tensor* output,
                                            int x,
@ -287,6 +367,10 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
    int64_t width    = input->ne[0];
    int64_t height   = input->ne[1];
    int64_t channels = input->ne[2];
+
+    int64_t img_width  = output->ne[0];
+    int64_t img_height = output->ne[1];
+
    GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
@ -294,16 +378,22 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                float new_value = ggml_tensor_get_f32(input, ix, iy, k);
                if (overlap > 0) {  // blend colors in overlapped area
                    float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
-                    if (x > 0 && ix < overlap) {  // in overlapped horizontal
-                        ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (ix / (1.0f * overlap)), x + ix, y + iy, k);
-                        continue;
-                    }
-                    if (y > 0 && iy < overlap) {  // in overlapped vertical
-                        ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (iy / (1.0f * overlap)), x + ix, y + iy, k);
-                        continue;
-                    }
+
+                    const float x_f_0 = (x > 0) ? ix / float(overlap) : 1;
+                    const float x_f_1 = (x < (img_width - width)) ? (width - ix) / float(overlap) : 1;
+                    const float y_f_0 = (y > 0) ? iy / float(overlap) : 1;
+                    const float y_f_1 = (y < (img_height - height)) ? (height - iy) / float(overlap) : 1;
+
+                    const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
+                    const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);
+
+                    ggml_tensor_set_f32(
+                        output,
+                        old_value + new_value * ggml_smootherstep_f32(y_f) * ggml_smootherstep_f32(x_f),
+                        x + ix, y + iy, k);
+                } else {
+                    ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
                }
-                ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
            }
        }
    }
@ -347,6 +437,42 @@ __STATIC_INLINE__ void ggml_tensor_clamp(struct ggml_tensor* src, float min, flo
    }
 }

+// Concatenate `a` and `b` along dimension `dim` into a new tensor allocated from `ctx`.
+// All other dimensions must match (asserted). The result takes a's type, and the
+// elements are copied one by one through the f32 accessors.
+__STATIC_INLINE__ struct ggml_tensor* ggml_tensor_concat(struct ggml_context* ctx,
+                                                         struct ggml_tensor* a,
+                                                         struct ggml_tensor* b,
+                                                         int dim) {
+    // Result shape: the sum of both extents along `dim`, a's extent everywhere else.
+    int64_t ne[GGML_MAX_DIMS];
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        if (d != dim) {
+            GGML_ASSERT(a->ne[d] == b->ne[d]);
+            ne[d] = a->ne[d];
+        } else {
+            ne[d] = a->ne[d] + b->ne[d];
+        }
+    }
+    struct ggml_tensor* result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
+
+    // Offset subtracted from a result index to address the matching element of b.
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim]       = a->ne[dim];
+
+    for (int i3 = 0; i3 < result->ne[3]; i3++) {
+        for (int i2 = 0; i2 < result->ne[2]; i2++) {
+            for (int i1 = 0; i1 < result->ne[1]; i1++) {
+                for (int i0 = 0; i0 < result->ne[0]; i0++) {
+                    // Indices that fall inside a's extents come from a, everything else from b.
+                    const bool inside_a = i0 < a->ne[0] && i1 < a->ne[1] && i2 < a->ne[2] && i3 < a->ne[3];
+                    const float v       = inside_a
+                                              ? ggml_tensor_get_f32(a, i0, i1, i2, i3)
+                                              : ggml_tensor_get_f32(b, i0 - o[0], i1 - o[1], i2 - o[2], i3 - o[3]);
+                    ggml_tensor_set_f32(result, v, i0, i1, i2, i3);
+                }
+            }
+        }
+    }
+    return result;
+}
+
 // convert values from [0, 1] to [-1, 1]
 __STATIC_INLINE__ void ggml_tensor_scale_input(struct ggml_tensor* src) {
    int64_t nelements = ggml_nelements(src);
@ -400,7 +526,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
    ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size, tile_size, input->ne[2], 1);
    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1);
    on_processing(input_tile, NULL, true);
-    int num_tiles = (input_width * input_height) / (non_tile_overlap * non_tile_overlap);
+    int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap);
    LOG_INFO("processing %i tiles", num_tiles);
    pretty_progress(1, num_tiles, 0.0f);
    int tile_count = 1;
@ -430,11 +556,13 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
    if (tile_count < num_tiles) {
        pretty_progress(num_tiles, num_tiles, last_time);
    }
+    ggml_free(tiles_ctx);
 }

 __STATIC_INLINE__ struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
                                                         struct ggml_tensor* a) {
-    return ggml_group_norm(ctx, a, 32, 1e-6f);
+    const float eps = 1e-6f;  // default eps parameter
+    return ggml_group_norm(ctx, a, 32, eps);
 }

 __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
@ -524,6 +652,20 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_3d_nx1x1(struct ggml_context*
    return x;  // [N, OC, T, OH * OW]
 }

+// Split a fused projection into its three equally sized parts.
+// qkv: [N, L, 3*C]
+// return: {q, k, v}, each [N, L, C]
+__STATIC_INLINE__ std::vector<struct ggml_tensor*> split_qkv(struct ggml_context* ctx,
+                                                             struct ggml_tensor* qkv) {
+    // Expose the factor 3 as its own axis, then move it outermost and make the
+    // data contiguous so that every part occupies one consecutive slab.
+    struct ggml_tensor* fused = ggml_reshape_4d(ctx, qkv, qkv->ne[0] / 3, 3, qkv->ne[1], qkv->ne[2]);  // [N, L, 3, C]
+    fused                     = ggml_cont(ctx, ggml_permute(ctx, fused, 0, 3, 1, 2));                  // [3, N, L, C]
+
+    // Size in bytes of one [N, L, C] slab.
+    const int64_t slab_bytes = fused->nb[2] * fused->ne[2];
+
+    std::vector<struct ggml_tensor*> parts;
+    for (int part = 0; part < 3; part++) {
+        // View number `part` of the slabs: 0 -> q, 1 -> k, 2 -> v.
+        parts.push_back(ggml_view_3d(ctx, fused, fused->ne[0], fused->ne[1], fused->ne[2], fused->nb[1], fused->nb[2], slab_bytes * part));  // [N, L, C]
+    }
+    return parts;
+}
+
 // q: [N * n_head, n_token, d_head]
 // k: [N * n_head, n_k, d_head]
 // v: [N * n_head, d_head, n_k]
@ -533,7 +675,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx
                                                        struct ggml_tensor* k,
                                                        struct ggml_tensor* v,
                                                        bool mask = false) {
-#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) && !defined(SD_USE_VULKAN)
+#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) && !defined(SD_USE_VULKAN) && !defined(SD_USE_SYCL)
    struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false);  // [N * n_head, n_token, d_head]
 #else
    float d_head = (float)q->ne[0];
@ -550,6 +692,79 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx
    return kqv;
 }

+// Scaled dot-product multi-head attention.
+//
+// q: [N, L_q, C] or [N*n_head, L_q, d_head]
+// k: [N, L_k, C] or [N*n_head, L_k, d_head]
+// v: [N, L_k, C] or [N, L_k, n_head, d_head]
+// The second layout of each input is the one expected when skip_reshape is true,
+// i.e. the caller has already split the heads.
+// mask:          optional tensor added to the scaled scores before the softmax
+// diag_mask_inf: additionally apply ggml_diag_mask_inf to the scores
+// return: [N, L_q, C]
+__STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* ctx,
+                                                            struct ggml_tensor* q,
+                                                            struct ggml_tensor* k,
+                                                            struct ggml_tensor* v,
+                                                            int64_t n_head,
+                                                            struct ggml_tensor* mask = NULL,
+                                                            bool diag_mask_inf       = false,
+                                                            bool skip_reshape        = false) {
+    int64_t L_q;
+    int64_t L_k;
+    int64_t C;
+    int64_t N;
+    int64_t d_head;
+    if (!skip_reshape) {
+        // Split the channel axis into heads and fold the heads into the batch axis.
+        L_q    = q->ne[1];
+        L_k    = k->ne[1];
+        C      = q->ne[0];
+        N      = q->ne[2];
+        d_head = C / n_head;
+        q      = ggml_reshape_4d(ctx, q, d_head, n_head, L_q, N);   // [N, L_q, n_head, d_head]
+        q      = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));  // [N, n_head, L_q, d_head]
+        q      = ggml_reshape_3d(ctx, q, d_head, L_q, n_head * N);  // [N * n_head, L_q, d_head]
+
+        k = ggml_reshape_4d(ctx, k, d_head, n_head, L_k, N);   // [N, L_k, n_head, d_head]
+        k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));  // [N, n_head, L_k, d_head]
+        k = ggml_reshape_3d(ctx, k, d_head, L_k, n_head * N);  // [N * n_head, L_k, d_head]
+
+        v = ggml_reshape_4d(ctx, v, d_head, n_head, L_k, N);  // [N, L_k, n_head, d_head]
+    } else {
+        // Inputs are already per-head; recover the sizes from their shapes.
+        L_q    = q->ne[1];
+        L_k    = k->ne[1];
+        d_head = v->ne[0];
+        N      = v->ne[3];
+        C      = d_head * n_head;
+    }
+
+    float scale = (1.0f / sqrt((float)d_head));
+
+    // The flash-attention path is currently disabled; the explicit matmul path below is always taken.
+    bool use_flash_attn = false;
+    ggml_tensor* kqv    = NULL;
+    if (use_flash_attn) {
+        v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3));  // [N, n_head, L_k, d_head]
+        v = ggml_reshape_3d(ctx, v, d_head, L_k, n_head * N);  // [N * n_head, L_k, d_head]
+        // ne[] is int64_t; narrow explicitly so the argument matches the %d conversion.
+        LOG_DEBUG("k->ne[1] == %d", (int)k->ne[1]);
+        kqv = ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0, 0);
+    } else {
+        v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, n_head, d_head, L_k]
+        v = ggml_reshape_3d(ctx, v, L_k, d_head, n_head * N);  // [N * n_head, d_head, L_k]
+
+        auto kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, L_q, L_k]
+        kq      = ggml_scale_inplace(ctx, kq, scale);
+        if (mask) {
+            kq = ggml_add(ctx, kq, mask);
+        }
+        if (diag_mask_inf) {
+            kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
+        }
+        kq = ggml_soft_max_inplace(ctx, kq);
+
+        kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, L_q, d_head]
+    }
+
+    // Merge the heads back into the channel axis.
+    kqv = ggml_reshape_4d(ctx, kqv, d_head, L_q, n_head, N);   // [N, n_head, L_q, d_head]
+    kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, L_q, n_head, d_head]
+    kqv = ggml_reshape_3d(ctx, kqv, d_head * n_head, L_q, N);  // [N, L_q, C]
+
+    return kqv;
+}
+
 __STATIC_INLINE__ struct ggml_tensor* ggml_nn_layer_norm(struct ggml_context* ctx,
                                                         struct ggml_tensor* x,
                                                         struct ggml_tensor* w,
@ -575,7 +790,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ct
        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
    }

-    x = ggml_group_norm(ctx, x, num_groups, 1e-6f);
+    const float eps = 1e-6f;  // default eps parameter
+    x               = ggml_group_norm(ctx, x, num_groups, eps);
    if (w != NULL && b != NULL) {
        x = ggml_mul(ctx, x, w);
        // b = ggml_repeat(ctx, b, x);
@ -585,7 +801,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ct
 }

 __STATIC_INLINE__ void ggml_backend_tensor_get_and_sync(ggml_backend_t backend, const struct ggml_tensor* tensor, void* data, size_t offset, size_t size) {
-#ifdef SD_USE_CUBLAS
+#if defined(SD_USE_CUBLAS) || defined(SD_USE_SYCL)
    if (!ggml_backend_is_cpu(backend)) {
        ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
        ggml_backend_synchronize(backend);
@ -693,22 +909,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_timestep_embedding(
    struct ggml_context* ctx,
    struct ggml_tensor* timesteps,
    int dim,
-    int max_period = 10000) {
+    int max_period    = 10000,
+    float time_factor = 1.0f) {
+    timesteps = ggml_scale(ctx, timesteps, time_factor);
    return ggml_timestep_embedding(ctx, timesteps, dim, max_period);
 }

-// struct GGMLComputeGraph {
-//     virtual void init(struct ggml_context* ctx, ggml_type wtype) = 0;
-//     virtual std::string get_desc() = 0;
-//     virtual size_t get_params_mem_size()   = 0;
-//     virtual size_t get_params_num() = 0;
-//     virtual struct ggml_cgraph* get_ggml_cgraph() = 0;
-// };
+// Count the tensors held by `ctx` by walking ggml's first/next tensor iteration.
+__STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) {
+    size_t count        = 0;
+    ggml_tensor* cursor = ggml_get_first_tensor(ctx);
+    while (cursor != nullptr) {
+        ++count;
+        cursor = ggml_get_next_tensor(ctx, cursor);
+    }
+    return count;
+}

+/* SDXL with LoRA requires more space */
 #define MAX_PARAMS_TENSOR_NUM 15360
 #define MAX_GRAPH_SIZE 15360

-struct GGMLModule {
+struct GGMLRunner {
 protected:
    typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;

@ -775,7 +994,10 @@ protected:

        // compute the required memory
        size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
-        LOG_DEBUG("%s compute buffer size: %.2f MB", get_desc().c_str(), compute_buffer_size / 1024.0 / 1024.0);
+        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
+                  get_desc().c_str(),
+                  compute_buffer_size / 1024.0 / 1024.0,
+                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
        return true;
    }

@ -791,16 +1013,14 @@ protected:
    }

 public:
-    virtual size_t get_params_mem_size() = 0;
-    virtual size_t get_params_num()      = 0;
-    virtual std::string get_desc()       = 0;
+    virtual std::string get_desc() = 0;

-    GGMLModule(ggml_backend_t backend, ggml_type wtype = GGML_TYPE_F32)
+    GGMLRunner(ggml_backend_t backend, ggml_type wtype = GGML_TYPE_F32)
        : backend(backend), wtype(wtype) {
        alloc_params_ctx();
    }

-    virtual ~GGMLModule() {
+    virtual ~GGMLRunner() {
        free_params_buffer();
        free_compute_buffer();
        free_params_ctx();
@ -813,15 +1033,20 @@ public:
    }

    bool alloc_params_buffer() {
-        size_t num_tensors = get_params_num();
+        size_t num_tensors = ggml_tensor_num(params_ctx);
        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
        if (params_buffer == NULL) {
-            LOG_ERROR("%s alloc params backend buffer failed", get_desc().c_str());
+            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
+                      get_desc().c_str(),
+                      num_tensors);
            return false;
        }
        size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
-        LOG_DEBUG("%s params backend buffer size = % 6.2f MB (%i tensors)",
-                  get_desc().c_str(), params_buffer_size / (1024.0 * 1024.0), num_tensors);
+        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
+                  get_desc().c_str(),
+                  params_buffer_size / (1024.0 * 1024.0),
+                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+                  num_tensors);
        return true;
    }

@ -832,6 +1057,13 @@ public:
        }
    }

+    size_t get_params_buffer_size() {
+        if (params_buffer != NULL) {
+            return ggml_backend_buffer_get_size(params_buffer);
+        }
+        return 0;
+    }
+
    void free_compute_buffer() {
        if (compute_allocr != NULL) {
            ggml_gallocr_free(compute_allocr);
@ -850,7 +1082,7 @@ public:
            return NULL;
        }
        // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
+        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
            // pass input tensors to gpu memory
            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

@ -869,11 +1101,8 @@ public:
        alloc_compute_buffer(get_graph);
        reset_compute_ctx();
        struct ggml_cgraph* gf = get_graph();
-
        GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
-
        cpy_data_to_backend_tensor();
-
        if (ggml_backend_is_cpu(backend)) {
            ggml_backend_cpu_set_n_threads(backend, n_threads);
        }
@ -883,13 +1112,11 @@ public:
 //             ggml_backend_metal_set_n_cb(backend, n_threads);
 //         }
 // #endif
-
        ggml_backend_graph_compute(backend, gf);

 #ifdef GGML_PERF
        ggml_graph_print(gf);
 #endif
-
        if (output != NULL) {
            auto result = ggml_graph_node(gf, -1);
            if (*output == NULL && output_ctx != NULL) {
@ -907,19 +1134,6 @@ public:
 };

 class GGMLBlock {
-private:
-    static char temp_buffer[1024 * 1024 * 10];
-    ggml_context* get_temp_ctx() {
-        struct ggml_init_params params;
-        params.mem_size   = sizeof(temp_buffer);
-        params.mem_buffer = temp_buffer;
-        params.no_alloc   = true;
-
-        ggml_context* temp_ctx = ggml_init(params);
-        GGML_ASSERT(temp_ctx != NULL);
-        return temp_ctx;
-    }
-
 protected:
    typedef std::unordered_map<std::string, struct ggml_tensor*> ParameterMap;
    typedef std::unordered_map<std::string, std::shared_ptr<GGMLBlock>> GGMLBlockMap;
@ -942,14 +1156,6 @@ public:
        init_params(ctx, wtype);
    }

-    std::tuple<size_t, size_t> get_params_info(ggml_type wtype) {
-        ggml_context* temp_ctx = get_temp_ctx();
-        init(temp_ctx, wtype);
-        size_t num_tensors = get_params_num();
-        size_t mem_size    = get_params_mem_size();
-        return {num_tensors, mem_size};
-    }
-
    size_t get_params_num() {
        size_t num_tensors = params.size();
        for (auto& pair : blocks) {
@ -981,13 +1187,11 @@ public:
        }
        for (auto& pair : blocks) {
            auto& block = pair.second;
-
            block->get_param_tensors(tensors, prefix + pair.first);
        }

        for (auto& pair : params) {
-            struct ggml_tensor* param = pair.second;
-
+            struct ggml_tensor* param    = pair.second;
            tensors[prefix + pair.first] = pair.second;
        }
    }
@ -1003,8 +1207,12 @@ protected:
    int64_t in_features;
    int64_t out_features;
    bool bias;
+    bool force_f32;

    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
+            wtype = GGML_TYPE_F32;
+        }
        params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
        if (bias) {
            params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_features);
@ -1014,10 +1222,12 @@ protected:
 public:
    Linear(int64_t in_features,
           int64_t out_features,
-           bool bias = true)
+           bool bias      = true,
+           bool force_f32 = false)
        : in_features(in_features),
          out_features(out_features),
-          bias(bias) {}
+          bias(bias),
+          force_f32(force_f32) {}

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
@ -1029,6 +1239,40 @@ public:
    }
 };

+// Lookup-table block: maps token ids to rows of a [num_embeddings, embedding_dim] weight.
+class Embedding : public UnaryBlock {
+protected:
+    int64_t embedding_dim;
+    int64_t num_embeddings;
+
+    // Register the single "weight" parameter, created with the requested weight type.
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        params["weight"] = ggml_new_tensor_2d(ctx, wtype, embedding_dim, num_embeddings);
+    }
+
+public:
+    Embedding(int64_t num_embeddings, int64_t embedding_dim)
+        : embedding_dim(embedding_dim),
+          num_embeddings(num_embeddings) {
+    }
+
+    // input_ids: [N, n_token]
+    // return:    [N, n_token, embedding_dim]
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* input_ids) {
+        struct ggml_tensor* table = params["weight"];
+
+        // There are issues with ggml batch inference, so the batch is flattened
+        // before the lookup and restored afterwards.
+        // TODO: fix ggml batch inference
+        const int64_t batch = input_ids->ne[1];
+
+        struct ggml_tensor* flat_ids = ggml_reshape_1d(ctx, input_ids, input_ids->ne[0] * input_ids->ne[1]);
+        flat_ids                     = ggml_reshape_3d(ctx, flat_ids, flat_ids->ne[0], 1, flat_ids->ne[1]);
+
+        struct ggml_tensor* rows = ggml_get_rows(ctx, table, flat_ids);
+        rows                     = ggml_reshape_3d(ctx, rows, rows->ne[0], rows->ne[1] / batch, batch);
+
+        // [N, n_token, embedding_dim]
+        return rows;
+    }
+};
+
 class Conv2d : public UnaryBlock {
 protected:
    int64_t in_channels;
@ -1202,58 +1446,48 @@ class MultiheadAttention : public GGMLBlock {
 protected:
    int64_t embed_dim;
    int64_t n_head;
-    bool bias;
-    bool mask;
+    std::string q_proj_name;    // key in `blocks` under which the query Linear is stored
+    std::string k_proj_name;    // key in `blocks` under which the key Linear is stored
+    std::string v_proj_name;    // key in `blocks` under which the value Linear is stored
+    std::string out_proj_name;  // key in `blocks` under which the output Linear is stored

 public:
    MultiheadAttention(int64_t embed_dim,
                       int64_t n_head,
-                       bool bias = true)
+                       bool qkv_proj_bias        = true,  // bias flag forwarded to the q/k/v Linear blocks
+                       bool out_proj_bias        = true,  // bias flag forwarded to the output Linear block
+                       std::string q_proj_name   = "q_proj",
+                       std::string k_proj_name   = "k_proj",
+                       std::string v_proj_name   = "v_proj",
+                       std::string out_proj_name = "out_proj")  // the four names let callers choose the `blocks` keys; defaults match the previously hard-coded keys
        : embed_dim(embed_dim),
          n_head(n_head),
-          bias(bias) {
-        blocks["q_proj"]   = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
-        blocks["k_proj"]   = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
-        blocks["v_proj"]   = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
-        blocks["out_proj"] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
+          q_proj_name(q_proj_name),
+          k_proj_name(k_proj_name),
+          v_proj_name(v_proj_name),
+          out_proj_name(out_proj_name) {
+        blocks[q_proj_name]   = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
+        blocks[k_proj_name]   = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
+        blocks[v_proj_name]   = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
+        blocks[out_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, out_proj_bias));
    }

    // x: [N, n_token, embed_dim]
    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, bool mask = false) {
-        auto q_proj   = std::dynamic_pointer_cast<Linear>(blocks["q_proj"]);
-        auto k_proj   = std::dynamic_pointer_cast<Linear>(blocks["k_proj"]);
-        auto v_proj   = std::dynamic_pointer_cast<Linear>(blocks["v_proj"]);
-        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out_proj"]);
-
-        int64_t N       = x->ne[2];
-        int64_t n_token = x->ne[1];
-        int64_t d_head  = embed_dim / n_head;
+        auto q_proj   = std::dynamic_pointer_cast<Linear>(blocks[q_proj_name]);  // look the projections up by the names given to the constructor
+        auto k_proj   = std::dynamic_pointer_cast<Linear>(blocks[k_proj_name]);
+        auto v_proj   = std::dynamic_pointer_cast<Linear>(blocks[v_proj_name]);
+        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks[out_proj_name]);

        struct ggml_tensor* q = q_proj->forward(ctx, x);
-        q                     = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, N);   // [N, n_token, n_head, d_head]
-        q                     = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));      // [N, n_head, n_token, d_head]
-        q                     = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * N);  // [N * n_head, n_token, d_head]
-
        struct ggml_tensor* k = k_proj->forward(ctx, x);
-        k                     = ggml_reshape_4d(ctx, k, d_head, n_head, n_token, N);  // [N, n_token, n_head, d_head]
-        k                     = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));     // [N, n_head, n_token, d_head]
-        k                     = ggml_reshape_3d(ctx, k, d_head, n_token, n_head);     // [N * n_head, n_token, d_head]
-
        struct ggml_tensor* v = v_proj->forward(ctx, x);
-        v                     = ggml_reshape_4d(ctx, v, d_head, n_head, n_token, N);   // [N, n_token, n_head, d_head]
-        v                     = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));      // [N, n_head, d_head, n_token]
-        v                     = ggml_reshape_3d(ctx, v, n_token, d_head, n_head * N);  // [N * n_head, d_head, n_token]

-        struct ggml_tensor* kqv = ggml_nn_attention(ctx, q, k, v, mask);  // [N * n_head, n_token, d_head]
+        x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, mask);  // [N, n_token, embed_dim]; NOTE(review): the per-head split/merge done by hand in the removed lines is presumably handled inside this helper - confirm

-        kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, N);
-        kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, n_token, n_head, d_head]
-
-        x = ggml_reshape_2d(ctx, kqv, d_head * n_head, n_token * N);  // [N * n_token, d_head * n_head]
-
-        x = out_proj->forward(ctx, x);
+        x = out_proj->forward(ctx, x);  // [N, n_token, embed_dim]
        return x;
    }
 };

-#endif  // __GGML_EXTEND__HPP__
+#endif  // __GGML_EXTEND__HPP__
--- a/otherarch/sdcpp/gits_noise.inl
+++ b/otherarch/sdcpp/gits_noise.inl
@ -0,0 +1,349 @@
+#ifndef GITS_NOISE_INL
+#define GITS_NOISE_INL
+
+const std::vector<std::vector<float>> GITS_NOISE_0_80 = {
+    { 14.61464119f, 7.49001646f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 6.77309084f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 3.07277966f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 2.05039096f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 3.07277966f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.07277966f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.19567990f, 1.98035145f, 0.86115354f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.07277966f, 1.84880662f, 0.83188516f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_0_85 = {
+    { 14.61464119f, 7.49001646f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 1.84880662f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 6.77309084f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.11996698f, 3.07277966f, 1.24153244f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.09240818f, 2.84484982f, 0.95350921f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 5.58536053f, 3.19567990f, 1.84880662f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.65472794f, 3.07277966f, 1.84880662f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.60512662f, 2.63833880f, 1.56271636f, 0.72133851f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.90732002f, 10.31284904f, 9.75859547f, 9.24142551f, 8.75849152f, 8.30717278f, 7.88507891f, 7.49001646f, 6.77309084f, 5.85520077f, 4.65472794f, 3.46139455f, 2.45070267f, 1.56271636f, 0.72133851f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_0_90 = {
+    { 14.61464119f, 6.77309084f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 3.07277966f, 0.95350921f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.54230714f, 0.89115214f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.11996698f, 4.86714602f, 3.07277966f, 1.61558151f, 0.69515091f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 2.95596409f, 1.61558151f, 0.69515091f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.24153244f, 0.57119018f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.84484982f, 1.84880662f, 1.08895338f, 0.52423614f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.45427561f, 3.32507086f, 2.45070267f, 1.61558151f, 0.95350921f, 0.45573691f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 4.86714602f, 3.91689563f, 3.07277966f, 2.27973175f, 1.56271636f, 0.95350921f, 0.45573691f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.96784878f, 12.23089790f, 11.54541874f, 10.31284904f, 9.24142551f, 8.75849152f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.19988537f, 1.51179266f, 0.89115214f, 0.43325692f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_0_95 = {
+    { 14.61464119f, 6.77309084f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 2.84484982f, 0.89115214f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.36326075f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.95596409f, 1.56271636f, 0.64427125f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.91321158f, 1.08895338f, 0.50118381f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.19988537f, 1.41535246f, 0.803307f, 0.38853383f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.46139455f, 2.63833880f, 1.84880662f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.14220476f, 4.86714602f, 3.75677586f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 10.90732002f, 8.75849152f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.60512662f, 2.95596409f, 2.19988537f, 1.56271636f, 1.05362725f, 0.64427125f, 0.32104823f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.44769001f, 5.58536053f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.78698075f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
+    { 14.61464119f, 12.96784878f, 11.54541874f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.60512662f, 2.95596409f, 2.36326075f, 1.72759056f, 1.24153244f, 0.83188516f, 0.50118381f, 0.22545385f, 0.02916753f },
+    { 14.61464119f, 13.76078796f, 12.23089790f, 10.90732002f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.45427561f, 3.75677586f, 3.07277966f, 2.45070267f, 1.91321158f, 1.46270394f, 1.05362725f, 0.72133851f, 0.43325692f, 0.19894916f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_00 = {
+    { 14.61464119f, 1.56271636f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 0.95350921f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 2.36326075f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 7.11996698f, 3.07277966f, 1.56271636f, 0.59516323f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.41535246f, 0.57119018f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.86115354f, 0.38853383f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.07277966f, 1.98035145f, 1.24153244f, 0.72133851f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.54755926f, 0.25053367f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.12350607f, 1.56271636f, 1.08895338f, 0.72133851f, 0.41087446f, 0.17026083f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.803307f, 0.50118381f, 0.27464288f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 5.85520077f, 4.65472794f, 3.75677586f, 3.07277966f, 2.45070267f, 1.84880662f, 1.36964464f, 1.01931262f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.46139455f, 2.84484982f, 2.19988537f, 1.67050016f, 1.24153244f, 0.92192322f, 0.64427125f, 0.43325692f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 8.75849152f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.14220476f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 12.23089790f, 9.24142551f, 8.30717278f, 7.49001646f, 6.77309084f, 5.85520077f, 5.09240818f, 4.26497746f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.12534678f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_05 = {
+    { 14.61464119f, 0.95350921f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 0.89115214f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 2.05039096f, 0.72133851f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 2.84484982f, 1.28281462f, 0.52423614f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.803307f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.52423614f, 0.22545385f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 1.98035145f, 1.24153244f, 0.74807048f, 0.41087446f, 0.17026083f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.51179266f, 0.95350921f, 0.59516323f, 0.34370604f, 0.13792117f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.09240818f, 3.46139455f, 2.45070267f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.36326075f, 1.61558151f, 1.08895338f, 0.72133851f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.72759056f, 1.24153244f, 0.86115354f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.61558151f, 1.162866f, 0.83188516f, 0.59516323f, 0.38853383f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.84484982f, 2.19988537f, 1.67050016f, 1.28281462f, 0.95350921f, 0.72133851f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.36326075f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.61951244f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.57119018f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 8.30717278f, 7.11996698f, 5.85520077f, 4.65472794f, 3.60512662f, 2.95596409f, 2.45070267f, 1.98035145f, 1.61558151f, 1.32549286f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_10 = {
+    { 14.61464119f, 0.89115214f, 0.02916753f },
+    { 14.61464119f, 2.36326075f, 0.72133851f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 1.61558151f, 0.57119018f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 2.45070267f, 1.08895338f, 0.45573691f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 2.95596409f, 1.56271636f, 0.803307f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 3.07277966f, 1.61558151f, 0.89115214f, 0.4783645f, 0.19894916f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.08895338f, 0.64427125f, 0.34370604f, 0.13792117f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.84484982f, 1.61558151f, 0.95350921f, 0.54755926f, 0.27464288f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.4783645f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.41535246f, 0.95350921f, 0.64427125f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.27973175f, 1.61558151f, 1.12534678f, 0.803307f, 0.54755926f, 0.36617002f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.32507086f, 2.45070267f, 1.72759056f, 1.24153244f, 0.89115214f, 0.64427125f, 0.45573691f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.05039096f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 5.09240818f, 3.60512662f, 2.84484982f, 2.12350607f, 1.61558151f, 1.24153244f, 0.95350921f, 0.72133851f, 0.54755926f, 0.41087446f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.19567990f, 2.45070267f, 1.91321158f, 1.51179266f, 1.20157266f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.86115354f, 0.69515091f, 0.54755926f, 0.43325692f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 11.54541874f, 7.49001646f, 5.85520077f, 4.45427561f, 3.46139455f, 2.84484982f, 2.19988537f, 1.72759056f, 1.36964464f, 1.08895338f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_15 = {
+    { 14.61464119f, 0.83188516f, 0.02916753f },
+    { 14.61464119f, 1.84880662f, 0.59516323f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 1.56271636f, 0.52423614f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 1.91321158f, 0.83188516f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.51179266f, 0.803307f, 0.41087446f, 0.17026083f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.56271636f, 0.89115214f, 0.50118381f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 3.07277966f, 1.84880662f, 1.12534678f, 0.72133851f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 3.07277966f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 2.95596409f, 1.91321158f, 1.24153244f, 0.803307f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.05039096f, 1.36964464f, 0.95350921f, 0.69515091f, 0.4783645f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.43325692f, 0.29807833f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.78698075f, 1.32549286f, 1.01931262f, 0.803307f, 0.64427125f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.86714602f, 3.19567990f, 2.45070267f, 1.84880662f, 1.41535246f, 1.12534678f, 0.89115214f, 0.72133851f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_20 = {
+    { 14.61464119f, 0.803307f, 0.02916753f },
+    { 14.61464119f, 1.56271636f, 0.52423614f, 0.02916753f },
+    { 14.61464119f, 2.36326075f, 0.92192322f, 0.36617002f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.25053367f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.05039096f, 0.95350921f, 0.45573691f, 0.17026083f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.45070267f, 1.24153244f, 0.64427125f, 0.29807833f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.803307f, 0.45573691f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.95350921f, 0.59516323f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.83188516f, 0.59516323f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 3.07277966f, 1.98035145f, 1.36964464f, 0.95350921f, 0.69515091f, 0.50118381f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 3.46139455f, 2.36326075f, 1.56271636f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 6.77309084f, 3.46139455f, 2.45070267f, 1.61558151f, 1.162866f, 0.86115354f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.12350607f, 1.51179266f, 1.08895338f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.20157266f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 7.49001646f, 4.65472794f, 3.07277966f, 2.19988537f, 1.61558151f, 1.24153244f, 0.95350921f, 0.74807048f, 0.59516323f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_25 = {  // 19 rows growing from 3 to 21 values; every row starts at 14.61464119f and ends at 0.02916753f. Presumably one descending noise-level (sigma) schedule per step count for coefficient 1.25 -- TODO confirm against the code that indexes GITS_NOISE
+    { 14.61464119f, 0.72133851f, 0.02916753f },
+    { 14.61464119f, 1.56271636f, 0.50118381f, 0.02916753f },
+    { 14.61464119f, 2.05039096f, 0.803307f, 0.32104823f, 0.02916753f },
+    { 14.61464119f, 2.36326075f, 0.95350921f, 0.43325692f, 0.17026083f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.24153244f, 0.59516323f, 0.27464288f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 3.07277966f, 1.51179266f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.36326075f, 1.24153244f, 0.72133851f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.52423614f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 0.98595673f, 0.64427125f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.67050016f, 1.08895338f, 0.74807048f, 0.52423614f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.803307f, 0.59516323f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.24153244f, 0.86115354f, 0.64427125f, 0.4783645f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.95596409f, 1.84880662f, 1.28281462f, 0.92192322f, 0.69515091f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.72133851f, 0.57119018f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.95596409f, 1.91321158f, 1.32549286f, 0.95350921f, 0.74807048f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.41535246f, 1.05362725f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 3.07277966f, 2.05039096f, 1.46270394f, 1.08895338f, 0.83188516f, 0.66947293f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_30 = {  // 19 rows growing from 3 to 21 values; every row starts at 14.61464119f and ends at 0.02916753f. Presumably one descending noise-level (sigma) schedule per step count for coefficient 1.30 -- TODO confirm against the code that indexes GITS_NOISE
+    { 14.61464119f, 0.72133851f, 0.02916753f },
+    { 14.61464119f, 1.24153244f, 0.43325692f, 0.02916753f },
+    { 14.61464119f, 1.56271636f, 0.59516323f, 0.22545385f, 0.02916753f },
+    { 14.61464119f, 1.84880662f, 0.803307f, 0.36617002f, 0.13792117f, 0.02916753f },
+    { 14.61464119f, 2.36326075f, 1.01931262f, 0.52423614f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.36964464f, 0.74807048f, 0.41087446f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 3.07277966f, 1.56271636f, 0.89115214f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 3.07277966f, 1.61558151f, 0.95350921f, 0.61951244f, 0.41087446f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.45070267f, 1.36964464f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.45070267f, 1.41535246f, 0.92192322f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.6383388f, 1.56271636f, 1.01931262f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.77538133f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.72759056f, 1.162866f, 0.83188516f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.84484982f, 1.78698075f, 1.24153244f, 0.92192322f, 0.72133851f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_35 = {  // 19 rows growing from 3 to 21 values; every row starts at 14.61464119f and ends at 0.02916753f. Presumably one descending noise-level (sigma) schedule per step count for coefficient 1.35 -- TODO confirm against the code that indexes GITS_NOISE
+    { 14.61464119f, 0.69515091f, 0.02916753f },
+    { 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 1.56271636f, 0.57119018f, 0.19894916f, 0.02916753f },
+    { 14.61464119f, 1.61558151f, 0.69515091f, 0.29807833f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.84880662f, 0.83188516f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.162866f, 0.64427125f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.36964464f, 0.803307f, 0.50118381f, 0.32104823f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.41535246f, 0.83188516f, 0.54755926f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.32104823f, 0.22545385f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.56271636f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 3.07277966f, 1.61558151f, 1.01931262f, 0.72133851f, 0.52423614f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 3.07277966f, 1.61558151f, 1.05362725f, 0.74807048f, 0.54755926f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 3.07277966f, 1.72759056f, 1.12534678f, 0.803307f, 0.59516323f, 0.4783645f, 0.38853383f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.45070267f, 1.51179266f, 1.01931262f, 0.74807048f, 0.57119018f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 5.85520077f, 2.6383388f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_40 = {  // 19 rows growing from 3 to 21 values; every row starts at 14.61464119f and ends at 0.02916753f. Presumably one descending noise-level (sigma) schedule per step count for coefficient 1.40 -- TODO confirm against the code that indexes GITS_NOISE
+    { 14.61464119f, 0.59516323f, 0.02916753f },
+    { 14.61464119f, 0.95350921f, 0.34370604f, 0.02916753f },
+    { 14.61464119f, 1.08895338f, 0.43325692f, 0.13792117f, 0.02916753f },
+    { 14.61464119f, 1.56271636f, 0.64427125f, 0.27464288f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.61558151f, 0.803307f, 0.43325692f, 0.22545385f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.05039096f, 0.95350921f, 0.54755926f, 0.34370604f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.24153244f, 0.72133851f, 0.43325692f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.52423614f, 0.36617002f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.38853383f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.41535246f, 0.86115354f, 0.59516323f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.45573691f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.64427125f, 0.4783645f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.72133851f, 0.54755926f, 0.43325692f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.61558151f, 1.05362725f, 0.74807048f, 0.57119018f, 0.45573691f, 0.38853383f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.41087446f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.61951244f, 0.50118381f, 0.43325692f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.61558151f, 1.08895338f, 0.803307f, 0.64427125f, 0.52423614f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_45 = {  // 19 rows growing from 3 to 21 values; every row starts at 14.61464119f and ends at 0.02916753f. Presumably one descending noise-level (sigma) schedule per step count for coefficient 1.45 -- TODO confirm against the code that indexes GITS_NOISE
+    { 14.61464119f, 0.59516323f, 0.02916753f },
+    { 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
+    { 14.61464119f, 0.95350921f, 0.34370604f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.91321158f, 0.95350921f, 0.57119018f, 0.36617002f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.19988537f, 1.08895338f, 0.64427125f, 0.41087446f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.34370604f, 0.25053367f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.24153244f, 0.74807048f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.54755926f, 0.41087446f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.28281462f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.28281462f, 0.83188516f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.41087446f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.51179266f, 0.95350921f, 0.69515091f, 0.52423614f, 0.43325692f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.56271636f, 0.98595673f, 0.72133851f, 0.54755926f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.57119018f, 0.4783645f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.84484982f, 1.56271636f, 1.01931262f, 0.74807048f, 0.59516323f, 0.50118381f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<std::vector<float>> GITS_NOISE_1_50 = {  // 19 rows growing from 3 to 21 values; every row starts at 14.61464119f and ends at 0.02916753f. Presumably one descending noise-level (sigma) schedule per step count for coefficient 1.50 -- TODO confirm against the code that indexes GITS_NOISE
+    { 14.61464119f, 0.54755926f, 0.02916753f },
+    { 14.61464119f, 0.803307f, 0.25053367f, 0.02916753f },
+    { 14.61464119f, 0.86115354f, 0.32104823f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.24153244f, 0.54755926f, 0.25053367f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.56271636f, 0.72133851f, 0.36617002f, 0.19894916f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.61558151f, 0.803307f, 0.45573691f, 0.27464288f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.61558151f, 0.83188516f, 0.52423614f, 0.34370604f, 0.25053367f, 0.17026083f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.38853383f, 0.27464288f, 0.19894916f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.84880662f, 0.95350921f, 0.59516323f, 0.41087446f, 0.29807833f, 0.22545385f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 1.84880662f, 0.95350921f, 0.61951244f, 0.43325692f, 0.32104823f, 0.25053367f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.27464288f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.19988537f, 1.12534678f, 0.72133851f, 0.50118381f, 0.36617002f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.57119018f, 0.43325692f, 0.34370604f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.36617002f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.36326075f, 1.24153244f, 0.803307f, 0.59516323f, 0.45573691f, 0.38853383f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.32549286f, 0.86115354f, 0.64427125f, 0.50118381f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.36964464f, 0.92192322f, 0.69515091f, 0.54755926f, 0.45573691f, 0.41087446f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f },
+    { 14.61464119f, 2.45070267f, 1.41535246f, 0.95350921f, 0.72133851f, 0.57119018f, 0.4783645f, 0.43325692f, 0.38853383f, 0.36617002f, 0.34370604f, 0.32104823f, 0.29807833f, 0.27464288f, 0.25053367f, 0.22545385f, 0.19894916f, 0.17026083f, 0.13792117f, 0.09824532f, 0.02916753f }
+};
+
+const std::vector<const std::vector<std::vector<float>>*> GITS_NOISE = {  // 15 non-owning pointers to the per-suffix tables, listed in ascending suffix order 0_80 .. 1_50
+    &GITS_NOISE_0_80,
+    &GITS_NOISE_0_85,
+    &GITS_NOISE_0_90,
+    &GITS_NOISE_0_95,
+    &GITS_NOISE_1_00,
+    &GITS_NOISE_1_05,
+    &GITS_NOISE_1_10,
+    &GITS_NOISE_1_15,
+    &GITS_NOISE_1_20,
+    &GITS_NOISE_1_25,
+    &GITS_NOISE_1_30,
+    &GITS_NOISE_1_35,
+    &GITS_NOISE_1_40,
+    &GITS_NOISE_1_45,
+    &GITS_NOISE_1_50,
+};
+
+#endif // GITS_NOISE_INL
--- a/otherarch/sdcpp/lora.hpp
+++ b/otherarch/sdcpp/lora.hpp
@ -5,18 +5,22 @@

 #define LORA_GRAPH_SIZE 10240

-struct LoraModel : public GGMLModule {
+struct LoraModel : public GGMLRunner {
    float multiplier = 1.0f;
    std::map<std::string, struct ggml_tensor*> lora_tensors;
    std::string file_path;
    ModelLoader model_loader;
-    bool load_failed = false;
+    bool load_failed                = false;  // set to true by the constructor when model_loader.init_from_file() fails
+    bool applied                    = false;
+    std::vector<int> zero_index_vec = {0};  // host-side source data copied into zero_index by build_lora_graph()
+    ggml_tensor* zero_index         = NULL;  // 1-element I32 tensor created in build_lora_graph(); row index used by to_f32()

    LoraModel(ggml_backend_t backend,
              ggml_type wtype,
-              const std::string file_path = "")
-        : file_path(file_path), GGMLModule(backend, wtype) {
-        if (!model_loader.init_from_file(file_path)) {
+              const std::string& file_path = "",  // LoRA file handed to model_loader
+              const std::string& prefix    = "")  // forwarded unchanged to model_loader.init_from_file()
+        : GGMLRunner(backend, wtype), file_path(file_path) {  // written in real init order: base class first, then members
+        if (!model_loader.init_from_file(file_path, prefix)) {  // failure is only recorded here; load_from_file() checks load_failed
            load_failed = true;
        }
    }
@ -25,15 +29,7 @@ struct LoraModel : public GGMLModule {
        return "lora";
    }

-    size_t get_params_num() {
-        return LORA_GRAPH_SIZE;
-    }
-
-    size_t get_params_mem_size() {
-        return model_loader.get_params_mem_size(NULL);
-    }
-
-    bool load_from_file() {
+    bool load_from_file(bool filter_tensor = false) {
        LOG_INFO("loading LoRA from '%s'", file_path.c_str());

        if (load_failed) {
@ -45,6 +41,11 @@ struct LoraModel : public GGMLModule {
        auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
            const std::string& name = tensor_storage.name;

+            if (filter_tensor && !contains(name, "lora")) {
+                // LOG_INFO("skipping LoRA tensor '%s'", name.c_str());
+                return true;
+            }
+
            if (dry_run) {
                struct ggml_tensor* real = ggml_new_tensor(params_ctx,
                                                           tensor_storage.type,
@ -55,6 +56,7 @@ struct LoraModel : public GGMLModule {
                auto real   = lora_tensors[name];
                *dst_tensor = real;
            }
+
            return true;
        };

@ -68,9 +70,20 @@ struct LoraModel : public GGMLModule {
        return true;
    }

-    struct ggml_cgraph* build_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
+    // Graph-op helper: flatten `a`, gather row 0 via zero_index, restore a's shape (presumably yields F32 data -- confirm in ggml).
+    ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) {
+        ggml_tensor* flat   = ggml_reshape_1d(ctx, a, ggml_nelements(a));
+        ggml_tensor* picked = ggml_get_rows(ctx, flat, zero_index);
+        return ggml_reshape(ctx, picked, a);
+    }
+
+    struct ggml_cgraph* build_lora_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {  // builds the graph that adds each scaled LoRA delta (updown) onto its model weight
        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);

+        zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);  // single-element index tensor consumed by to_f32()
+        set_backend_tensor_data(zero_index, zero_index_vec.data());  // data comes from zero_index_vec ({0})
+        ggml_build_forward_expand(gf, zero_index);
+
        std::set<std::string> applied_lora_tensors;
        for (auto it : model_tensors) {
            std::string k_tensor       = it.first;
@ -141,33 +154,50 @@ struct LoraModel : public GGMLModule {
            GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
            updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
            ggml_tensor* final_weight;
-            // if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
-            //     final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, weight->n_dims, weight->ne);
-            //     final_weight = ggml_cpy_inplace(compute_ctx, weight, final_weight);
-            //     final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
-            //     final_weight = ggml_cpy_inplace(compute_ctx, final_weight, weight);
-            // } else {
-            //     final_weight = ggml_add_inplace(compute_ctx, weight, updown);
-            // }
-            final_weight = ggml_add_inplace(compute_ctx, weight, updown);  // apply directly
+            if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {  // neither F32 nor F16: convert via to_f32(), add, then copy the sum back into weight
+                // final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, ggml_n_dims(weight), weight->ne);
+                // final_weight = ggml_cpy(compute_ctx, weight, final_weight);
+                final_weight = to_f32(compute_ctx, weight);
+                final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
+                final_weight = ggml_cpy(compute_ctx, final_weight, weight);
+            } else {
+                final_weight = ggml_add_inplace(compute_ctx, weight, updown);
+            }
+            // final_weight = ggml_add_inplace(compute_ctx, weight, updown);  // apply directly
            ggml_build_forward_expand(gf, final_weight);
        }

+        size_t total_lora_tensors_count   = 0;
+        size_t applied_lora_tensors_count = 0;
+
        for (auto& kv : lora_tensors) {
+            total_lora_tensors_count++;
            if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
                LOG_WARN("unused lora tensor %s", kv.first.c_str());
+            } else {
+                applied_lora_tensors_count++;
            }
        }
+        /* Don't worry if this message shows up twice in the logs per LoRA,
+         * this function is called once to calculate the required buffer size
+         * and then again to actually generate a graph to be used */
+        if (applied_lora_tensors_count != total_lora_tensors_count) {
+            LOG_WARN("Only (%zu / %zu) LoRA tensors have been applied",  // %zu: both counters are size_t (%lu mismatches on LLP64 targets)
+                     applied_lora_tensors_count, total_lora_tensors_count);
+        } else {
+            LOG_DEBUG("(%zu / %zu) LoRA tensors applied successfully",  // %zu: both counters are size_t
+                      applied_lora_tensors_count, total_lora_tensors_count);
+        }

        return gf;
    }

    void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(model_tensors);
+            return build_lora_graph(model_tensors);  // graph factory handed to compute(); per the note in build_lora_graph it runs twice (buffer sizing, then the real graph)
        };
-        GGMLModule::compute(get_graph, n_threads, true);
+        GGMLRunner::compute(get_graph, n_threads, true);  // executes the LoRA graph with n_threads; meaning of the trailing `true` is defined by GGMLRunner::compute -- TODO confirm
    }
 };

-#endif  // __LORA_HPP__
+#endif  // __LORA_HPP__
--- a/otherarch/sdcpp/main.cpp
+++ b/otherarch/sdcpp/main.cpp
@ -7,6 +7,7 @@
 #include <vector>

 // #include "preprocessing.hpp"
+#include "flux.hpp"
 #include "stable-diffusion.h"

 #define STB_IMAGE_IMPLEMENTATION
@ -16,6 +17,9 @@
 #define STB_IMAGE_WRITE_STATIC
 #include "stb_image_write.h"

+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+#include "stb_image_resize.h"
+
 const char* rng_type_to_str[] = {
    "std_default",
    "cuda",
@ -30,6 +34,8 @@ const char* sample_method_str[] = {
    "dpm++2s_a",
    "dpm++2m",
    "dpm++2mv2",
+    "ipndm",
+    "ipndm_v",
    "lcm",
 };

@ -38,6 +44,9 @@ const char* schedule_str[] = {
    "default",
    "discrete",
    "karras",
+    "exponential",
+    "ays",
+    "gits",
 };

 const char* modes_str[] = {
@ -58,13 +67,18 @@ enum SDMode {
 struct SDParams {
    int n_threads = -1;
    SDMode mode   = TXT2IMG;
-
    std::string model_path;
+    std::string clip_l_path;  // presumably the --clip_l argument (clip-l text encoder) -- confirm in the argument parser
+    std::string clip_g_path;  // presumably the --clip_g argument -- confirm in the argument parser
+    std::string t5xxl_path;  // presumably the --t5xxl argument (t5xxl text encoder) -- confirm in the argument parser
+    std::string diffusion_model_path;  // presumably the --diffusion-model argument (standalone diffusion model) -- confirm
    std::string vae_path;
    std::string taesd_path;
    std::string esrgan_path;
    std::string controlnet_path;
    std::string embeddings_path;
+    std::string stacked_id_embeddings_path;  // presumably --stacked-id-embd-dir (PHOTOMAKER stacked id embeddings) -- confirm
+    std::string input_id_images_path;  // presumably --input-id-images-dir (PHOTOMAKER input id images) -- confirm
    sd_type_t wtype = SD_TYPE_COUNT;
    std::string lora_model_dir;
    std::string output_path = "output.png";
@ -73,12 +87,14 @@ struct SDParams {

    std::string prompt;
    std::string negative_prompt;
-    float min_cfg   = 1.0f;
-    float cfg_scale = 7.0f;
-    int clip_skip   = -1;  // <= 0 represents unspecified
-    int width       = 512;
-    int height      = 512;
-    int batch_count = 1;
+    float min_cfg     = 1.0f;
+    float cfg_scale   = 7.0f;
+    float guidance    = 3.5f;  // reported as "guidance" by print_params
+    float style_ratio = 20.f;  // reported as "style ratio" by print_params
+    int clip_skip     = -1;  // <= 0 represents unspecified
+    int width         = 512;
+    int height        = 512;
+    int batch_count   = 1;

    int video_frames         = 6;
    int motion_bucket_id     = 127;
@ -95,7 +111,11 @@ struct SDParams {
    bool verbose                  = false;
    bool vae_tiling               = false;
    bool control_net_cpu          = false;
+    bool normalize_input          = false;  // presumably --normalize-input (normalize PHOTOMAKER input id images) -- confirm
+    bool clip_on_cpu              = false;  // reported as "clip on cpu" by print_params
+    bool vae_on_cpu               = false;  // reported as "vae decoder on cpu" by print_params
    bool canny_preprocess         = false;
+    bool color                    = false;
    int upscale_repeats           = 1;
 };

@ -105,20 +125,31 @@ void print_params(SDParams params) {
    printf("    mode:              %s\n", modes_str[params.mode]);
    printf("    model_path:        %s\n", params.model_path.c_str());
    printf("    wtype:             %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
+    printf("    clip_l_path:       %s\n", params.clip_l_path.c_str());
+    printf("    clip_g_path:       %s\n", params.clip_g_path.c_str());
+    printf("    t5xxl_path:        %s\n", params.t5xxl_path.c_str());
+    printf("    diffusion_model_path:   %s\n", params.diffusion_model_path.c_str());
    printf("    vae_path:          %s\n", params.vae_path.c_str());
    printf("    taesd_path:        %s\n", params.taesd_path.c_str());
    printf("    esrgan_path:       %s\n", params.esrgan_path.c_str());
    printf("    controlnet_path:   %s\n", params.controlnet_path.c_str());
    printf("    embeddings_path:   %s\n", params.embeddings_path.c_str());
+    printf("    stacked_id_embeddings_path:   %s\n", params.stacked_id_embeddings_path.c_str());
+    printf("    input_id_images_path:   %s\n", params.input_id_images_path.c_str());
+    printf("    style ratio:       %.2f\n", params.style_ratio);
+    printf("    normalize input image :  %s\n", params.normalize_input ? "true" : "false");
    printf("    output_path:       %s\n", params.output_path.c_str());
    printf("    init_img:          %s\n", params.input_path.c_str());
    printf("    control_image:     %s\n", params.control_image_path.c_str());
+    printf("    clip on cpu:       %s\n", params.clip_on_cpu ? "true" : "false");
    printf("    controlnet cpu:    %s\n", params.control_net_cpu ? "true" : "false");
+    printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
    printf("    strength(control): %.2f\n", params.control_strength);
    printf("    prompt:            %s\n", params.prompt.c_str());
    printf("    negative_prompt:   %s\n", params.negative_prompt.c_str());
    printf("    min_cfg:           %.2f\n", params.min_cfg);
    printf("    cfg_scale:         %.2f\n", params.cfg_scale);
+    printf("    guidance:          %.2f\n", params.guidance);
    printf("    clip_skip:         %d\n", params.clip_skip);
    printf("    width:             %d\n", params.width);
    printf("    height:            %d\n", params.height);
@ -139,17 +170,24 @@ void print_usage(int argc, const char* argv[]) {
    printf("arguments:\n");
    printf("  -h, --help                         show this help message and exit\n");
    printf("  -M, --mode [MODEL]                 run mode (txt2img or img2img or convert, default: txt2img)\n");
-    printf("  -t, --threads N                    number of threads to use during computation (default: -1).\n");
+    printf("  -t, --threads N                    number of threads to use during computation (default: -1)\n");
    printf("                                     If threads <= 0, then threads will be set to the number of CPU physical cores\n");
-    printf("  -m, --model [MODEL]                path to model\n");
+    printf("  -m, --model [MODEL]                path to full model\n");
+    // Options for supplying the model components as separate files.
+    printf("  --diffusion-model                  path to the standalone diffusion model\n");
+    printf("  --clip_l                           path to the clip-l text encoder\n");
+    printf("  --clip_g                           path to the clip-g text encoder\n");
+    printf("  --t5xxl                            path to the t5xxl text encoder\n");
    printf("  --vae [VAE]                        path to vae\n");
    printf("  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
    printf("  --control-net [CONTROL_PATH]       path to control net model\n");
-    printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings.\n");
-    printf("  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n");
+    printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings\n");
+    printf("  --stacked-id-embd-dir [DIR]        path to PHOTOMAKER stacked id embeddings\n");
+    printf("  --input-id-images-dir [DIR]        path to PHOTOMAKER input id images dir\n");
+    printf("  --normalize-input                  normalize PHOTOMAKER input id images\n");
+    printf("  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
    printf("  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)\n");
-    printf("  --type [TYPE]                      weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n");
-    printf("                                     If not specified, the default is the type of the weight file.\n");
+    printf("  --type [TYPE]                      weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)\n");
+    printf("                                     If not specified, the default is the type of the weight file\n");
    printf("  --lora-model-dir [DIR]             lora model directory\n");
    printf("  -i, --init-img [IMAGE]             path to the input image, required by img2img\n");
    printf("  --control-image [IMAGE]            path to image condition, control net\n");
@ -158,22 +196,26 @@ void print_usage(int argc, const char* argv[]) {
    printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
    printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
    printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
+    printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%%)\n");
    printf("  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)\n");
    printf("                                     1.0 corresponds to full destruction of information in init image\n");
    printf("  -H, --height H                     image height, in pixel space (default: 512)\n");
    printf("  -W, --width W                      image width, in pixel space (default: 512)\n");
-    printf("  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n");
+    printf("  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm}\n");
    printf("                                     sampling method (default: \"euler_a\")\n");
    printf("  --steps  STEPS                     number of sample steps (default: 20)\n");
    printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
    printf("  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)\n");
-    printf("  -b, --batch-count COUNT            number of images to generate.\n");
-    printf("  --schedule {discrete, karras}      Denoiser sigma schedule (default: discrete)\n");
+    printf("  -b, --batch-count COUNT            number of images to generate\n");
+    printf("  --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)\n");
    printf("  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
    printf("                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
    printf("  --vae-tiling                       process vae in tiles to reduce memory usage\n");
+    printf("  --vae-on-cpu                       keep vae in cpu (for low vram)\n");
+    printf("  --clip-on-cpu                      keep clip in cpu (for low vram)\n");
    printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
    printf("  --canny                            apply canny preprocessor (edge detection)\n");
+    printf("  --color                            Colors the logging tags according to level\n");
    printf("  -v, --verbose                      print extra info\n");
 }

@ -214,6 +256,30 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                break;
            }
            params.model_path = argv[i];
+        } else if (arg == "--clip_l") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.clip_l_path = argv[i];
+        } else if (arg == "--clip_g") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.clip_g_path = argv[i];
+        } else if (arg == "--t5xxl") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.t5xxl_path = argv[i];
+        } else if (arg == "--diffusion-model") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.diffusion_model_path = argv[i];
        } else if (arg == "--vae") {
            if (++i >= argc) {
                invalid_arg = true;
@ -244,6 +310,18 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                break;
            }
            params.embeddings_path = argv[i];
+        } else if (arg == "--stacked-id-embd-dir") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.stacked_id_embeddings_path = argv[i];
+        } else if (arg == "--input-id-images-dir") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.input_id_images_path = argv[i];
        } else if (arg == "--type") {
            if (++i >= argc) {
                invalid_arg = true;
@ -264,8 +342,14 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                params.wtype = SD_TYPE_Q5_1;
            } else if (type == "q8_0") {
                params.wtype = SD_TYPE_Q8_0;
+            } else if (type == "q2_k") {
+                params.wtype = SD_TYPE_Q2_K;
+            } else if (type == "q3_k") {
+                params.wtype = SD_TYPE_Q3_K;
+            } else if (type == "q4_k") {
+                params.wtype = SD_TYPE_Q4_K;
            } else {
-                fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
+                fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k]\n",
                        type.c_str());
                exit(1);
            }
@ -321,12 +405,24 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                break;
            }
            params.cfg_scale = std::stof(argv[i]);
+        } else if (arg == "--guidance") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.guidance = std::stof(argv[i]);
        } else if (arg == "--strength") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.strength = std::stof(argv[i]);
+        } else if (arg == "--style-ratio") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.style_ratio = std::stof(argv[i]);
        } else if (arg == "--control-strength") {
            if (++i >= argc) {
                invalid_arg = true;
@ -361,6 +457,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
            params.vae_tiling = true;
        } else if (arg == "--control-net-cpu") {
            params.control_net_cpu = true;
+        } else if (arg == "--normalize-input") {
+            params.normalize_input = true;
+        } else if (arg == "--clip-on-cpu") {
+            params.clip_on_cpu = true;  // will slow down get_learned_condiotion but necessary for low MEM GPUs
+        } else if (arg == "--vae-on-cpu") {
+            params.vae_on_cpu = true;  // will slow down latent decoding but necessary for low MEM GPUs
        } else if (arg == "--canny") {
            params.canny_preprocess = true;
        } else if (arg == "-b" || arg == "--batch-count") {
@ -428,6 +530,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
            exit(0);
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
+        } else if (arg == "--color") {
+            params.color = true;
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argc, argv);
@ -449,8 +553,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        exit(1);
    }

-    if (params.model_path.length() == 0) {
-        fprintf(stderr, "error: the following arguments are required: model_path\n");
+    if (params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) {
+        fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
        print_usage(argc, argv);
        exit(1);
    }
@ -518,6 +622,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
    }
    parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
    parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
+    parameter_string += "Guidance: " + std::to_string(params.guidance) + ", ";
    parameter_string += "Seed: " + std::to_string(seed) + ", ";
    parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
    parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
@ -531,23 +636,53 @@ std::string get_image_params(SDParams params, int64_t seed) {
    return parameter_string;
 }

+/*
+ * Log callback: writes `log` prefixed with a fixed-width "[LEVEL]" tag.
+ *
+ * level - severity of the message; SD_LOG_ERROR is written to stderr, every
+ *         other level to stdout.
+ * log   - message text; a null pointer is ignored.
+ * data  - cast to SDParams*. Its `verbose` flag gates messages at or below
+ *         SD_LOG_DEBUG, and its `color` flag enables printing the log level
+ *         tag in color using ANSI escape codes.
+ */
 void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
    SDParams* params = (SDParams*)data;
-    if (!params->verbose && level <= SD_LOG_DEBUG) {
+    int tag_color;
+    const char* level_str;
+    FILE* out_stream = (level == SD_LOG_ERROR) ? stderr : stdout;
+
+    // Drop null messages, and debug-or-lower messages unless verbose is set
+    if (!log || (!params->verbose && level <= SD_LOG_DEBUG)) {
        return;
    }
-    if (level <= SD_LOG_INFO) {
-        fputs(log, stdout);
-        fflush(stdout);
-    } else {
-        fputs(log, stderr);
-        fflush(stderr);
+
+    // tag_color is the numeric parameter substituted into the "\033[%d;1m"
+    // escape sequence below; level_str is the text shown inside the brackets
+    switch (level) {
+        case SD_LOG_DEBUG:
+            tag_color = 37;
+            level_str = "DEBUG";
+            break;
+        case SD_LOG_INFO:
+            tag_color = 34;
+            level_str = "INFO";
+            break;
+        case SD_LOG_WARN:
+            tag_color = 35;
+            level_str = "WARN";
+            break;
+        case SD_LOG_ERROR:
+            tag_color = 31;
+            level_str = "ERROR";
+            break;
+        default: /* Potential future-proofing */
+            tag_color = 33;
+            level_str = "?????";
+            break;
    }
+
+    // "%-5s" left-justifies the level name in a 5-character field so tags line up
+    if (params->color == true) {
+        fprintf(out_stream, "\033[%d;1m[%-5s]\033[0m ", tag_color, level_str);
+    } else {
+        fprintf(out_stream, "[%-5s] ", level_str);
+    }
+    fputs(log, out_stream);
+    // Flush after every message so output is not held back in the stream buffer
+    fflush(out_stream);
 }

 //concedo notes: if it crashes, make sure you specify --type!
 int main(int argc, const char* argv[]) {
    SDParams params;
+
    parse_args(argc, argv, params);

    sd_set_log_callback(sd_log_cb, (void*)&params);
@ -580,40 +715,72 @@ int main(int argc, const char* argv[]) {
        return 1;
    }

-    bool vae_decode_only        = true;
-    uint8_t* input_image_buffer = NULL;
+    bool vae_decode_only          = true;
+    uint8_t* input_image_buffer   = NULL;
+    uint8_t* control_image_buffer = NULL;
    if (params.mode == IMG2IMG || params.mode == IMG2VID) {
        vae_decode_only = false;

        int c              = 0;
-        input_image_buffer = stbi_load(params.input_path.c_str(), &params.width, &params.height, &c, 3);
+        int width          = 0;
+        int height         = 0;
+        input_image_buffer = stbi_load(params.input_path.c_str(), &width, &height, &c, 3);
        if (input_image_buffer == NULL) {
            fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str());
            return 1;
        }
-        if (c != 3) {
-            fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c);
+        if (c < 3) {
+            fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
            free(input_image_buffer);
            return 1;
        }
-        if (params.width <= 0 || params.width % 64 != 0) {
-            fprintf(stderr, "error: the width of image must be a multiple of 64\n");
+        if (width <= 0) {
+            fprintf(stderr, "error: the width of image must be greater than 0\n");
            free(input_image_buffer);
            return 1;
        }
-        if (params.height <= 0 || params.height % 64 != 0) {
-            fprintf(stderr, "error: the height of image must be a multiple of 64\n");
+        if (height <= 0) {
+            fprintf(stderr, "error: the height of image must be greater than 0\n");
            free(input_image_buffer);
            return 1;
        }
+
+        // Resize input image ...
+        if (params.height != height || params.width != width) {
+            printf("resize input image from %dx%d to %dx%d\n", width, height, params.width, params.height);
+            int resized_height = params.height;
+            int resized_width  = params.width;
+
+            uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * 3);
+            if (resized_image_buffer == NULL) {
+                fprintf(stderr, "error: allocate memory for resize input image\n");
+                free(input_image_buffer);
+                return 1;
+            }
+            stbir_resize(input_image_buffer, width, height, 0,
+                         resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8,
+                         3 /*RGB channel*/, STBIR_ALPHA_CHANNEL_NONE, 0,
+                         STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
+                         STBIR_FILTER_BOX, STBIR_FILTER_BOX,
+                         STBIR_COLORSPACE_SRGB, nullptr);
+
+            // Save resized result
+            free(input_image_buffer);
+            input_image_buffer = resized_image_buffer;
+        }
    }

    sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
+                                  params.clip_l_path.c_str(),
+                                  params.clip_g_path.c_str(),
+                                  params.t5xxl_path.c_str(),
+                                  params.diffusion_model_path.c_str(),
                                  params.vae_path.c_str(),
                                  params.taesd_path.c_str(),
                                  params.controlnet_path.c_str(),
                                  params.lora_model_dir.c_str(),
                                  params.embeddings_path.c_str(),
+                                  params.stacked_id_embeddings_path.c_str(),
                                  vae_decode_only,
                                  params.vae_tiling,
                                  true,
@ -621,43 +788,47 @@ int main(int argc, const char* argv[]) {
                                  params.wtype,
                                  params.rng_type,
                                  params.schedule,
-                                  params.control_net_cpu);
+                                  params.clip_on_cpu,
+                                  params.control_net_cpu,
+                                  params.vae_on_cpu);

    if (sd_ctx == NULL) {
        printf("new_sd_ctx_t failed\n");
        return 1;
    }

+    sd_image_t* control_image = NULL;
+    if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) {
+        int c                = 0;
+        control_image_buffer = stbi_load(params.control_image_path.c_str(), &params.width, &params.height, &c, 3);
+        if (control_image_buffer == NULL) {
+            fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str());
+            return 1;
+        }
+        control_image = new sd_image_t{(uint32_t)params.width,
+                                       (uint32_t)params.height,
+                                       3,
+                                       control_image_buffer};
+        if (params.canny_preprocess) {  // apply preprocessor
+            control_image->data = preprocess_canny(control_image->data,
+                                                   control_image->width,
+                                                   control_image->height,
+                                                   0.08f,
+                                                   0.08f,
+                                                   0.8f,
+                                                   1.0f,
+                                                   false);
+        }
+    }
+
    sd_image_t* results;
    if (params.mode == TXT2IMG) {
-        sd_image_t* control_image = NULL;
-        if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) {
-            int c              = 0;
-            input_image_buffer = stbi_load(params.control_image_path.c_str(), &params.width, &params.height, &c, 3);
-            if (input_image_buffer == NULL) {
-                fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str());
-                return 1;
-            }
-            control_image = new sd_image_t{(uint32_t)params.width,
-                                           (uint32_t)params.height,
-                                           3,
-                                           input_image_buffer};
-            if (params.canny_preprocess) {  // apply preprocessor
-                control_image->data = preprocess_canny(control_image->data,
-                                                       control_image->width,
-                                                       control_image->height,
-                                                       0.08f,
-                                                       0.08f,
-                                                       0.8f,
-                                                       1.0f,
-                                                       false);
-            }
-        }
        results = txt2img(sd_ctx,
                          params.prompt.c_str(),
                          params.negative_prompt.c_str(),
                          params.clip_skip,
                          params.cfg_scale,
+                          params.guidance,
                          params.width,
                          params.height,
                          params.sample_method,
@ -665,7 +836,10 @@ int main(int argc, const char* argv[]) {
                          params.seed,
                          params.batch_count,
                          control_image,
-                          params.control_strength);
+                          params.control_strength,
+                          params.style_ratio,
+                          params.normalize_input,
+                          params.input_id_images_path.c_str());
    } else {
        sd_image_t input_image = {(uint32_t)params.width,
                                  (uint32_t)params.height,
@ -715,13 +889,19 @@ int main(int argc, const char* argv[]) {
                              params.negative_prompt.c_str(),
                              params.clip_skip,
                              params.cfg_scale,
+                              params.guidance,
                              params.width,
                              params.height,
                              params.sample_method,
                              params.sample_steps,
                              params.strength,
                              params.seed,
-                              params.batch_count);
+                              params.batch_count,
+                              control_image,
+                              params.control_strength,
+                              params.style_ratio,
+                              params.normalize_input,
+                              params.input_id_images_path.c_str());
        }
    }

@ -774,6 +954,8 @@ int main(int argc, const char* argv[]) {
    }
    free(results);
    free_sd_ctx(sd_ctx);
+    free(control_image_buffer);
+    free(input_image_buffer);

    return 0;
 }
--- a/otherarch/sdcpp/mmdit.hpp
+++ b/otherarch/sdcpp/mmdit.hpp
@ -0,0 +1,845 @@
+#ifndef __MMDIT_HPP__
+#define __MMDIT_HPP__
+
+#include "ggml_extend.hpp"
+#include "model.h"
+
+#define MMDIT_GRAPH_SIZE 10240
+
+struct Mlp : public GGMLBlock {
+public:
+    // Feed-forward block: Linear "fc1" -> GELU -> Linear "fc2".
+    // Passing -1 for hidden_features or out_features makes it default to in_features.
+    Mlp(int64_t in_features,
+        int64_t hidden_features = -1,
+        int64_t out_features    = -1,
+        bool bias               = true) {
+        // act_layer is always lambda: nn.GELU(approximate="tanh")
+        // norm_layer is always None
+        // use_conv is always False
+        const int64_t hidden_dim = (hidden_features == -1) ? in_features : hidden_features;
+        const int64_t out_dim    = (out_features == -1) ? in_features : out_features;
+
+        blocks["fc1"] = std::make_shared<Linear>(in_features, hidden_dim, bias);
+        blocks["fc2"] = std::make_shared<Linear>(hidden_dim, out_dim, bias);
+    }
+
+    // Applies fc1, an in-place GELU, then fc2 and returns the result.
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [N, n_token, in_features]
+        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
+        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
+
+        struct ggml_tensor* hidden = fc1->forward(ctx, x);
+        hidden                     = ggml_gelu_inplace(ctx, hidden);
+        return fc2->forward(ctx, hidden);
+    }
+};
+
+struct PatchEmbed : public GGMLBlock {
+    // 2D Image to Patch Embedding: a convolution ("proj") whose kernel size and
+    // stride both equal patch_size, optionally followed by flattening the
+    // spatial grid of the result into a single token axis.
+protected:
+    bool flatten;
+    bool dynamic_img_pad;
+    int patch_size;
+
+public:
+    // img_size is part of the signature but is not used by this constructor.
+    PatchEmbed(int64_t img_size     = 224,
+               int patch_size       = 16,
+               int64_t in_chans     = 3,
+               int64_t embed_dim    = 1536,
+               bool bias            = true,
+               bool flatten         = true,
+               bool dynamic_img_pad = true)
+        // Initializers are listed in member declaration order, which is the
+        // order they actually run in; this avoids -Wreorder warnings.
+        : flatten(flatten),
+          dynamic_img_pad(dynamic_img_pad),
+          patch_size(patch_size) {
+        // img_size is always None
+        // patch_size is always 2
+        // in_chans is always 16
+        // norm_layer is always False
+        // strict_img_size is always true, but not used
+
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_chans,
+                                                               embed_dim,
+                                                               {patch_size, patch_size},
+                                                               {patch_size, patch_size},
+                                                               {0, 0},
+                                                               {1, 1},
+                                                               bias));
+    }
+
+    // Pads (when dynamic_img_pad is set), projects, and optionally flattens x.
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [N, C, H, W]
+        // return: [N, h*w, embed_dim], where h and w are the spatial dims of the
+        //         projected tensor (only when flatten is set)
+        auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]);
+
+        if (dynamic_img_pad) {
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+            // Padding that rounds each spatial dim up to a multiple of patch_size;
+            // the value is always in [0, patch_size), so the cast to int is lossless.
+            int pad_h = static_cast<int>((patch_size - H % patch_size) % patch_size);
+            int pad_w = static_cast<int>((patch_size - W % patch_size) % patch_size);
+            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // TODO: reflect pad mode
+        }
+        x = proj->forward(ctx, x);
+
+        if (flatten) {
+            // Merge the two spatial dims, then swap the first two axes
+            x = ggml_reshape_3d(ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
+        }
+        return x;
+    }
+};
+
+struct TimestepEmbedder : public GGMLBlock {
+    // Embeds scalar timesteps into vector representations:
+    // timestep embedding -> Linear "mlp.0" -> SiLU -> Linear "mlp.2".
+protected:
+    int64_t frequency_embedding_size;
+
+public:
+    TimestepEmbedder(int64_t hidden_size,
+                     int64_t frequency_embedding_size = 256)
+        : frequency_embedding_size(frequency_embedding_size) {
+        blocks["mlp.0"] = std::make_shared<Linear>(frequency_embedding_size, hidden_size, true, true);
+        blocks["mlp.2"] = std::make_shared<Linear>(hidden_size, hidden_size, true, true);
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
+        // t: [N, ]
+        // return: [N, hidden_size]
+        auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
+        auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
+
+        // [N, frequency_embedding_size]
+        auto freq_emb = ggml_nn_timestep_embedding(ctx, t, frequency_embedding_size);
+
+        auto hidden = mlp_0->forward(ctx, freq_emb);
+        hidden      = ggml_silu_inplace(ctx, hidden);
+        return mlp_2->forward(ctx, hidden);
+    }
+};
+
+struct VectorEmbedder : public GGMLBlock {
+    // Embeds a flat vector of dimension input_dim:
+    // Linear "mlp.0" -> SiLU -> Linear "mlp.2".
+public:
+    VectorEmbedder(int64_t input_dim,
+                   int64_t hidden_size) {
+        blocks["mlp.0"] = std::make_shared<Linear>(input_dim, hidden_size, true, true);
+        blocks["mlp.2"] = std::make_shared<Linear>(hidden_size, hidden_size, true, true);
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [N, input_dim]
+        // return: [N, hidden_size]
+        auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
+        auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);
+
+        struct ggml_tensor* hidden = mlp_0->forward(ctx, x);
+        hidden                     = ggml_silu_inplace(ctx, hidden);
+        return mlp_2->forward(ctx, hidden);
+    }
+};
+
+class RMSNorm : public UnaryBlock {
+    // RMS normalization followed by an element-wise multiply with "weight".
+protected:
+    int64_t hidden_size;
+    float eps;
+
+    // Allocates "weight" as a 1-D F32 tensor of hidden_size elements;
+    // wtype is not consulted.
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+    }
+
+public:
+    RMSNorm(int64_t hidden_size,
+            float eps = 1e-06f)
+        : hidden_size(hidden_size),
+          eps(eps) {}
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* weight = params["weight"];
+        struct ggml_tensor* normed = ggml_rms_norm(ctx, x, eps);
+        return ggml_mul(ctx, normed, weight);
+    }
+};
+
+class SelfAttention : public GGMLBlock {
+public:
+    int64_t num_heads;
+    bool pre_only;
+    std::string qk_norm;
+
+public:
+    // Registers the fused "qkv" projection, the output "proj" (omitted when
+    // pre_only is set) and, for qk_norm "rms" / "ln", the "ln_q" / "ln_k"
+    // normalization blocks sized to one head. Any other qk_norm value adds none.
+    SelfAttention(int64_t dim,
+                  int64_t num_heads   = 8,
+                  std::string qk_norm = "",
+                  bool qkv_bias       = false,
+                  bool pre_only       = false)
+        : num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm) {
+        const int64_t head_width = dim / num_heads;
+
+        blocks["qkv"] = std::make_shared<Linear>(dim, dim * 3, qkv_bias);
+        if (!pre_only) {
+            blocks["proj"] = std::make_shared<Linear>(dim, dim);
+        }
+
+        if (qk_norm == "rms") {
+            blocks["ln_q"] = std::make_shared<RMSNorm>(head_width, 1.0e-6);
+            blocks["ln_k"] = std::make_shared<RMSNorm>(head_width, 1.0e-6);
+        } else if (qk_norm == "ln") {
+            blocks["ln_q"] = std::make_shared<LayerNorm>(head_width, 1.0e-6);
+            blocks["ln_k"] = std::make_shared<LayerNorm>(head_width, 1.0e-6);
+        }
+    }
+
+    // Projects x to q/k/v, applies the optional per-head q/k normalization and
+    // returns {q, k, v}.
+    std::vector<struct ggml_tensor*> pre_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+        auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
+
+        auto fused = qkv_proj->forward(ctx, x);
+        auto parts = split_qkv(ctx, fused);
+
+        const int64_t head_dim = parts[0]->ne[0] / num_heads;
+
+        // q, k: [N, n_token, n_head, d_head] so the norm acts on each head separately
+        auto query = ggml_reshape_4d(ctx, parts[0], head_dim, num_heads, parts[0]->ne[1], parts[0]->ne[2]);
+        auto key   = ggml_reshape_4d(ctx, parts[1], head_dim, num_heads, parts[1]->ne[1], parts[1]->ne[2]);
+        // v: [N, n_token, n_head*d_head]
+        auto value = parts[2];
+
+        const bool normalize_qk = (qk_norm == "rms" || qk_norm == "ln");
+        if (normalize_qk) {
+            auto ln_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["ln_q"]);
+            auto ln_k = std::dynamic_pointer_cast<UnaryBlock>(blocks["ln_k"]);
+            query     = ln_q->forward(ctx, query);
+            key       = ln_k->forward(ctx, key);
+        }
+
+        // Merge the head axes back: [N, n_token, n_head*d_head]
+        query = ggml_reshape_3d(ctx, query, query->ne[0] * query->ne[1], query->ne[2], query->ne[3]);
+        key   = ggml_reshape_3d(ctx, key, key->ne[0] * key->ne[1], key->ne[2], key->ne[3]);
+
+        return {query, key, value};
+    }
+
+    // Applies the output projection; must not be called on a pre_only block,
+    // which has no "proj".
+    struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+        GGML_ASSERT(!pre_only);
+
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+        return proj->forward(ctx, x);  // [N, n_token, dim]
+    }
+
+    // x: [N, n_token, dim]
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        auto qkv      = pre_attention(ctx, x);
+        auto attn_out = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads);  // [N, n_token, dim]
+        return post_attention(ctx, attn_out);                                           // [N, n_token, dim]
+    }
+};
+
+// Computes x + x * scale + shift, with scale and shift given a singleton
+// middle dimension before they are combined with x.
+__STATIC_INLINE__ struct ggml_tensor* modulate(struct ggml_context* ctx,
+                                               struct ggml_tensor* x,
+                                               struct ggml_tensor* shift,
+                                               struct ggml_tensor* scale) {
+    // x: [N, L, C]
+    // scale: [N, C]
+    // shift: [N, C]
+    struct ggml_tensor* scale_3d = ggml_reshape_3d(ctx, scale, scale->ne[0], 1, scale->ne[1]);  // [N, 1, C]
+    struct ggml_tensor* shift_3d = ggml_reshape_3d(ctx, shift, shift->ne[0], 1, shift->ne[1]);  // [N, 1, C]
+
+    struct ggml_tensor* scaled = ggml_mul(ctx, x, scale_3d);
+    struct ggml_tensor* result = ggml_add(ctx, x, scaled);
+    return ggml_add(ctx, result, shift_3d);
+}
+
+struct DismantledBlock : public GGMLBlock {
+    // DiT block conditioned through gated adaptive layer norm (adaLN).
+    // A pre_only block has no norm2/mlp and only produces q/k/v (2 modulation
+    // vectors instead of 6).
+public:
+    int64_t num_heads;
+    bool pre_only;
+
+public:
+    DismantledBlock(int64_t hidden_size,
+                    int64_t num_heads,
+                    float mlp_ratio     = 4.0,
+                    std::string qk_norm = "",
+                    bool qkv_bias       = false,
+                    bool pre_only       = false)
+        : num_heads(num_heads), pre_only(pre_only) {
+        // rmsnorm is always False
+        // scale_mod_only is always False
+        // swiglu is always False
+        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
+        blocks["attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only));
+
+        if (!pre_only) {
+            blocks["norm2"]        = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
+            int64_t mlp_hidden_dim = (int64_t)(hidden_size * mlp_ratio);
+            blocks["mlp"]          = std::shared_ptr<GGMLBlock>(new Mlp(hidden_size, mlp_hidden_dim));
+        }
+
+        // shift/scale for attention only, or shift/scale/gate for both attention and mlp
+        const int64_t n_mods         = pre_only ? 2 : 6;
+        blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, n_mods * hidden_size));
+    }
+
+    // Computes the modulation vectors from c, modulates norm1(x) and returns
+    //   first:  {q, k, v} from the attention block
+    //   second: {x, gate_msa, shift_mlp, scale_mlp, gate_mlp} for post_attention(),
+    //           or five NULLs when pre_only
+    std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(struct ggml_context* ctx,
+                                                                                                struct ggml_tensor* x,
+                                                                                                struct ggml_tensor* c) {
+        // x: [N, n_token, hidden_size]
+        // c: [N, hidden_size]
+        auto norm1              = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
+        auto attn               = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
+        auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
+
+        const int64_t n_mods = pre_only ? 2 : 6;
+
+        auto mods = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));     // [N, n_mods * hidden_size]
+        mods      = ggml_reshape_3d(ctx, mods, c->ne[0], n_mods, c->ne[1]);  // [N, n_mods, hidden_size]
+        mods      = ggml_cont(ctx, ggml_permute(ctx, mods, 0, 2, 1, 3));     // [n_mods, N, hidden_size]
+
+        // byte distance between two consecutive modulation slices
+        const int64_t slice_bytes = mods->nb[1] * mods->ne[1];
+        auto slice                = [&](int64_t idx) {
+            return ggml_view_2d(ctx, mods, mods->ne[0], mods->ne[1], mods->nb[1], slice_bytes * idx);  // [N, hidden_size]
+        };
+
+        auto shift_msa = slice(0);
+        auto scale_msa = slice(1);
+
+        std::vector<struct ggml_tensor*> carried;
+        if (pre_only) {
+            carried = {NULL, NULL, NULL, NULL, NULL};
+        } else {
+            // x, gate_msa, shift_mlp, scale_mlp, gate_mlp
+            carried = {x, slice(2), slice(3), slice(4), slice(5)};
+        }
+
+        auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
+        auto qkv     = attn->pre_attention(ctx, attn_in);
+
+        return {qkv, carried};
+    }
+
+    // Finishes the block after attention: output projection, gated residual,
+    // then the modulated mlp with its own gated residual.
+    struct ggml_tensor* post_attention(struct ggml_context* ctx,
+                                       struct ggml_tensor* attn_out,
+                                       struct ggml_tensor* x,
+                                       struct ggml_tensor* gate_msa,
+                                       struct ggml_tensor* shift_mlp,
+                                       struct ggml_tensor* scale_mlp,
+                                       struct ggml_tensor* gate_mlp) {
+        // attn_out: [N, n_token, hidden_size]
+        // x: [N, n_token, hidden_size]
+        // gate_msa: [N, hidden_size]
+        // shift_mlp: [N, hidden_size]
+        // scale_mlp: [N, hidden_size]
+        // gate_mlp: [N, hidden_size]
+        // return: [N, n_token, hidden_size]
+        GGML_ASSERT(!pre_only);
+
+        auto attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["attn"]);
+        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
+        auto mlp   = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);
+
+        auto gate_msa_3d = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);  // [N, 1, hidden_size]
+        auto gate_mlp_3d = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);  // [N, 1, hidden_size]
+
+        auto projected = attn->post_attention(ctx, attn_out);
+
+        // x + gate_msa * attn
+        auto hidden = ggml_add(ctx, x, ggml_mul(ctx, projected, gate_msa_3d));
+
+        // hidden + gate_mlp * mlp(modulate(norm2(hidden)))
+        auto mlp_in  = modulate(ctx, norm2->forward(ctx, hidden), shift_mlp, scale_mlp);
+        auto mlp_out = mlp->forward(ctx, mlp_in);
+        return ggml_add(ctx, hidden, ggml_mul(ctx, mlp_out, gate_mlp_3d));
+    }
+
+    // Stand-alone pass through the block (pre_attention -> attention -> post_attention).
+    // post_attention() asserts !pre_only, so this is only usable on full blocks.
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* c) {
+        // x: [N, n_token, hidden_size]
+        // c: [N, hidden_size]
+        // return: [N, n_token, hidden_size]
+        auto pre    = pre_attention(ctx, x, c);
+        auto& qkv   = pre.first;
+        auto& saved = pre.second;
+
+        auto attn_out = ggml_nn_attention_ext(ctx, qkv[0], qkv[1], qkv[2], num_heads);  // [N, n_token, dim]
+        return post_attention(ctx,
+                              attn_out,
+                              saved[0],
+                              saved[1],
+                              saved[2],
+                              saved[3],
+                              saved[4]);  // [N, n_token, dim]
+    }
+};
+
+// Joint attention step shared by the context stream and the x stream:
+// both blocks produce q/k/v, the two token sequences are concatenated for a
+// single attention call, and the result is split back per stream.
+// Returns {context, x}; context is NULL when context_block is pre_only.
+__STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*> block_mixing(struct ggml_context* ctx,
+                                                                                   struct ggml_tensor* context,
+                                                                                   struct ggml_tensor* x,
+                                                                                   struct ggml_tensor* c,
+                                                                                   std::shared_ptr<DismantledBlock> context_block,
+                                                                                   std::shared_ptr<DismantledBlock> x_block) {
+    // context: [N, n_context, hidden_size]
+    // x: [N, n_token, hidden_size]
+    // c: [N, hidden_size]
+    auto context_pre    = context_block->pre_attention(ctx, context, c);
+    auto& context_qkv   = context_pre.first;
+    auto& context_saved = context_pre.second;
+
+    auto x_pre    = x_block->pre_attention(ctx, x, c);
+    auto& x_qkv   = x_pre.first;
+    auto& x_saved = x_pre.second;
+
+    // context tokens first, then x tokens, along the token dimension
+    auto q = ggml_concat(ctx, context_qkv[0], x_qkv[0], 1);
+    auto k = ggml_concat(ctx, context_qkv[1], x_qkv[1], 1);
+    auto v = ggml_concat(ctx, context_qkv[2], x_qkv[2], 1);
+
+    auto attn = ggml_nn_attention_ext(ctx, q, k, v, x_block->num_heads);  // [N, n_context + n_token, hidden_size]
+    attn      = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));      // [n_context + n_token, N, hidden_size]
+
+    // Takes n_tokens rows starting at byte_offset and restores the [N, n, hidden_size] layout.
+    auto take_tokens = [&](int64_t n_tokens, size_t byte_offset) {
+        auto part = ggml_view_3d(ctx,
+                                 attn,
+                                 attn->ne[0],
+                                 attn->ne[1],
+                                 n_tokens,
+                                 attn->nb[1],
+                                 attn->nb[2],
+                                 byte_offset);                       // [n, N, hidden_size]
+        return ggml_cont(ctx, ggml_permute(ctx, part, 0, 2, 1, 3));  // [N, n, hidden_size]
+    };
+
+    auto context_attn = take_tokens(context->ne[1], 0);                         // [N, n_context, hidden_size]
+    auto x_attn       = take_tokens(x->ne[1], attn->nb[2] * context->ne[1]);    // [N, n_token, hidden_size]
+
+    struct ggml_tensor* context_out = NULL;
+    if (!context_block->pre_only) {
+        context_out = context_block->post_attention(ctx,
+                                                    context_attn,
+                                                    context_saved[0],
+                                                    context_saved[1],
+                                                    context_saved[2],
+                                                    context_saved[3],
+                                                    context_saved[4]);
+    }
+
+    auto x_out = x_block->post_attention(ctx,
+                                         x_attn,
+                                         x_saved[0],
+                                         x_saved[1],
+                                         x_saved[2],
+                                         x_saved[3],
+                                         x_saved[4]);
+
+    return {context_out, x_out};
+}
+
+struct JointBlock : public GGMLBlock {
+    // Holds one DismantledBlock per stream ("context_block" and "x_block") and
+    // runs them together through block_mixing(). Only the context block can be
+    // pre_only; the x block is always a full block.
+public:
+    JointBlock(int64_t hidden_size,
+               int64_t num_heads,
+               float mlp_ratio     = 4.0,
+               std::string qk_norm = "",
+               bool qkv_bias       = false,
+               bool pre_only       = false) {
+        auto make_block = [&](bool block_pre_only) {
+            return std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, block_pre_only));
+        };
+
+        blocks["context_block"] = make_block(pre_only);
+        blocks["x_block"]       = make_block(false);
+    }
+
+    // Returns {context, x} as produced by block_mixing().
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+                                                                struct ggml_tensor* context,
+                                                                struct ggml_tensor* x,
+                                                                struct ggml_tensor* c) {
+        return block_mixing(ctx,
+                            context,
+                            x,
+                            c,
+                            std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]),
+                            std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]));
+    }
+};
+
+struct FinalLayer : public GGMLBlock {
+    // Last layer of the DiT: adaLN-modulated layer norm followed by a linear
+    // projection to patch_size * patch_size * out_channels values per token.
+public:
+    FinalLayer(int64_t hidden_size,
+               int64_t patch_size,
+               int64_t out_channels) {
+        // total_out_channels is always None
+        const int64_t patch_dim = patch_size * patch_size * out_channels;
+
+        blocks["norm_final"]         = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
+        blocks["linear"]             = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_dim, true, true));
+        blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* c) {
+        // x: [N, n_token, hidden_size]
+        // c: [N, hidden_size]
+        // return: [N, n_token, patch_size * patch_size * out_channels]
+        auto norm_final         = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_final"]);
+        auto linear             = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+        auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);
+
+        auto mods = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, 2 * hidden_size]
+        mods      = ggml_reshape_3d(ctx, mods, c->ne[0], 2, c->ne[1]);    // [N, 2, hidden_size]
+        mods      = ggml_cont(ctx, ggml_permute(ctx, mods, 0, 2, 1, 3));  // [2, N, hidden_size]
+
+        // slice 0 is the shift, slice 1 is the scale
+        const int64_t slice_bytes = mods->nb[1] * mods->ne[1];
+        auto shift                = ggml_view_2d(ctx, mods, mods->ne[0], mods->ne[1], mods->nb[1], slice_bytes * 0);  // [N, hidden_size]
+        auto scale                = ggml_view_2d(ctx, mods, mods->ne[0], mods->ne[1], mods->nb[1], slice_bytes * 1);  // [N, hidden_size]
+
+        auto modulated = modulate(ctx, norm_final->forward(ctx, x), shift, scale);
+        return linear->forward(ctx, modulated);
+    }
+};
+
+struct MMDiT : public GGMLBlock {
+    // Diffusion model with a Transformer backbone.
+    // The constructor picks the hyper-parameters from the SDVersion and builds
+    // x/t/y/context embedders, `depth` JointBlocks and a FinalLayer.
+protected:
+    SDVersion version                = VERSION_SD3_2B;
+    int64_t input_size               = -1;
+    int64_t patch_size               = 2;
+    int64_t in_channels              = 16;
+    int64_t depth                    = 24;
+    float mlp_ratio                  = 4.0f;
+    int64_t adm_in_channels          = 2048;
+    int64_t out_channels             = 16;
+    int64_t pos_embed_max_size       = 192;
+    int64_t num_patchs               = 36864;  // 192 * 192
+    int64_t context_size             = 4096;
+    int64_t context_embedder_out_dim = 1536;
+    int64_t hidden_size;
+    std::string qk_norm;
+
+    // Registers the "pos_embed" parameter as an F32 tensor of shape
+    // [1, num_patchs, hidden_size]; wtype is not used for it.
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        params["pos_embed"] = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hidden_size, num_patchs, 1);
+    }
+
+public:
+    MMDiT(SDVersion version = VERSION_SD3_2B)
+        : version(version) {
+        // input_size is always None
+        // learn_sigma is always False
+        // register_length is always 0
+        // rmsnorm is always False
+        // scale_mod_only is always False
+        // swiglu is always False
+        // qkv_bias is always True
+        // context_processor_layers is always None
+        // pos_embed_scaling_factor is not used
+        // pos_embed_offset is not used
+        // context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}
+        if (version == VERSION_SD3_2B) {
+            input_size               = -1;
+            patch_size               = 2;
+            in_channels              = 16;
+            depth                    = 24;
+            mlp_ratio                = 4.0f;
+            adm_in_channels          = 2048;
+            out_channels             = 16;
+            pos_embed_max_size       = 192;
+            num_patchs               = 36864;  // 192 * 192
+            context_size             = 4096;
+            context_embedder_out_dim = 1536;
+        } else if (version == VERSION_SD3_5_8B) {
+            input_size               = -1;
+            patch_size               = 2;
+            in_channels              = 16;
+            depth                    = 38;
+            mlp_ratio                = 4.0f;
+            adm_in_channels          = 2048;
+            out_channels             = 16;
+            pos_embed_max_size       = 192;
+            num_patchs               = 36864;  // 192 * 192
+            context_size             = 4096;
+            context_embedder_out_dim = 2432;
+            qk_norm                  = "rms";
+        }
+        // Any other version keeps the in-class defaults declared above.
+        // NOTE(review): default_out_channels is never read in this constructor.
+        int64_t default_out_channels = in_channels;
+        // Width and head count are both derived from depth (64 channels per head).
+        hidden_size                  = 64 * depth;
+        int64_t num_heads            = depth;
+
+        blocks["x_embedder"] = std::shared_ptr<GGMLBlock>(new PatchEmbed(input_size, patch_size, in_channels, hidden_size, true));
+        blocks["t_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedder(hidden_size));
+
+        if (adm_in_channels != -1) {
+            blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
+        }
+
+        // NOTE(review): the input width is the literal 4096 rather than context_size
+        // (both are 4096 for the versions handled above).
+        blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, context_embedder_out_dim, true, true));
+
+        // qkv_bias is passed as true; only the last joint block has a pre_only context block.
+        for (int i = 0; i < depth; i++) {
+            blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,
+                                                                                                    num_heads,
+                                                                                                    mlp_ratio,
+                                                                                                    qk_norm,
+                                                                                                    true,
+                                                                                                    i == depth - 1));
+        }
+
+        blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
+    }
+
+    // Cuts a centered h x w window (in patches) out of the
+    // pos_embed_max_size x pos_embed_max_size positional-embedding grid and
+    // returns it flattened to [1, h*w, hidden_size].
+    // h and w are given in input (pre-patch) units and are converted with
+    // (v + 1) / patch_size; both results must be in (0, pos_embed_max_size].
+    struct ggml_tensor* cropped_pos_embed(struct ggml_context* ctx,
+                                          int64_t h,
+                                          int64_t w) {
+        auto pos_embed = params["pos_embed"];
+
+        h = (h + 1) / patch_size;
+        w = (w + 1) / patch_size;
+
+        GGML_ASSERT(h <= pos_embed_max_size && h > 0);
+        GGML_ASSERT(w <= pos_embed_max_size && w > 0);
+
+        // first row / column of the centered window
+        int64_t top  = (pos_embed_max_size - h) / 2;
+        int64_t left = (pos_embed_max_size - w) / 2;
+
+        auto spatial_pos_embed = ggml_reshape_3d(ctx, pos_embed, hidden_size, pos_embed_max_size, pos_embed_max_size);
+
+        // spatial_pos_embed = spatial_pos_embed[:, top : top + h, left : left + w, :]
+        // Done in two steps: slice the rows, swap the two grid axes, slice the
+        // columns (now on the outer axis), then swap the axes back.
+        spatial_pos_embed = ggml_view_3d(ctx,
+                                         spatial_pos_embed,
+                                         hidden_size,
+                                         pos_embed_max_size,
+                                         h,
+                                         spatial_pos_embed->nb[1],
+                                         spatial_pos_embed->nb[2],
+                                         spatial_pos_embed->nb[2] * top);                      // [h, pos_embed_max_size, hidden_size]
+        spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3));  // [pos_embed_max_size, h, hidden_size]
+        spatial_pos_embed = ggml_view_3d(ctx,
+                                         spatial_pos_embed,
+                                         hidden_size,
+                                         h,
+                                         w,
+                                         spatial_pos_embed->nb[1],
+                                         spatial_pos_embed->nb[2],
+                                         spatial_pos_embed->nb[2] * left);                     // [w, h, hidden_size]
+        spatial_pos_embed = ggml_cont(ctx, ggml_permute(ctx, spatial_pos_embed, 0, 2, 1, 3));  // [h, w, hidden_size]
+        spatial_pos_embed = ggml_reshape_3d(ctx, spatial_pos_embed, hidden_size, h * w, 1);    // [1, h*w, hidden_size]
+        return spatial_pos_embed;
+    }
+
+    // Rearranges per-token patch values back into an image-shaped tensor.
+    // h and w are given in input (pre-patch) units and converted to patch
+    // counts the same way as in cropped_pos_embed(); h * w must equal the
+    // number of tokens in x.
+    struct ggml_tensor* unpatchify(struct ggml_context* ctx,
+                                   struct ggml_tensor* x,
+                                   int64_t h,
+                                   int64_t w) {
+        // x: [N, H*W, patch_size * patch_size * C]
+        // return: [N, C, H, W]
+        int64_t n = x->ne[2];
+        int64_t c = out_channels;
+        int64_t p = patch_size;
+        h         = (h + 1) / p;
+        w         = (w + 1) / p;
+
+        GGML_ASSERT(h * w == x->ne[1]);
+
+        x = ggml_reshape_4d(ctx, x, c, p * p, w * h, n);       // [N, H*W, P*P, C]
+        x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3));  // [N, C, H*W, P*P]
+        x = ggml_reshape_4d(ctx, x, p, p, w, h * c * n);       // [N*C*H, W, P, P]
+        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*H, P, W, P]
+        x = ggml_reshape_4d(ctx, x, p * w, p * h, c, n);       // [N, C, H*P, W*P]
+        return x;
+    }
+
+    // Runs x and context through all joint blocks, then applies the final layer to x.
+    // Each block returns the updated {context, x}; the last block's context
+    // block is pre_only, so context is NULL after it.
+    struct ggml_tensor* forward_core_with_concat(struct ggml_context* ctx,
+                                                 struct ggml_tensor* x,
+                                                 struct ggml_tensor* c_mod,
+                                                 struct ggml_tensor* context) {
+        // x: [N, H*W, hidden_size]
+        // context: [N, n_context, d_context]
+        // c_mod: [N, hidden_size]
+        // return: [N, H*W, patch_size * patch_size * out_channels]
+        auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
+
+        for (int i = 0; i < depth; i++) {
+            auto block = std::dynamic_pointer_cast<JointBlock>(blocks["joint_blocks." + std::to_string(i)]);
+
+            auto context_x = block->forward(ctx, context, x, c_mod);
+            context        = context_x.first;
+            x              = context_x.second;
+        }
+
+        x = final_layer->forward(ctx, x, c_mod);  // (N, T, patch_size ** 2 * out_channels)
+
+        return x;
+    }
+
+    // NOTE(review): context defaults to NULL, but block_mixing() reads
+    // context->ne[1] unconditionally; confirm callers always pass a context.
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* t,
+                                struct ggml_tensor* y       = NULL,
+                                struct ggml_tensor* context = NULL) {
+        // Forward pass of DiT.
+        // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        // t: (N,) tensor of diffusion timesteps
+        // y: (N, adm_in_channels) tensor of class labels
+        // context: (N, L, D)
+        // return: (N, C, H, W)
+        auto x_embedder = std::dynamic_pointer_cast<PatchEmbed>(blocks["x_embedder"]);
+        auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
+
+        // remember the spatial size before x is replaced by its patch embedding
+        int64_t w = x->ne[0];
+        int64_t h = x->ne[1];
+
+        auto patch_embed = x_embedder->forward(ctx, x);            // [N, H*W, hidden_size]
+        auto pos_embed   = cropped_pos_embed(ctx, h, w);           // [1, H*W, hidden_size]
+        x                = ggml_add(ctx, patch_embed, pos_embed);  // [N, H*W, hidden_size]
+
+        // conditioning vector: timestep embedding, plus the y embedding when available
+        auto c = t_embedder->forward(ctx, t);  // [N, hidden_size]
+        if (y != NULL && adm_in_channels != -1) {
+            auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);
+
+            y = y_embedder->forward(ctx, y);  // [N, hidden_size]
+            c = ggml_add(ctx, c, y);
+        }
+
+        if (context != NULL) {
+            auto context_embedder = std::dynamic_pointer_cast<Linear>(blocks["context_embedder"]);
+
+            context = context_embedder->forward(ctx, context);  // [N, L, D] aka [N, L, 1536]
+        }
+
+        x = forward_core_with_concat(ctx, x, c, context);  // (N, H*W, patch_size ** 2 * out_channels)
+
+        x = unpatchify(ctx, x, h, w);  // [N, C, H, W]
+
+        return x;
+    }
+};
+
+// GGMLRunner wrapper around MMDiT: owns the model, builds its compute graph
+// and exposes compute() plus two self-test helpers.
+struct MMDiTRunner : public GGMLRunner {
+    MMDiT mmdit;
+
+    MMDiTRunner(ggml_backend_t backend,
+                ggml_type wtype,
+                SDVersion version = VERSION_SD3_2B)
+        : GGMLRunner(backend, wtype), mmdit(version) {
+        mmdit.init(params_ctx, wtype);
+    }
+
+    std::string get_desc() {
+        return "mmdit";
+    }
+
+    // Collects the model's parameter tensors into `tensors`, keyed under `prefix`.
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+        mmdit.get_param_tensors(tensors, prefix);
+    }
+
+    // Builds the forward graph for one MMDiT evaluation in compute_ctx.
+    struct ggml_cgraph* build_graph(struct ggml_tensor* x,
+                                    struct ggml_tensor* timesteps,
+                                    struct ggml_tensor* context,
+                                    struct ggml_tensor* y) {
+        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false);
+
+        x         = to_backend(x);
+        context   = to_backend(context);
+        y         = to_backend(y);
+        timesteps = to_backend(timesteps);
+
+        // note the argument order of MMDiT::forward: (x, t, y, context)
+        struct ggml_tensor* out = mmdit.forward(compute_ctx,
+                                                x,
+                                                timesteps,
+                                                y,
+                                                context);
+
+        ggml_build_forward_expand(gf, out);
+
+        return gf;
+    }
+
+    // Evaluates the model; the result is handed back through `output`
+    // (allocated in `output_ctx`) by GGMLRunner::compute.
+    void compute(int n_threads,
+                 struct ggml_tensor* x,
+                 struct ggml_tensor* timesteps,
+                 struct ggml_tensor* context,
+                 struct ggml_tensor* y,
+                 struct ggml_tensor** output     = NULL,
+                 struct ggml_context* output_ctx = NULL) {
+        // x: [N, in_channels, h, w]
+        // timesteps: [N, ]
+        // context: [N, max_position, hidden_size]([N, 154, 4096]) or [1, max_position, hidden_size]
+        // y: [N, adm_in_channels] or [1, adm_in_channels]
+        auto get_graph = [&]() -> struct ggml_cgraph* {
+            return build_graph(x, timesteps, context, y);
+        };
+
+        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+    }
+
+    // Smoke test: runs one forward pass on constant inputs and prints the result.
+    void test() {
+        struct ggml_init_params params;
+        params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+        params.mem_buffer = NULL;
+        params.no_alloc   = false;
+
+        struct ggml_context* work_ctx = ggml_init(params);
+        GGML_ASSERT(work_ctx != NULL);
+
+        {
+            // cpu f16: pass
+            // cpu f32: pass
+            // cuda f16: pass
+            // cuda f32: pass
+            auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 128, 128, 16, 1);
+            std::vector<float> timesteps_vec(1, 999.f);
+            auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
+            ggml_set_f32(x, 0.01f);
+            // print_ggml_tensor(x);
+
+            auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 4096, 154, 1);
+            ggml_set_f32(context, 0.01f);
+            // print_ggml_tensor(context);
+
+            auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 2048, 1);
+            ggml_set_f32(y, 0.01f);
+            // print_ggml_tensor(y);
+
+            struct ggml_tensor* out = NULL;
+
+            // Keep the full 64-bit timestamps and narrow only the (small) difference.
+            const int64_t t0 = ggml_time_ms();
+            compute(8, x, timesteps, context, y, &out, work_ctx);
+            const int64_t t1 = ggml_time_ms();
+
+            print_ggml_tensor(out);
+            LOG_DEBUG("mmdit test done in %dms", (int)(t1 - t0));
+        }
+
+        // All test tensors, including `out`, live in work_ctx; release it once
+        // the result has been printed.
+        ggml_free(work_ctx);
+    }
+
+    // Loads the MMDiT weights (prefix "model.diffusion_model") from file_path
+    // on the CPU backend with F16 weights, then runs test().
+    static void load_from_file_and_test(const std::string& file_path) {
+        // ggml_backend_t backend    = ggml_backend_cuda_init(0);
+        ggml_backend_t backend             = ggml_backend_cpu_init();
+        ggml_type model_data_type          = GGML_TYPE_F16;
+        std::shared_ptr<MMDiTRunner> mmdit = std::shared_ptr<MMDiTRunner>(new MMDiTRunner(backend, model_data_type));
+        {
+            LOG_INFO("loading from '%s'", file_path.c_str());
+
+            mmdit->alloc_params_buffer();
+            std::map<std::string, ggml_tensor*> tensors;
+            mmdit->get_param_tensors(tensors, "model.diffusion_model");
+
+            ModelLoader model_loader;
+            if (!model_loader.init_from_file(file_path)) {
+                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+                return;
+            }
+
+            bool success = model_loader.load_tensors(tensors, backend);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
+                return;
+            }
+
+            LOG_INFO("mmdit model loaded");
+        }
+        mmdit->test();
+    }
+};
+
+#endif
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
@ -21,6 +21,10 @@
 #include "ggml-metal.h"
 #endif

+#ifdef SD_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #define ST_HEADER_SIZE_LEN 8

 uint64_t read_u64(uint8_t* buffer) {
@ -161,6 +165,10 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) {
        prefix   = new_name.substr(0, new_name.size() - strlen("vision_model.visual_projection.weight"));
        new_name = prefix + "visual_projection.weight";
        return new_name;
+    } else if (ends_with(new_name, "transformer.text_projection.weight")) {
+        prefix   = new_name.substr(0, new_name.size() - strlen("transformer.text_projection.weight"));
+        new_name = prefix + "transformer.text_model.text_projection";
+        return new_name;
    } else {
        return new_name;
    }
@ -418,9 +426,20 @@ std::string convert_diffusers_name_to_compvis(std::string key, char seq) {
    return key;
 }

-std::string convert_tensor_name(const std::string& name) {
+std::string convert_tensor_name(std::string name) {
+    if (starts_with(name, "diffusion_model")) {
+        name = "model." + name;
+    }
+    // size_t pos = name.find("lora_A");
+    // if (pos != std::string::npos) {
+    //     name.replace(pos, strlen("lora_A"), "lora_up");
+    // }
+    // pos = name.find("lora_B");
+    // if (pos != std::string::npos) {
+    //     name.replace(pos, strlen("lora_B"), "lora_down");
+    // }
    std::string new_name = name;
-    if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.") || ends_with(name, ".vision_model.visual_projection.weight")) {
+    if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.") || starts_with(name, "text_encoders.") || ends_with(name, ".vision_model.visual_projection.weight")) {
        new_name = convert_open_clip_to_hf_clip(name);
    } else if (starts_with(name, "first_stage_model.decoder")) {
        new_name = convert_vae_decoder_name(name);
@ -455,6 +474,9 @@ std::string convert_tensor_name(const std::string& name) {
        if (pos != std::string::npos) {
            new_name.replace(pos, strlen(".processor"), "");
        }
+        // if (starts_with(new_name, "transformer.transformer_blocks") || starts_with(new_name, "transformer.single_transformer_blocks")) {
+        //     new_name = "model.diffusion_model." + new_name;
+        // }
        pos = new_name.rfind("lora");
        if (pos != std::string::npos) {
            std::string name_without_network_parts = new_name.substr(0, pos - 1);
@ -550,6 +572,48 @@ float bf16_to_f32(uint16_t bfloat16) {
    return *reinterpret_cast<float*>(&val_bits);
 }

+// Converts one FP8 E4M3 value (1 sign bit, 4 exponent bits, 3 mantissa bits,
+// exponent bias 7) to a half-precision bit pattern. The value is first
+// rebuilt as a float32 bit pattern and then narrowed with ggml_fp32_to_fp16.
+//
+// Only the encodings 0x7f and 0xff are mapped to NaN; every other input,
+// including the remaining exponent-15 encodings, is decoded as a finite
+// number. An input with zero exponent and zero mantissa yields a signed zero.
+// The "uz" FP8 variants are not handled here.
+uint16_t f8_e4m3_to_f16(uint8_t f8) {
+    const uint32_t exponent_bias = 7;
+    if (f8 == 0xff) {
+        return ggml_fp32_to_fp16(-NAN);
+    } else if (f8 == 0x7f) {
+        return ggml_fp32_to_fp16(NAN);
+    }
+
+    uint32_t sign     = f8 & 0x80;
+    uint32_t exponent = (f8 & 0x78) >> 3;
+    uint32_t mantissa = f8 & 0x07;
+    uint32_t result   = sign << 24;  // sign moves from bit 7 to bit 31
+    if (exponent == 0) {
+        if (mantissa > 0) {
+            // Subnormal input: start one step below the smallest normal
+            // exponent and shift the mantissa left until its leading 1 sits
+            // in bit 2, lowering the exponent once per shift.
+            exponent = 0x7f - exponent_bias;
+
+            // A non-zero 3-bit mantissa needs at most two shifts.
+            for (int shift = 0; shift < 2 && (mantissa & 0x04) == 0; ++shift) {
+                mantissa = (mantissa & 0x03) << 1;
+                exponent -= 1;
+            }
+
+            // Bit 2 is now the implicit leading 1; only the two bits below
+            // it are stored, at the top of the float32 mantissa field.
+            result |= (mantissa & 0x03) << 21;
+            result |= exponent << 23;
+        }
+    } else {
+        // Normal input: the 3 mantissa bits go to the top of the 23-bit
+        // float32 mantissa and the exponent is re-biased from 7 to 127.
+        result |= mantissa << 20;
+        exponent += 0x7f - exponent_bias;
+        result |= exponent << 23;
+    }
+
+    // Copy the bit pattern instead of dereferencing a float* that points at
+    // a uint32_t, which would violate the strict-aliasing rule.
+    float as_f32;
+    memcpy(&as_f32, &result, sizeof(as_f32));
+    return ggml_fp32_to_fp16(as_f32);
+}
+
 void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) {
    // support inplace op
    for (int64_t i = n - 1; i >= 0; i--) {
@ -557,6 +621,13 @@ void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) {
    }
 }

+// Expands n FP8 E4M3 bytes from src into n half-precision values in dst.
+// Elements are processed from the last index down to 0 so that src and dst
+// may refer to the same buffer: each output is twice as wide as its input,
+// and walking backwards never overwrites a byte that has not been read yet.
+void f8_e4m3_to_f16_vec(uint8_t* src, uint16_t* dst, int64_t n) {
+    int64_t remaining = n;
+    while (remaining-- > 0) {
+        dst[remaining] = f8_e4m3_to_f16(src[remaining]);
+    }
+}
+
 void convert_tensor(void* src,
                    ggml_type src_type,
                    void* dst,
@ -571,7 +642,6 @@ void convert_tensor(void* src,
        if (dst_type == GGML_TYPE_F16) {
            ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
        } else {
-            int64_t hist[16];
            std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix
            const float* im = imatrix.data();
            ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im);
@ -602,7 +672,6 @@ void convert_tensor(void* src,
        if (dst_type == GGML_TYPE_F16) {
            ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
        } else {
-            int64_t hist[16];
            std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix
            const float* im = imatrix.data();
            ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im);
@ -739,8 +808,8 @@ bool ModelLoader::init_from_file(const std::string& file_path, const std::string
    // else if (is_zip_file(file_path)) {
    //     LOG_INFO("load %s using checkpoint format", file_path.c_str());
    //     return init_from_ckpt_file(file_path, prefix);
-    // } else
-    {
+    // }
+    else {
        LOG_WARN("unknown format %s", file_path.c_str());
        return false;
    }
@ -795,6 +864,8 @@ ggml_type str_to_ggml_type(const std::string& dtype) {
        ttype = GGML_TYPE_F32;
    } else if (dtype == "F32") {
        ttype = GGML_TYPE_F32;
+    } else if (dtype == "F8_E4M3") {
+        ttype = GGML_TYPE_F16;
    }
    return ttype;
 }
@ -867,7 +938,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const

        ggml_type type = str_to_ggml_type(dtype);
        if (type == GGML_TYPE_COUNT) {
-            LOG_ERROR("unsupported dtype '%s'", dtype.c_str());
+            LOG_ERROR("unsupported dtype '%s' (tensor '%s')", dtype.c_str(), name.c_str());
            return false;
        }

@ -904,6 +975,10 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
        if (dtype == "BF16") {
            tensor_storage.is_bf16 = true;
            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+        } else if (dtype == "F8_E4M3") {
+            tensor_storage.is_f8_e4m3 = true;
+            // f8 -> f16
+            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
        } else {
            GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size);
        }
@ -1292,12 +1367,26 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s

 SDVersion ModelLoader::get_sd_version() {
    TensorStorage token_embedding_weight;
+    bool is_flux = false;
+    bool is_sd3  = false;
    for (auto& tensor_storage : tensor_storages) {
+        if (tensor_storage.name.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
+            return VERSION_FLUX_DEV;
+        }
+        if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
+            is_flux = true;
+        }
+        if (tensor_storage.name.find("joint_blocks.37.x_block.attn.ln_q.weight") != std::string::npos) {
+            return VERSION_SD3_5_8B;
+        }
+        if (tensor_storage.name.find("model.diffusion_model.joint_blocks.23.") != std::string::npos) {
+            is_sd3 = true;
+        }
        if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) {
-            return VERSION_XL;
+            return VERSION_SDXL;
        }
        if (tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
-            return VERSION_XL;
+            return VERSION_SDXL;
        }
        if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
            return VERSION_SVD;
@ -1313,10 +1402,16 @@ SDVersion ModelLoader::get_sd_version() {
            // break;
        }
    }
+    if (is_flux) {
+        return VERSION_FLUX_SCHNELL;
+    }
+    if (is_sd3) {
+        return VERSION_SD3_2B;
+    }
    if (token_embedding_weight.ne[0] == 768) {
-        return VERSION_1_x;
+        return VERSION_SD1;
    } else if (token_embedding_weight.ne[0] == 1024) {
-        return VERSION_2_x;
+        return VERSION_SD2;
    }
    return VERSION_COUNT;
 }
@ -1327,8 +1422,78 @@ ggml_type ModelLoader::get_sd_wtype() {
            continue;
        }

-        if (tensor_storage.name.find(".weight") != std::string::npos &&
-            tensor_storage.name.find("time_embed") != std::string::npos) {
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
+            return tensor_storage.type;
+        }
+    }
+    return GGML_TYPE_COUNT;
+}
+
+// Returns the storage type of the first conditioner (text encoder) tensor
+// that is either already quantized or accepted by tensor_should_be_converted
+// for a Q4_K target. Tensors flagged by is_unused_tensor, and tensors whose
+// name contains none of the conditioner markers, are ignored.
+// Returns GGML_TYPE_COUNT when no tensor qualifies.
+ggml_type ModelLoader::get_conditioner_wtype() {
+    // True when the tensor name carries any of the known conditioner markers.
+    const auto belongs_to_conditioner = [](const std::string& tensor_name) {
+        return tensor_name.find("text_encoders") != std::string::npos ||
+               tensor_name.find("cond_stage_model") != std::string::npos ||
+               tensor_name.find("te.text_model.") != std::string::npos ||
+               tensor_name.find("conditioner") != std::string::npos;
+    };
+
+    for (auto& candidate : tensor_storages) {
+        if (is_unused_tensor(candidate.name) || !belongs_to_conditioner(candidate.name)) {
+            continue;
+        }
+
+        if (ggml_is_quantized(candidate.type) ||
+            tensor_should_be_converted(candidate, GGML_TYPE_Q4_K)) {
+            return candidate.type;
+        }
+    }
+    return GGML_TYPE_COUNT;
+}
+
+// Returns the storage type of the first tensor under "model.diffusion_model."
+// that is either already quantized or accepted by tensor_should_be_converted
+// for a Q4_K target. Tensors flagged by is_unused_tensor are ignored.
+// Returns GGML_TYPE_COUNT when no tensor qualifies.
+ggml_type ModelLoader::get_diffusion_model_wtype() {
+    for (auto& candidate : tensor_storages) {
+        const bool is_diffusion_tensor =
+            !is_unused_tensor(candidate.name) &&
+            candidate.name.find("model.diffusion_model.") != std::string::npos;
+        if (!is_diffusion_tensor) {
+            continue;
+        }
+
+        if (ggml_is_quantized(candidate.type) ||
+            tensor_should_be_converted(candidate, GGML_TYPE_Q4_K)) {
+            return candidate.type;
+        }
+    }
+    return GGML_TYPE_COUNT;
+}
+
+ggml_type ModelLoader::get_vae_wtype() {
+    for (auto& tensor_storage : tensor_storages) {
+        if (is_unused_tensor(tensor_storage.name)) {
+            continue;
+        }
+
+        if (tensor_storage.name.find("vae.") == std::string::npos &&
+            tensor_storage.name.find("first_stage_model") == std::string::npos) {
+            continue;
+        }
+
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
            return tensor_storage.type;
        }
    }
@ -1340,6 +1505,11 @@ std::string ModelLoader::load_merges() {
    return merges_utf8_str;
 }

+// Copies the bytes of t5_tokenizer_json_str into a std::string. The length
+// comes from sizeof, so every byte of that object is included in the result.
+std::string ModelLoader::load_t5_tokenizer_json() {
+    const char* json_bytes = reinterpret_cast<const char*>(t5_tokenizer_json_str);
+    return std::string(json_bytes, sizeof(t5_tokenizer_json_str));
+}
+
 std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& vec) {
    std::vector<TensorStorage> res;
    std::unordered_map<std::string, size_t> name_to_index_map;
@ -1459,6 +1629,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                    if (tensor_storage.is_bf16) {
                        // inplace op
                        bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
+                    } else if (tensor_storage.is_f8_e4m3) {
+                        // inplace op
+                        f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements());
                    }
                } else {
                    read_buffer.resize(tensor_storage.nbytes());
@ -1467,6 +1640,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                    if (tensor_storage.is_bf16) {
                        // inplace op
                        bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                    } else if (tensor_storage.is_f8_e4m3) {
+                        // inplace op
+                        f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
                    }

                    convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
@ -1479,6 +1655,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                if (tensor_storage.is_bf16) {
                    // inplace op
                    bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+                } else if (tensor_storage.is_f8_e4m3) {
+                    // inplace op
+                    f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements());
                }

                if (tensor_storage.type == dst_tensor->type) {
@ -1579,6 +1758,37 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
    return true;
 }

+// Decides whether a tensor should be stored as `type` instead of its current
+// type. Returns false when no target type is requested (GGML_TYPE_COUNT),
+// when a quantized target's block size does not divide ne[0], and for the
+// name patterns listed below; returns true for every other tensor.
+bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
+    if (type == GGML_TYPE_COUNT) {
+        return false;
+    }
+
+    // A quantized type needs the first dimension to be a whole number of blocks.
+    if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
+        return false;
+    }
+
+    const std::string& tensor_name = tensor_storage.name;
+
+    // Biases and scales keep their original type.
+    if (ends_with(tensor_name, ".bias") || ends_with(tensor_name, ".scale")) {
+        return false;
+    }
+
+    // FLUX tensors that are left unconverted.
+    if (contains(tensor_name, "img_in.") ||
+        contains(tensor_name, "txt_in.") ||
+        contains(tensor_name, "time_in.") ||
+        contains(tensor_name, "vector_in.") ||
+        contains(tensor_name, "guidance_in.") ||
+        contains(tensor_name, "final_layer.")) {
+        return false;
+    }
+
+    // MMDiT tensors that are left unconverted.
+    if (contains(tensor_name, "x_embedder.") ||
+        contains(tensor_name, "t_embedder.") ||
+        contains(tensor_name, "y_embedder.") ||
+        contains(tensor_name, "pos_embed") ||
+        contains(tensor_name, "context_embedder.")) {
+        return false;
+    }
+
+    // Unet tensors that are left unconverted.
+    if (contains(tensor_name, "time_embed.") || contains(tensor_name, "label_emb.")) {
+        return false;
+    }
+
+    return true;
+}
+
 bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
    auto backend    = ggml_backend_cpu_init();
    size_t mem_size = 1 * 1024 * 1024;  // for padding
@ -1593,12 +1803,8 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
        const std::string& name = tensor_storage.name;

        ggml_type tensor_type = tensor_storage.type;
-        if (type != GGML_TYPE_COUNT) {
-            if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) {
-                tensor_type = GGML_TYPE_F16;
-            } else {
-                tensor_type = type;
-            }
+        if (tensor_should_be_converted(tensor_storage, type)) {
+            tensor_type = type;
        }

        ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
@ -1648,15 +1854,9 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
    }

    for (auto& tensor_storage : processed_tensor_storages) {
-        ggml_type tensor_type = tensor_storage.type;
-        if (type != GGML_TYPE_COUNT) {
-            if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) {
-                tensor_type = GGML_TYPE_F16;
-            } else {
-                tensor_type = type;
-            }
+        if (tensor_should_be_converted(tensor_storage, type)) {
+            tensor_storage.type = type;
        }
-        tensor_storage.type = tensor_type;
        mem_size += tensor_storage.nbytes() + alignment;
    }

@ -1679,4 +1879,4 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa
    }
    bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type);
    return success;
-}
+}
--- a/otherarch/sdcpp/model.h
+++ b/otherarch/sdcpp/model.h
@ -7,6 +7,7 @@
 #include <set>
 #include <sstream>
 #include <string>
+#include <tuple>
 #include <vector>

 #include "ggml-backend.h"
@ -17,10 +18,14 @@
 #define SD_MAX_DIMS 5

 enum SDVersion {
-    VERSION_1_x,
-    VERSION_2_x,
-    VERSION_XL,
+    VERSION_SD1,   // get_sd_version(): token embedding ne[0] == 768
+    VERSION_SD2,   // get_sd_version(): token embedding ne[0] == 1024
+    VERSION_SDXL,  // get_sd_version(): a "conditioner.embedders.1" or "cond_stage_model.1" tensor exists
    VERSION_SVD,  // get_sd_version(): an "input_blocks.8.0.time_mixer.mix_factor" tensor exists
+    VERSION_SD3_2B,        // get_sd_version(): "joint_blocks.23." seen and no earlier return fired
+    VERSION_FLUX_DEV,      // get_sd_version(): a "guidance_in.in_layer.weight" tensor exists
+    VERSION_FLUX_SCHNELL,  // get_sd_version(): "double_blocks." seen and no earlier return fired
+    VERSION_SD3_5_8B,      // get_sd_version(): a "joint_blocks.37.x_block.attn.ln_q.weight" tensor exists
    VERSION_COUNT,  // returned by get_sd_version() when no version was recognised
 };

@ -28,6 +33,7 @@ struct TensorStorage {
    std::string name;
    ggml_type type          = GGML_TYPE_F32;
    bool is_bf16            = false;
+    bool is_f8_e4m3         = false;
    int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
    int n_dims              = 0;

@ -57,7 +63,7 @@ struct TensorStorage {
    }

    int64_t nbytes_to_read() const {
-        if (is_bf16) {
+        if (is_bf16 || is_f8_e4m3) {
            return nbytes() / 2;
        } else {
            return nbytes();
@ -105,6 +111,8 @@ struct TensorStorage {
        const char* type_name = ggml_type_name(type);
        if (is_bf16) {
            type_name = "bf16";
+        } else if (is_f8_e4m3) {
+            type_name = "f8_e4m3";
        }
        ss << name << " | " << type_name << " | ";
        ss << n_dims << " [";
@ -142,13 +150,20 @@ public:
    bool init_from_file(const std::string& file_path, const std::string& prefix = "");
    SDVersion get_sd_version();
    ggml_type get_sd_wtype();
-    std::string load_merges();
+    ggml_type get_conditioner_wtype();
+    ggml_type get_diffusion_model_wtype();
+    ggml_type get_vae_wtype();
    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
    bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                      ggml_backend_t backend,
                      std::set<std::string> ignore_tensors = {});
    bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+    bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
    ~ModelLoader() = default;
+
+    static std::string load_merges();
+    static std::string load_t5_tokenizer_json();
 };
-#endif  // __MODEL_H__
+
+#endif  // __MODEL_H__
--- a/otherarch/sdcpp/pmid.hpp
+++ b/otherarch/sdcpp/pmid.hpp
@ -0,0 +1,295 @@
+#ifndef __PMI_HPP__
+#define __PMI_HPP__
+
+#include "ggml_extend.hpp"
+
+#include "clip.hpp"
+#include "lora.hpp"
+
+// Small MLP block: LayerNorm -> fc1 -> GELU -> fc2, with an optional skip
+// connection that adds the block input back onto the result.
+struct FuseBlock : public GGMLBlock {
+    // Layer sizes: LayerNorm and fc1 take in_dim, fc1 produces hidden_dim,
+    // fc2 maps hidden_dim to out_dim.
+    int in_dim;
+    int out_dim;
+    int hidden_dim;
+    // When true, forward() adds its input to the fc2 output.
+    bool use_residue;
+
+public:
+    // i_d / o_d / h_d are the input, output and hidden widths.
+    FuseBlock(int i_d, int o_d, int h_d, bool use_residue = true)
+        : in_dim(i_d), out_dim(o_d), hidden_dim(h_d), use_residue(use_residue) {
+        blocks["fc1"]       = std::make_shared<Linear>(in_dim, hidden_dim, true);
+        blocks["fc2"]       = std::make_shared<Linear>(hidden_dim, out_dim, true);
+        blocks["layernorm"] = std::make_shared<LayerNorm>(in_dim);
+    }
+
+    // Appends the block's ops to the graph in ctx and returns the output tensor.
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        auto norm     = std::dynamic_pointer_cast<LayerNorm>(blocks["layernorm"]);
+        auto proj_in  = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
+        auto proj_out = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
+
+        struct ggml_tensor* skip   = x;
+        struct ggml_tensor* hidden = norm->forward(ctx, x);
+        hidden                     = proj_in->forward(ctx, hidden);
+        hidden                     = ggml_gelu_inplace(ctx, hidden);
+        hidden                     = proj_out->forward(ctx, hidden);
+
+        return use_residue ? ggml_add(ctx, hidden, skip) : hidden;
+    }
+};
+
+struct FuseModule : public GGMLBlock {  // fuses ID-image embeddings into the prompt embeddings at the class-token rows
+    // width passed to the final LayerNorm; both FuseBlocks are sized from the same constructor argument
+    int embed_dim;
+
+public:
+    FuseModule(int imb_d)
+        : embed_dim(imb_d) {
+        blocks["mlp1"]       = std::shared_ptr<GGMLBlock>(new FuseBlock(imb_d * 2, imb_d, imb_d, false));  // input twice as wide, no residual
+        blocks["mlp2"]       = std::shared_ptr<GGMLBlock>(new FuseBlock(imb_d, imb_d, imb_d, true));       // same width in and out, with residual
+        blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim));
+    }
+
+    struct ggml_tensor* fuse_fn(struct ggml_context* ctx,  // concat(prompt, id) -> mlp1 -> + prompt_embeds -> mlp2 -> layer_norm
+                                struct ggml_tensor* prompt_embeds,
+                                struct ggml_tensor* id_embeds) {
+        auto mlp1       = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]);
+        auto mlp2       = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]);
+        auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]);
+
+        auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
+        auto id_embeds0     = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
+        // join the two permuted tensors along dim 2; NOTE(review): presumably this is the feature axis moved by the permute, matching mlp1's doubled input width - confirm against ggml_permute
+        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
+        stacked_id_embeds      = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));  // permute back after the concat
+
+        // Pipeline applied below:
+        //   1. mlp1 on the concatenated tensor
+        //   2. add the original prompt_embeds (explicit skip connection, since mlp1 has no residual)
+        //   3. mlp2, then the final layer norm
+
+        stacked_id_embeds = mlp1->forward(ctx, stacked_id_embeds);
+        stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
+        stacked_id_embeds = mlp2->forward(ctx, stacked_id_embeds);
+        stacked_id_embeds = layer_norm->forward(ctx, stacked_id_embeds);
+
+        return stacked_id_embeds;
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx,  // returns prompt_embeds * class_tokens_mask + padded fused embeddings
+                                struct ggml_tensor* prompt_embeds,
+                                struct ggml_tensor* id_embeds,
+                                struct ggml_tensor* class_tokens_mask,
+                                struct ggml_tensor* class_tokens_mask_pos,
+                                struct ggml_tensor* left,
+                                struct ggml_tensor* right) {
+        // left / right may be null; when present they are concatenated before / after the fused embeddings
+
+        struct ggml_tensor* valid_id_embeds = id_embeds;
+        // gather the prompt rows selected by class_tokens_mask_pos (the image-token embeddings)
+        // the tensors are named first, presumably to make them identifiable when debugging the graph
+        ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos");
+        ggml_set_name(prompt_embeds, "prompt_embeds");
+        // debug aid: print_ggml_tensor(valid_id_embeds, true, "valid_id_embeds");
+        // debug aid: print_ggml_tensor(class_tokens_mask_pos, true, "class_tokens_mask_pos");
+        struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx, prompt_embeds, class_tokens_mask_pos);
+        ggml_set_name(image_token_embeds, "image_token_embeds");
+        struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);
+
+        stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));  // swap dims 1 and 2 so the padding can be concatenated along dim 2
+        if (left && right) {
+            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
+            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
+        } else if (left) {
+            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
+        } else if (right) {
+            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
+        }
+        stacked_id_embeds                         = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));  // undo the swap
+        class_tokens_mask                         = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
+        class_tokens_mask                         = ggml_repeat(ctx, class_tokens_mask, prompt_embeds);  // expand the mask to prompt_embeds' shape
+        prompt_embeds                             = ggml_mul(ctx, prompt_embeds, class_tokens_mask);     // PhotoMakerIDEncoder::build_graph fills the mask with 0 at class tokens and 1 elsewhere
+        struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx, prompt_embeds, stacked_id_embeds);
+        ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds");
+        return updated_prompt_embeds;
+    }
+};
+
+struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {  // CLIP vision model plus a second projection and a FuseModule
+    PhotoMakerIDEncoderBlock()
+        : CLIPVisionModelProjection(OPENAI_CLIP_VIT_L_14) {
+        blocks["visual_projection_2"] = std::shared_ptr<GGMLBlock>(new Linear(1024, 1280, false));  // extra 1024 -> 1280 projection without bias
+        blocks["fuse_module"]         = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx,  // encodes id_pixel_values and fuses the result into prompt_embeds
+                                struct ggml_tensor* id_pixel_values,
+                                struct ggml_tensor* prompt_embeds,
+                                struct ggml_tensor* class_tokens_mask,
+                                struct ggml_tensor* class_tokens_mask_pos,
+                                struct ggml_tensor* left,
+                                struct ggml_tensor* right) {
+        // "vision_model" and "visual_projection" are not registered in this struct; presumably the base class constructor adds them - confirm
+        auto vision_model        = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
+        auto visual_projection   = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
+        auto visual_projection_2 = std::dynamic_pointer_cast<Linear>(blocks["visual_projection_2"]);
+        auto fuse_module         = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);
+
+        struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values);          // [N, hidden_size]
+        struct ggml_tensor* id_embeds        = visual_projection->forward(ctx, shared_id_embeds);    // [N, proj_dim(768)]
+        struct ggml_tensor* id_embeds_2      = visual_projection_2->forward(ctx, shared_id_embeds);  // [N, 1280]
+
+        id_embeds   = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
+        id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
+
+        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2);  // join both projections (768 + 1280 = 2048, the FuseModule width); original author's open question: is dim 2 the right concat axis?
+        id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));  // permute back after the concat
+
+        struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
+                                                                         prompt_embeds,
+                                                                         id_embeds,
+                                                                         class_tokens_mask,
+                                                                         class_tokens_mask_pos,
+                                                                         left, right);
+        return updated_prompt_embeds;
+    }
+};
+
+// GGMLRunner that owns a PhotoMakerIDEncoderBlock and builds and runs the
+// graph which replaces the class-token rows of the prompt embeddings with
+// embeddings fused from the ID images.
+struct PhotoMakerIDEncoder : public GGMLRunner {
+public:
+    SDVersion version = VERSION_SDXL;
+    PhotoMakerIDEncoderBlock id_encoder;
+    float style_strength;
+
+    // Host-side staging buffers handed to set_backend_tensor_data in
+    // build_graph. They are members rather than locals; presumably the data
+    // pointers are read after build_graph returns - confirm against GGMLRunner.
+    std::vector<float> ctm;           // per-token scale mask: 0 at class tokens, 1 elsewhere
+    std::vector<ggml_fp16_t> ctmf16;  // the same mask in half precision
+    std::vector<int> ctmpos;          // indices of the class tokens
+
+    std::vector<ggml_fp16_t> zeros_left_16;
+    std::vector<float> zeros_left;
+    std::vector<ggml_fp16_t> zeros_right_16;
+    std::vector<float> zeros_right;
+
+public:
+    PhotoMakerIDEncoder(ggml_backend_t backend, ggml_type wtype, SDVersion version = VERSION_SDXL, float sty = 20.f)
+        : GGMLRunner(backend, wtype),
+          version(version),
+          style_strength(sty) {
+        id_encoder.init(params_ctx, wtype);
+    }
+
+    std::string get_desc() {
+        return "pmid";
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+        id_encoder.get_param_tensors(tensors, prefix);
+    }
+
+    // Builds the compute graph for one call.
+    //
+    // class_tokens_mask holds one flag per prompt token and must contain at
+    // least one true entry: the first and last class-token positions decide
+    // how much zero padding goes before and after the fused embeddings.
+    // An all-false mask trips the GGML_ASSERT below instead of indexing an
+    // empty vector.
+    struct ggml_cgraph* build_graph(  // struct ggml_allocr* allocr,
+        struct ggml_tensor* id_pixel_values,
+        struct ggml_tensor* prompt_embeds,
+        std::vector<bool>& class_tokens_mask) {
+        ctm.clear();
+        ctmf16.clear();
+        ctmpos.clear();
+        zeros_left.clear();
+        zeros_left_16.clear();
+        zeros_right.clear();
+        zeros_right_16.clear();
+
+        ggml_context* ctx0 = compute_ctx;
+
+        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+
+        int64_t hidden_size = prompt_embeds->ne[0];
+        int64_t seq_length  = prompt_embeds->ne[1];
+        // The helper tensors are always F32 at the moment, so the F16
+        // branches below are not taken; they are kept for a future type switch.
+        ggml_type type = GGML_TYPE_F32;
+
+        struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(ctx0, type, class_tokens_mask.size());
+
+        struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
+        struct ggml_tensor* prompt_embeds_d   = to_backend(prompt_embeds);
+
+        struct ggml_tensor* left  = nullptr;
+        struct ggml_tensor* right = nullptr;
+        for (size_t i = 0; i < class_tokens_mask.size(); i++) {
+            const bool is_class_token = class_tokens_mask[i];
+            // Scale mask: class tokens get 0 so their rows are zeroed before
+            // the fused embeddings are added; all other tokens get 1.
+            const float scale = is_class_token ? 0.f : 1.f;
+            ctm.push_back(scale);
+            ctmf16.push_back(ggml_fp32_to_fp16(scale));
+            if (is_class_token) {
+                ctmpos.push_back(static_cast<int>(i));
+            }
+        }
+
+        // front()/back() below are undefined on an empty vector.
+        GGML_ASSERT(!ctmpos.empty() && "class_tokens_mask must mark at least one class token");
+
+        if (ctmpos.front() > 0) {
+            left = ggml_new_tensor_3d(ctx0, type, hidden_size, 1, ctmpos.front());
+        }
+        if (ctmpos.back() < seq_length - 1) {
+            right = ggml_new_tensor_3d(ctx0, type,
+                                       hidden_size, 1, seq_length - ctmpos.back() - 1);
+        }
+        struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ctmpos.size());
+
+        {
+            if (type == GGML_TYPE_F16) {
+                set_backend_tensor_data(class_tokens_mask_d, ctmf16.data());
+            } else {
+                set_backend_tensor_data(class_tokens_mask_d, ctm.data());
+            }
+            set_backend_tensor_data(class_tokens_mask_pos, ctmpos.data());
+            if (left) {
+                if (type == GGML_TYPE_F16) {
+                    zeros_left_16.assign(ggml_nelements(left), ggml_fp32_to_fp16(0.f));
+                    set_backend_tensor_data(left, zeros_left_16.data());
+                } else {
+                    zeros_left.assign(ggml_nelements(left), 0.f);
+                    set_backend_tensor_data(left, zeros_left.data());
+                }
+            }
+            if (right) {
+                if (type == GGML_TYPE_F16) {
+                    zeros_right_16.assign(ggml_nelements(right), ggml_fp32_to_fp16(0.f));
+                    set_backend_tensor_data(right, zeros_right_16.data());
+                } else {
+                    zeros_right.assign(ggml_nelements(right), 0.f);
+                    set_backend_tensor_data(right, zeros_right.data());
+                }
+            }
+        }
+        struct ggml_tensor* updated_prompt_embeds = id_encoder.forward(ctx0,
+                                                                       id_pixel_values_d,
+                                                                       prompt_embeds_d,
+                                                                       class_tokens_mask_d,
+                                                                       class_tokens_mask_pos,
+                                                                       left, right);
+        ggml_build_forward_expand(gf, updated_prompt_embeds);
+
+        return gf;
+    }
+
+    // Runs the graph from build_graph on n_threads threads through
+    // GGMLRunner::compute, passing updated_prompt_embeds and output_ctx on to it.
+    void compute(const int n_threads,
+                 struct ggml_tensor* id_pixel_values,
+                 struct ggml_tensor* prompt_embeds,
+                 std::vector<bool>& class_tokens_mask,
+                 struct ggml_tensor** updated_prompt_embeds,
+                 ggml_context* output_ctx) {
+        auto get_graph = [&]() -> struct ggml_cgraph* {
+            return build_graph(id_pixel_values, prompt_embeds, class_tokens_mask);
+        };
+
+        GGMLRunner::compute(get_graph, n_threads, true, updated_prompt_embeds, output_ctx);
+    }
+};
+
+#endif  // __PMI_HPP__
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@ -12,6 +12,7 @@

 #include "model_adapter.h"

+#include "flux.hpp"
 #include "stable-diffusion.cpp"
 #include "util.cpp"
 #include "upscaler.cpp"
@ -47,6 +48,8 @@ const char* sample_method_str[] = {
    "dpm++2s_a",
    "dpm++2m",
    "dpm++2mv2",
+    "ipndm",
+    "ipndm_v",
    "lcm",
 };

@ -55,6 +58,9 @@ const char* schedule_str[] = {
    "default",
    "discrete",
    "karras",
+    "exponential",
+    "ays",
+    "gits",
 };

 const char* modes_str[] = {
@ -75,13 +81,18 @@ enum SDMode {
 struct SDParams {
    int n_threads = -1;
    SDMode mode   = TXT2IMG;
-
    std::string model_path;
+    std::string clip_l_path;
+    std::string clip_g_path;
+    std::string t5xxl_path;
+    std::string diffusion_model_path;
    std::string vae_path;
    std::string taesd_path;
    std::string esrgan_path;
    std::string controlnet_path;
    std::string embeddings_path;
+    std::string stacked_id_embeddings_path;
+    std::string input_id_images_path;
    sd_type_t wtype = SD_TYPE_COUNT;
    std::string lora_model_dir;
    std::string output_path = "output.png";
@ -90,12 +101,14 @@ struct SDParams {

    std::string prompt;
    std::string negative_prompt;
-    float min_cfg   = 1.0f;
-    float cfg_scale = 7.0f;
-    int clip_skip   = -1;  // <= 0 represents unspecified
-    int width       = 512;
-    int height      = 512;
-    int batch_count = 1;
+    float min_cfg     = 1.0f;
+    float cfg_scale   = 7.0f;
+    float guidance    = 3.5f;
+    float style_ratio = 20.f;
+    int clip_skip     = -1;  // <= 0 represents unspecified
+    int width         = 512;
+    int height        = 512;
+    int batch_count   = 1;

    int video_frames         = 6;
    int motion_bucket_id     = 127;
@ -112,7 +125,11 @@ struct SDParams {
    bool verbose                  = false;
    bool vae_tiling               = false;
    bool control_net_cpu          = false;
+    bool normalize_input          = false;
+    bool clip_on_cpu              = false;
+    bool vae_on_cpu               = false;
    bool canny_preprocess         = false;
+    bool color                    = false;
    int upscale_repeats           = 1;
 };

@ -229,11 +246,16 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
    }

    sd_ctx = new_sd_ctx(sd_params->model_path.c_str(),
+                        sd_params->clip_l_path.c_str(),
+                        sd_params->clip_g_path.c_str(),
+                        sd_params->t5xxl_path.c_str(),
+                        sd_params->diffusion_model_path.c_str(),
                        sd_params->vae_path.c_str(),
                        sd_params->taesd_path.c_str(),
                        sd_params->controlnet_path.c_str(),
                        sd_params->lora_model_dir.c_str(),
                        sd_params->embeddings_path.c_str(),
+                        sd_params->stacked_id_embeddings_path.c_str(),
                        vae_decode_only,
                        sd_params->vae_tiling,
                        free_param,
@ -241,7 +263,9 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
                        sd_params->wtype,
                        sd_params->rng_type,
                        sd_params->schedule,
-                        sd_params->control_net_cpu);
+                        sd_params->clip_on_cpu,
+                        sd_params->control_net_cpu,
+                        sd_params->vae_on_cpu);

    if (sd_ctx == NULL) {
        printf("\nError: KCPP SD Failed to create context!\n");
@ -378,6 +402,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
                          sd_params->negative_prompt.c_str(),
                          sd_params->clip_skip,
                          sd_params->cfg_scale,
+                          sd_params->guidance,
                          sd_params->width,
                          sd_params->height,
                          sd_params->sample_method,
@ -385,7 +410,10 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
                          sd_params->seed,
                          sd_params->batch_count,
                          control_image,
-                          sd_params->control_strength);
+                          sd_params->control_strength,
+                          sd_params->style_ratio,
+                          sd_params->normalize_input,
+                          sd_params->input_id_images_path.c_str());
    } else {

        if (sd_params->width <= 0 || sd_params->width % 64 != 0 || sd_params->height <= 0 || sd_params->height % 64 != 0) {
@ -455,13 +483,19 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
                            sd_params->negative_prompt.c_str(),
                            sd_params->clip_skip,
                            sd_params->cfg_scale,
+                            sd_params->guidance,
                            sd_params->width,
                            sd_params->height,
                            sd_params->sample_method,
                            sd_params->sample_steps,
                            sd_params->strength,
                            sd_params->seed,
-                            sd_params->batch_count);
+                            sd_params->batch_count,
+                            control_image,
+                            sd_params->control_strength,
+                            sd_params->style_ratio,
+                            sd_params->normalize_input,
+                            sd_params->input_id_images_path.c_str());
    }

    if (results == NULL) {
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@ -41,6 +41,8 @@ enum sample_method_t {
    DPMPP2S_A,
    DPMPP2M,
    DPMPP2Mv2,
+    IPNDM,
+    IPNDM_V,
    LCM,
    N_SAMPLE_METHODS
 };
@ -49,6 +51,9 @@ enum schedule_t {
    DEFAULT,
    DISCRETE,
    KARRAS,
+    EXPONENTIAL,
+    AYS,
+    GITS,
    N_SCHEDULES
 };

@ -59,26 +64,35 @@ enum sd_type_t {
    SD_TYPE_Q4_0 = 2,
    SD_TYPE_Q4_1 = 3,
    // SD_TYPE_Q4_2 = 4, support has been removed
-    // SD_TYPE_Q4_3 (5) support has been removed
-    SD_TYPE_Q5_0 = 6,
-    SD_TYPE_Q5_1 = 7,
-    SD_TYPE_Q8_0 = 8,
-    SD_TYPE_Q8_1 = 9,
-    // k-quantizations
-    SD_TYPE_Q2_K    = 10,
-    SD_TYPE_Q3_K    = 11,
-    SD_TYPE_Q4_K    = 12,
-    SD_TYPE_Q5_K    = 13,
-    SD_TYPE_Q6_K    = 14,
-    SD_TYPE_Q8_K    = 15,
-    SD_TYPE_IQ2_XXS = 16,
-    SD_TYPE_IQ2_XS  = 17,
-    SD_TYPE_IQ3_XXS = 18,
-    SD_TYPE_IQ1_S   = 19,
-    SD_TYPE_IQ4_NL  = 20,
-    SD_TYPE_I8,
-    SD_TYPE_I16,
-    SD_TYPE_I32,
+    // SD_TYPE_Q4_3 = 5, support has been removed
+    SD_TYPE_Q5_0     = 6,
+    SD_TYPE_Q5_1     = 7,
+    SD_TYPE_Q8_0     = 8,
+    SD_TYPE_Q8_1     = 9,
+    SD_TYPE_Q2_K     = 10,
+    SD_TYPE_Q3_K     = 11,
+    SD_TYPE_Q4_K     = 12,
+    SD_TYPE_Q5_K     = 13,
+    SD_TYPE_Q6_K     = 14,
+    SD_TYPE_Q8_K     = 15,
+    SD_TYPE_IQ2_XXS  = 16,
+    SD_TYPE_IQ2_XS   = 17,
+    SD_TYPE_IQ3_XXS  = 18,
+    SD_TYPE_IQ1_S    = 19,
+    SD_TYPE_IQ4_NL   = 20,
+    SD_TYPE_IQ3_S    = 21,
+    SD_TYPE_IQ2_S    = 22,
+    SD_TYPE_IQ4_XS   = 23,
+    SD_TYPE_I8       = 24,
+    SD_TYPE_I16      = 25,
+    SD_TYPE_I32      = 26,
+    SD_TYPE_I64      = 27,
+    SD_TYPE_F64      = 28,
+    SD_TYPE_IQ1_M    = 29,
+    SD_TYPE_BF16     = 30,
+    SD_TYPE_Q4_0_4_4 = 31,
+    SD_TYPE_Q4_0_4_8 = 32,
+    SD_TYPE_Q4_0_8_8 = 33,
    SD_TYPE_COUNT,
 };

@ -92,8 +106,10 @@ enum sd_log_level_t {
 };

 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
+typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);

 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
+SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
 SD_API int32_t sd_get_num_physical_cores();
 SD_API const char* sd_get_system_info();

@ -107,11 +123,16 @@ typedef struct {
 typedef struct sd_ctx_t sd_ctx_t;

 SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
+                            const char* clip_l_path,
+                            const char* clip_g_path,
+                            const char* t5xxl_path,
+                            const char* diffusion_model_path,
                            const char* vae_path,
                            const char* taesd_path,
                            const char* control_net_path_c_str,
                            const char* lora_model_dir,
                            const char* embed_dir_c_str,
+                            const char* stacked_id_embed_dir_c_str,
                            bool vae_decode_only,
                            bool vae_tiling,
                            bool free_params_immediately,
@ -119,7 +140,9 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
                            enum sd_type_t wtype,
                            enum rng_type_t rng_type,
                            enum schedule_t s,
-                            bool keep_control_net_cpu);
+                            bool keep_clip_on_cpu,
+                            bool keep_control_net_cpu,
+                            bool keep_vae_on_cpu);

 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);

@ -128,6 +151,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                           const char* negative_prompt,
                           int clip_skip,
                           float cfg_scale,
+                           float guidance,
                           int width,
                           int height,
                           enum sample_method_t sample_method,
@ -135,7 +159,10 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                           int64_t seed,
                           int batch_count,
                           const sd_image_t* control_cond,
-                           float control_strength);
+                           float control_strength,
+                           float style_strength,
+                           bool normalize_input,
+                           const char* input_id_images_path);

 SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                           sd_image_t init_image,
@ -143,13 +170,19 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                           const char* negative_prompt,
                           int clip_skip,
                           float cfg_scale,
+                           float guidance,
                           int width,
                           int height,
                           enum sample_method_t sample_method,
                           int sample_steps,
                           float strength,
                           int64_t seed,
-                           int batch_count);
+                           int batch_count,
+                           const sd_image_t* control_cond,
+                           float control_strength,
+                           float style_strength,
+                           bool normalize_input,
+                           const char* input_id_images_path);

 SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
                           sd_image_t init_image,
@ -175,7 +208,7 @@ SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);

 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);

-SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type);
+SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type);

 SD_API uint8_t* preprocess_canny(uint8_t* img,
                                 int width,
@ -190,4 +223,4 @@ SD_API uint8_t* preprocess_canny(uint8_t* img,
 }
 #endif

-#endif  // __STABLE_DIFFUSION_H__
+#endif  // __STABLE_DIFFUSION_H__
--- a/otherarch/sdcpp/t5.hpp
+++ b/otherarch/sdcpp/t5.hpp
@ -0,0 +1,981 @@
+#ifndef __T5_HPP__
+#define __T5_HPP__
+
+#include <float.h>
+#include <limits>
+#include <map>
+#include <memory>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+
+#include "darts.h"
+#include "ggml_extend.hpp"
+#include "json.hpp"
+#include "model.h"
+
+// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h
+// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.cc.
+// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE
+//
+// Since tokenization is not the bottleneck in SD, performance was not a major consideration
+// during the migration.
+class MetaspacePreTokenizer {
+private:
+    std::string replacement;
+    bool add_prefix_space;
+
+public:
+    MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true)
+        : replacement(replacement), add_prefix_space(add_prefix_space) {}
+
+    // Splits `input` on the ASCII space character and joins the pieces back
+    // together with `replacement` between them. When add_prefix_space is set,
+    // one extra `replacement` is emitted in front of the first piece.
+    std::string tokenize(const std::string& input) const {
+        std::string joined = add_prefix_space ? replacement : std::string();
+
+        std::stringstream pieces(input);
+        std::string piece;
+        for (bool is_first = true; std::getline(pieces, piece, ' '); is_first = false) {
+            if (!is_first) {
+                joined += replacement;
+            }
+            joined += piece;
+        }
+
+        return joined;
+    }
+};
+
+using EncodeResult = std::vector<std::pair<std::string, int>>;
+class T5UniGramTokenizer {
+public:
+    // Initialization state of the tokenizer, readable through status().
+    enum Status {
+        OK,                         // no failure recorded
+        NO_PIECES_LOADED,           // BuildTrie() was given an empty piece list
+        NO_ENTRY_FOUND,             // no piece produced a prefix match in the trie
+        BUILD_DOUBLE_ARRAY_FAILED,  // Darts::DoubleArray::build() returned non-zero
+        PIECE_ALREADY_DEFINED,      // NOTE(review): not set anywhere in the visible code
+        INVLIAD_JSON                // set by InitializePieces() on bad JSON (name kept as spelled)
+    };
+
+protected:
+    MetaspacePreTokenizer pre_tokenizer;
+
+    // all <piece, score> pairs
+    std::vector<std::pair<std::string, float>> piece_score_pairs;
+
+    float min_score_ = 0.0;
+    float max_score_ = 0.0;
+    std::unique_ptr<Darts::DoubleArray> trie_;
+
+    // Maximum size of the return value of Trie, which corresponds
+    // to the maximum size of shared common prefix in the sentence pieces.
+    int trie_results_size_;
+    // unknown id.
+    int unk_id_            = 2;
+    std::string eos_token_ = "</s>";
+    int eos_id_            = 1;
+    int pad_id_            = 0;
+    // status.
+    Status status_ = OK;
+
+    float kUnkPenalty = 10.0;
+
+    std::string replacement;
+    bool add_prefix_space = true;
+
+    // Parses a tokenizer JSON document and loads the vocabulary from it.
+    //
+    // Expected layout:
+    //   { "model":         { "vocab": [[piece, score], ...], "unk_id": <int, optional> },
+    //     "pre_tokenizer": { "replacement": <string>, "add_prefix_space": <bool> } }
+    //
+    // On success piece_score_pairs, unk_id_, replacement, add_prefix_space and
+    // pre_tokenizer are updated. Any syntax error, missing key or wrongly typed
+    // value sets status_ to INVLIAD_JSON and returns; no exception escapes.
+    void InitializePieces(const std::string& json_str) {
+        nlohmann::json data;
+
+        try {
+            data = nlohmann::json::parse(json_str);
+        } catch (const nlohmann::json::parse_error&) {
+            status_ = INVLIAD_JSON;
+            return;
+        }
+        // contains() is false for non-objects, so these checks also reject a
+        // top-level array or scalar.
+        if (!data.contains("model") || !data.contains("pre_tokenizer")) {
+            status_ = INVLIAD_JSON;
+            return;
+        }
+        // Bind by reference: copying "model" would duplicate the whole vocabulary.
+        const nlohmann::json& model = data["model"];
+        if (!model.contains("vocab") || !model["vocab"].is_array()) {
+            status_ = INVLIAD_JSON;
+            return;
+        }
+        if (model.contains("unk_id")) {
+            if (!model["unk_id"].is_number_integer()) {
+                status_ = INVLIAD_JSON;
+                return;
+            }
+            unk_id_ = model["unk_id"].get<int>();
+        }
+
+        // Validate before converting: a missing or mistyped value would otherwise
+        // raise nlohmann::json::type_error out of the constructor.
+        const nlohmann::json& pre = data["pre_tokenizer"];
+        if (!pre.contains("replacement") || !pre["replacement"].is_string() ||
+            !pre.contains("add_prefix_space") || !pre["add_prefix_space"].is_boolean()) {
+            status_ = INVLIAD_JSON;
+            return;
+        }
+        replacement      = pre["replacement"].get<std::string>();
+        add_prefix_space = pre["add_prefix_space"].get<bool>();
+
+        pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space);
+
+        for (const auto& item : model["vocab"]) {
+            // Each entry must be a [string, number] pair. Integer-typed scores
+            // (e.g. `0` instead of `0.0`) are accepted as well.
+            if (!item.is_array() || item.size() != 2 || !item[0].is_string() || !item[1].is_number()) {
+                status_ = INVLIAD_JSON;
+                return;
+            }
+            piece_score_pairs.emplace_back(item[0].get<std::string>(), item[1].get<float>());
+        }
+    }
+
+    // Builds the double-array trie over `pieces` (pairs of <piece, vocab id>).
+    // The vector is sorted in place. Does nothing when status_ already records
+    // a failure; otherwise status_ is set to NO_PIECES_LOADED,
+    // BUILD_DOUBLE_ARRAY_FAILED or NO_ENTRY_FOUND when the matching step fails.
+    void BuildTrie(std::vector<std::pair<std::string, int>>* pieces) {
+        if (status_ != OK) {
+            return;
+        }
+        if (pieces->empty()) {
+            status_ = NO_PIECES_LOADED;
+            return;
+        }
+
+        std::vector<std::pair<std::string, int>>& entries = *pieces;
+
+        // DoubleArray::build() only accepts keys in sorted order.
+        std::sort(entries.begin(), entries.end());
+
+        // Parallel key/value arrays in the layout DoubleArray::build() expects.
+        std::vector<const char*> keys;
+        std::vector<int> ids;
+        keys.reserve(entries.size());
+        ids.reserve(entries.size());
+        for (const auto& entry : entries) {
+            keys.push_back(entry.first.data());  // sorted piece.
+            ids.push_back(entry.second);         // vocab_id
+        }
+
+        trie_.reset(new Darts::DoubleArray());
+        const int build_rc = trie_->build(keys.size(), const_cast<char**>(keys.data()), nullptr, ids.data());
+        if (build_rc != 0) {
+            status_ = BUILD_DOUBLE_ARRAY_FAILED;
+            return;
+        }
+
+        // Largest number of vocabulary pieces that are a prefix of any one piece.
+        const int kMaxTrieResultsSize = 1024;
+        std::vector<Darts::DoubleArray::result_pair_type> matches(kMaxTrieResultsSize);
+        trie_results_size_ = 0;
+        for (const auto& entry : entries) {
+            const int hit_count = trie_->commonPrefixSearch(
+                entry.first.data(), matches.data(), matches.size(), entry.first.size());
+            if (hit_count > trie_results_size_) {
+                trie_results_size_ = hit_count;
+            }
+        }
+
+        if (trie_results_size_ == 0) {
+            status_ = NO_ENTRY_FOUND;
+        }
+    }
+
+    // Non-virtual (inlined) implementation for faster execution.
+    // Returns the score stored for vocabulary id `id`. The index is not
+    // bounds-checked.
+    inline float GetScoreInlined(int id) const {
+        return piece_score_pairs[id].second;
+    }
+
+    // Placeholder: always returns false, so no piece is treated as unused.
+    inline bool IsUnusedInlined(int id) const {
+        return false;  // TODO
+    }
+
+    // Placeholder: always returns false, so no piece is treated as user defined.
+    inline bool IsUserDefinedInlined(int id) const {
+        return false;  // TODO
+    }
+
+    // Returns the byte length of the UTF-8 sequence introduced by `*src`,
+    // looked up from the upper four bits of that byte.
+    inline size_t OneCharLen(const char* src) const {
+        static const unsigned char kLengthByHighNibble[16] = {1, 1, 1, 1, 1, 1, 1, 1,
+                                                              1, 1, 1, 1, 2, 2, 3, 4};
+        const unsigned int lead_byte = static_cast<unsigned char>(*src);
+        return kLengthByHighNibble[lead_byte >> 4];
+    }
+
+    // The optimized Viterbi encode.
+    // Main differences from the original function:
+    // 1. Memorizes the best path at each position so far,
+    // 2. No need to store the Lattice nodes,
+    // 3. Works in utf-8 directly,
+    // 4. Defines a new struct with fewer fields than Lattice,
+    // 5. Does not depend on `class Lattice` nor call `SetSentence()`,
+    // `PopulateNodes()`, or `Viterbi()`. It does everything in one function.
+    // For detailed explanations please see the comments inside the function body.
+    //
+    // Returns the <piece text, vocab id> pairs of the best segmentation of
+    // `normalized`, in input order. Returns an empty result when status() is not
+    // OK or when `normalized` is empty.
+    EncodeResult EncodeOptimized(const std::string& normalized) const {
+        // An optimized Viterbi algorithm for unigram language models. Benchmarking
+        // results show that it generates almost identical outputs and achieves 2.1x
+        // speedup on average for 102 languages compared to the original
+        // implementation. It's based on the following three ideas:
+        //
+        // 1. Because it uses the *unigram* model:
+        //     best_score(x1, x2, …, xt) = best_score(x1, x2, …, x{t-1}) + score(xt)
+        // Deciding the best path (and score) can be decoupled into two isolated
+        // terms: (a) the best path ended before the last token `best_score(x1, x2, …,
+        // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are
+        // not related to each other at all.
+        //
+        // Therefore, we can compute once and store the *best_path ending at
+        // each character position*. In this way, when we know best_path_ends_at[M],
+        // we can reuse it to compute all the best_path_ends_at_[...] where the last
+        // token starts at the same character position M.
+        //
+        // This improves the time complexity from O(n*k*k) to O(n*k) because it
+        // eliminates the extra loop of recomputing the best path ending at the same
+        // position, where n is the input length and k is the maximum number of tokens
+        // that can be recognized starting at each position.
+        //
+        // 2. Again, because it uses the *unigram* model, we don’t need to actually
+        // store the lattice nodes. We still recognize all the tokens and lattice
+        // nodes from the input, but along identifying them, we use and discard them
+        // on the fly. There is no need to actually store them for best path Viterbi
+        // decoding. The only thing we need to store is the best_path ending at
+        // each character position.
+        //
+        // This improvement reduces the things needed to store in memory from O(n*k)
+        // to O(n), where n is the input length and k is the maximum number of tokens
+        // that can be recognized starting at each position.
+        //
+        // It also avoids the need of dynamic-size lattice node pool, because the
+        // number of things to store is fixed as n.
+        //
+        // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding
+        // inputs. In the original implementation, the lattice positions are based on
+        // unicode positions. A mapping from unicode position to the utf-8 position is
+        // maintained to recover the utf-8 string piece.
+        //
+        // We found that it is sufficient and beneficial to directly work with utf-8
+        // positions:
+        //
+        // Firstly, it saves the conversion and mapping between unicode positions and
+        // utf-8 positions.
+        //
+        // Secondly, it reduces the number of fields we need to maintain in the
+        // node/path structure. Specifically, there are 8 fields defined in
+        // `Lattice::Node` used by the original encoder, but here in the optimized
+        // encoder we only need to define 3 fields in `BestPathNode`.
+
+        if (status() != OK || normalized.empty()) {
+            return {};
+        }
+        // Represents the last node of the best path.
+        struct BestPathNode {
+            int id = -1;  // The vocab id. (maybe -1 for UNK)
+            float best_path_score =
+                0;  // The total score of the best path ending at this node.
+            int starts_at =
+                -1;  // The starting position (in utf-8) of this node. The entire best
+                     // path can be constructed by backtracking along this link.
+        };
+        const int size        = normalized.size();
+        const float unk_score = min_score() - kUnkPenalty;
+        // The ends are exclusive.
+        std::vector<BestPathNode> best_path_ends_at(size + 1);
+        // Generate lattice on-the-fly (not stored) and update best_path_ends_at.
+        int starts_at = 0;
+        while (starts_at < size) {
+            std::size_t node_pos = 0;
+            std::size_t key_pos  = starts_at;
+            const auto best_path_score_till_here =
+                best_path_ends_at[starts_at].best_path_score;
+            bool has_single_node = false;
+            // Byte length of the character at starts_at, clamped to the bytes left.
+            const int mblen =
+                std::min<int>(OneCharLen(normalized.data() + starts_at),
+                              size - starts_at);
+            while (key_pos < size) {
+                const int ret =
+                    trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1);
+                // NOTE(review): -2 presumably means the trie has no node for this
+                // prefix, so no longer piece can match - confirm against darts.h.
+                if (ret == -2)
+                    break;
+                if (ret >= 0) {
+                    if (IsUnusedInlined(ret))
+                        continue;
+                    // Update the best path node.
+                    auto& target_node = best_path_ends_at[key_pos];
+                    const auto length = (key_pos - starts_at);
+                    // User defined symbol receives extra bonus to always be selected.
+                    const auto score = IsUserDefinedInlined(ret)
+                                           ? (length * max_score_ - 0.1)
+                                           : GetScoreInlined(ret);
+                    const auto candidate_best_path_score =
+                        score + best_path_score_till_here;
+                    if (target_node.starts_at == -1 ||
+                        candidate_best_path_score > target_node.best_path_score) {
+                        target_node.best_path_score = candidate_best_path_score;
+                        target_node.starts_at       = starts_at;
+                        target_node.id              = ret;
+                    }
+                    if (!has_single_node && length == mblen) {
+                        has_single_node = true;
+                    }
+                }
+            }
+            // No piece covers exactly the current character: record an unknown-id
+            // node for that one character, scored with unk_score.
+            if (!has_single_node) {
+                auto& target_node = best_path_ends_at[starts_at + mblen];
+                const auto candidate_best_path_score =
+                    unk_score + best_path_score_till_here;
+                if (target_node.starts_at == -1 ||
+                    candidate_best_path_score > target_node.best_path_score) {
+                    target_node.best_path_score = candidate_best_path_score;
+                    target_node.starts_at       = starts_at;
+                    target_node.id              = unk_id_;
+                }
+            }
+            // Move by one unicode character.
+            starts_at += mblen;
+        }
+        // Backtrack to identify the best path.
+        EncodeResult results;
+        int ends_at = size;
+        while (ends_at > 0) {
+            const auto& node = best_path_ends_at[ends_at];
+            results.emplace_back(
+                normalized.substr(node.starts_at, ends_at - node.starts_at), node.id);
+            ends_at = node.starts_at;
+        }
+        // Pieces were collected from the end of the input backwards.
+        std::reverse(results.begin(), results.end());
+        return results;
+    }
+
+public:
+    // Builds a tokenizer from a tokenizer JSON string. When `json_str` is empty
+    // the JSON returned by ModelLoader::load_t5_tokenizer_json() is used instead.
+    //
+    // After the vocabulary is loaded, min_score_ and max_score_ are set to the
+    // extreme piece scores and the trie is built. Failures are reported through
+    // status() rather than by throwing.
+    explicit T5UniGramTokenizer(const std::string& json_str = "") {
+        if (json_str.size() != 0) {
+            InitializePieces(json_str);
+        } else {
+            InitializePieces(ModelLoader::load_t5_tokenizer_json());
+        }
+
+        // Seed the running extremes. FLT_MIN is the smallest *positive* float, so
+        // it must not be used as the "lowest" seed: piece scores are typically
+        // negative and the real maximum would never replace it.
+        min_score_ = FLT_MAX;
+        max_score_ = -FLT_MAX;
+
+        std::vector<std::pair<std::string, int>> pieces;
+        pieces.reserve(piece_score_pairs.size());
+        for (size_t i = 0; i < piece_score_pairs.size(); i++) {
+            const auto& sp = piece_score_pairs[i];
+
+            min_score_ = std::min(min_score_, sp.second);
+            max_score_ = std::max(max_score_, sp.second);
+
+            // The position in piece_score_pairs is the vocabulary id.
+            pieces.emplace_back(sp.first, static_cast<int>(i));
+        }
+
+        BuildTrie(&pieces);
+    }
+    ~T5UniGramTokenizer(){};
+
+    // Collapses every run of two or more spaces in `input` into a single space.
+    std::string Normalize(const std::string& input) const {
+        // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29
+        // TODO: nmt-nfkc
+        const std::regex repeated_spaces(" {2,}");
+        return std::regex_replace(input, repeated_spaces, " ");
+    }
+
+    // Converts `input` to vocabulary ids: Normalize(), then the pre-tokenizer,
+    // then EncodeOptimized(). When `append_eos_if_not_present` is set and the
+    // result is non-empty, the EOS id is appended unless the last piece text
+    // already equals the EOS token.
+    std::vector<int> Encode(const std::string& input, bool append_eos_if_not_present = true) const {
+        const std::string prepared = pre_tokenizer.tokenize(Normalize(input));
+        EncodeResult pieces        = EncodeOptimized(prepared);
+
+        const bool missing_eos = !pieces.empty() && append_eos_if_not_present &&
+                                 pieces.back().first != eos_token_;
+        if (missing_eos) {
+            pieces.emplace_back(eos_token_, eos_id_);
+        }
+
+        std::vector<int> ids;
+        for (const auto& piece : pieces) {
+            ids.push_back(piece.second);
+        }
+        return ids;
+    }
+
+    // Re-chunks `tokens` / `weights` in place into blocks of `max_length`
+    // entries and pads the result to a multiple of `max_length`.
+    //
+    // The last element of `tokens` is treated as a terminator and dropped; the
+    // remaining tokens are laid out so that the last slot of every full block
+    // holds the EOS id, one more EOS follows the final token, and the rest is
+    // filled with the pad id. Inserted EOS and pad entries get weight 1.0.
+    //
+    // Nothing is changed when `padding` is false or `max_length` is 0. A
+    // `max_length` of 1 leaves no room for a token next to EOS, so it is
+    // treated as a no-op as well (it previously divided by zero).
+    void pad_tokens(std::vector<int>& tokens,
+                    std::vector<float>& weights,
+                    size_t max_length = 0,
+                    bool padding      = false) {
+        if (!padding || max_length < 2) {
+            return;
+        }
+
+        // Guard the empty case: size() - 1 would wrap around for an empty vector.
+        const size_t orig_token_num = tokens.empty() ? 0 : tokens.size() - 1;
+        const size_t slots_per_block = max_length - 1;  // one slot per block is EOS
+
+        // Integer ceil(orig_token_num / slots_per_block), at least one block.
+        size_t n = (orig_token_num + slots_per_block - 1) / slots_per_block;
+        if (n == 0) {
+            n = 1;
+        }
+        const size_t length = max_length * n;
+        // Cast so the argument matches the %llu conversion on every platform.
+        LOG_DEBUG("token length: %llu", static_cast<unsigned long long>(length));
+
+        std::vector<int> new_tokens;
+        std::vector<float> new_weights;
+        new_tokens.reserve(length);
+        new_weights.reserve(length);
+
+        size_t token_idx = 0;
+        for (size_t i = 0; i < length && token_idx < orig_token_num; i++) {
+            if (i % max_length == max_length - 1) {
+                new_tokens.push_back(eos_id_);
+                new_weights.push_back(1.0f);
+            } else {
+                new_tokens.push_back(tokens[token_idx]);
+                // Fall back to a neutral weight if `weights` is shorter than `tokens`.
+                new_weights.push_back(token_idx < weights.size() ? weights[token_idx] : 1.0f);
+                token_idx++;
+            }
+        }
+
+        new_tokens.push_back(eos_id_);
+        new_weights.push_back(1.0f);
+
+        // new_tokens.size() never exceeds `length` here, so the fill counts below
+        // cannot wrap around.
+        new_tokens.insert(new_tokens.end(), length - new_tokens.size(), pad_id_);
+        new_weights.insert(new_weights.end(), length - new_weights.size(), 1.0f);
+
+        tokens.swap(new_tokens);
+        weights.swap(new_weights);
+    }
+
+    // Returns the minimum score in sentence pieces.
+    // min_score() - 10 is used for the cost of unknown sentence.
+    float min_score() const { return min_score_; }
+
+    // Returns the maximum score in sentence pieces.
+    // max_score() is used for the cost of user defined symbols.
+    float max_score() const { return max_score_; }
+
+    // Returns the initialization state; anything other than OK means loading
+    // the vocabulary or building the trie recorded a failure.
+    Status status() const { return status_; }
+};
+
+// Scale-only normalization block: ggml_rms_norm() followed by an element-wise
+// multiplication with the "weight" parameter.
+class T5LayerNorm : public UnaryBlock {
+protected:
+    int64_t hidden_size;
+    float eps;
+
+    // Registers the 1-D "weight" parameter with hidden_size elements. It is
+    // always created as F32; `wtype` is not used.
+    void init_params(struct ggml_context* ctx, ggml_type wtype) {
+        struct ggml_tensor* scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
+        params["weight"]          = scale;
+    }
+
+public:
+    T5LayerNorm(int64_t hidden_size,
+                float eps = 1e-06f)
+        : hidden_size(hidden_size),
+          eps(eps) {}
+
+    // Returns rms_norm(x, eps) * weight.
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* scale  = params["weight"];
+        struct ggml_tensor* normed = ggml_rms_norm(ctx, x, eps);
+        return ggml_mul(ctx, normed, scale);
+    }
+};
+
+// Feed-forward block: Linear "wi" -> ReLU -> Linear "wo", both without bias.
+struct T5DenseActDense : public UnaryBlock {
+public:
+    T5DenseActDense(int64_t model_dim, int64_t ff_dim) {
+        blocks["wi"] = std::make_shared<Linear>(model_dim, ff_dim, false);
+        blocks["wo"] = std::make_shared<Linear>(ff_dim, model_dim, false);
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [N, n_token, model_dim]
+        auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
+        auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
+
+        struct ggml_tensor* hidden = up_proj->forward(ctx, x);
+        hidden                     = ggml_relu_inplace(ctx, hidden);
+        return down_proj->forward(ctx, hidden);
+    }
+};
+
+// Gated feed-forward block: gelu(wi_0(x)) * wi_1(x), projected back by "wo".
+// All three Linear layers are created without bias.
+struct T5DenseGatedActDense : public UnaryBlock {
+public:
+    T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
+        blocks["wi_0"] = std::make_shared<Linear>(model_dim, ff_dim, false);
+        blocks["wi_1"] = std::make_shared<Linear>(model_dim, ff_dim, false);
+        blocks["wo"]   = std::make_shared<Linear>(ff_dim, model_dim, false);
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [N, n_token, model_dim]
+        auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
+        auto lin_proj  = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
+        auto out_proj  = std::dynamic_pointer_cast<Linear>(blocks["wo"]);
+
+        // Same op order as before: gate projection, GELU, then the linear branch.
+        struct ggml_tensor* gate   = gate_proj->forward(ctx, x);
+        gate                       = ggml_gelu_inplace(ctx, gate);
+        struct ggml_tensor* linear = lin_proj->forward(ctx, x);
+        struct ggml_tensor* gated  = ggml_mul_inplace(ctx, gate, linear);
+        return out_proj->forward(ctx, gated);
+    }
+};
+
+// Feed-forward sub-layer: x + DenseReluDense(layer_norm(x)).
+struct T5LayerFF : public UnaryBlock {
+public:
+    T5LayerFF(int64_t model_dim, int64_t ff_dim) {
+        blocks["DenseReluDense"] = std::make_shared<T5DenseGatedActDense>(model_dim, ff_dim);
+        blocks["layer_norm"]     = std::make_shared<T5LayerNorm>(model_dim);
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        // x: [N, n_token, model_dim]
+        auto ff_block = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
+        auto norm     = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
+
+        struct ggml_tensor* normed    = norm->forward(ctx, x);
+        struct ggml_tensor* projected = ff_block->forward(ctx, normed);
+        // Residual connection; the sum is written into `projected`.
+        return ggml_add_inplace(ctx, projected, x);
+    }
+};
+
+// Self-attention block used by the T5 encoder.
+// Holds bias-free q/k/v/o projections and, when requested at construction,
+// an embedding table that maps relative-position buckets to a per-head bias.
+class T5Attention : public GGMLBlock {
+protected:
+    int64_t model_dim;
+    int64_t inner_dim;
+    int64_t num_heads;
+    bool using_relative_attention_bias;
+    int64_t relative_attention_num_buckets  = 32;
+    int64_t relative_attention_max_distance = 128;
+
+public:
+    T5Attention(int64_t model_dim,
+                int64_t inner_dim,
+                int64_t num_heads,
+                bool using_relative_attention_bias = false)
+        : model_dim(model_dim),
+          inner_dim(inner_dim),
+          num_heads(num_heads),
+          using_relative_attention_bias(using_relative_attention_bias) {
+        blocks["q"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
+        blocks["k"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
+        blocks["v"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, inner_dim, false));
+        blocks["o"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, model_dim, false));
+        // The bias table only exists on layers constructed with the flag set.
+        if (using_relative_attention_bias) {
+            blocks["relative_attention_bias"] = std::shared_ptr<GGMLBlock>(new Embedding(relative_attention_num_buckets, num_heads));
+        }
+    }
+
+    // Looks up the bias for every (query, key) bucket index and rearranges it
+    // so that the head dimension comes before the two position dimensions.
+    // Only valid on an instance that was built with the bias table.
+    struct ggml_tensor* compute_bias(struct ggml_context* ctx,
+                                     struct ggml_tensor* relative_position_bucket) {
+        auto bias_table = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);
+
+        // shape (query_length, key_length, num_heads)
+        struct ggml_tensor* bias = bias_table->forward(ctx, relative_position_bucket);
+        // shape (1, num_heads, query_length, key_length)
+        return ggml_cont(ctx, ggml_permute(ctx, bias, 2, 0, 1, 3));
+    }
+
+    // x: [N, n_token, model_dim]
+    // Returns {attention output [N, n_token, model_dim], position bias}. The
+    // bias is either the one computed here or the `past_bias` passed in, so a
+    // caller can forward it to a following layer.
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+                                                                struct ggml_tensor* x,
+                                                                struct ggml_tensor* past_bias                = NULL,
+                                                                struct ggml_tensor* mask                     = NULL,
+                                                                struct ggml_tensor* relative_position_bucket = NULL) {
+        auto to_q   = std::dynamic_pointer_cast<Linear>(blocks["q"]);
+        auto to_k   = std::dynamic_pointer_cast<Linear>(blocks["k"]);
+        auto to_v   = std::dynamic_pointer_cast<Linear>(blocks["v"]);
+        auto to_out = std::dynamic_pointer_cast<Linear>(blocks["o"]);
+
+        int64_t d_head = inner_dim / num_heads;
+
+        struct ggml_tensor* q = to_q->forward(ctx, x);
+        struct ggml_tensor* k = to_k->forward(ctx, x);
+        struct ggml_tensor* v = to_v->forward(ctx, x);
+
+        // A layer that owns the table recomputes the bias from the bucket indices.
+        if (using_relative_attention_bias && relative_position_bucket != NULL) {
+            past_bias = compute_bias(ctx, relative_position_bucket);
+        }
+        // Fold the bias into the attention mask (or use it as the mask outright).
+        if (past_bias != NULL) {
+            mask = (mask != NULL) ? ggml_add(ctx, mask, past_bias) : past_bias;
+        }
+
+        // NOTE(review): presumably cancels a 1/sqrt(d_head) factor applied inside
+        // ggml_nn_attention_ext -- confirm against that helper.
+        k = ggml_scale_inplace(ctx, k, sqrt(d_head));
+
+        struct ggml_tensor* attended = ggml_nn_attention_ext(ctx, q, k, v, num_heads, mask);  // [N, n_token, d_head * n_head]
+
+        struct ggml_tensor* projected = to_out->forward(ctx, attended);  // [N, n_token, model_dim]
+        return {projected, past_bias};
+    }
+};
+
+// Self-attention sub-layer of a T5 block: layer norm, attention, residual add.
+// `ff_dim` is accepted by the constructor but not used by this sub-layer.
+struct T5LayerSelfAttention : public GGMLBlock {
+public:
+    T5LayerSelfAttention(int64_t model_dim,
+                         int64_t inner_dim,
+                         int64_t ff_dim,
+                         int64_t num_heads,
+                         bool using_relative_attention_bias) {
+        blocks["SelfAttention"] = std::shared_ptr<GGMLBlock>(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias));
+        blocks["layer_norm"]    = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
+    }
+
+    // x: [N, n_token, model_dim]
+    // Returns {x plus attention output, position bias reported by the attention block}.
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+                                                                struct ggml_tensor* x,
+                                                                struct ggml_tensor* past_bias                = NULL,
+                                                                struct ggml_tensor* mask                     = NULL,
+                                                                struct ggml_tensor* relative_position_bucket = NULL) {
+        auto attention = std::dynamic_pointer_cast<T5Attention>(blocks["SelfAttention"]);
+        auto norm      = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);
+
+        struct ggml_tensor* normed = norm->forward(ctx, x);
+        auto attn_result           = attention->forward(ctx, normed, past_bias, mask, relative_position_bucket);
+
+        // Residual connection; the attention output is the destination of the in-place add.
+        struct ggml_tensor* summed = ggml_add_inplace(ctx, attn_result.first, x);
+        return {summed, attn_result.second};
+    }
+};
+
+// One T5 encoder block: a self-attention sub-layer ("layer.0") followed by a
+// feed-forward sub-layer ("layer.1").
+struct T5Block : public GGMLBlock {
+public:
+    T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) {
+        blocks["layer.0"] = std::shared_ptr<GGMLBlock>(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias));
+        blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
+    }
+
+    // x: [N, n_token, model_dim]
+    // Returns {block output, position bias handed back by the attention sub-layer}.
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
+                                                                struct ggml_tensor* x,
+                                                                struct ggml_tensor* past_bias                = NULL,
+                                                                struct ggml_tensor* mask                     = NULL,
+                                                                struct ggml_tensor* relative_position_bucket = NULL) {
+        auto attention_layer    = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
+        auto feed_forward_layer = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);
+
+        auto attn_result = attention_layer->forward(ctx, x, past_bias, mask, relative_position_bucket);
+
+        struct ggml_tensor* hidden = feed_forward_layer->forward(ctx, attn_result.first);
+        return {hidden, attn_result.second};
+    }
+};
+
+// Stack of `num_layers` T5 blocks followed by a final layer norm.
+struct T5Stack : public GGMLBlock {
+    int64_t num_layers;
+
+public:
+    T5Stack(int64_t num_layers,
+            int64_t model_dim,
+            int64_t inner_dim,
+            int64_t ff_dim,
+            int64_t num_heads)
+        : num_layers(num_layers) {
+        for (int layer = 0; layer < num_layers; layer++) {
+            // Only the first block is created with a relative-attention-bias table.
+            const bool owns_bias_table = (layer == 0);
+            const std::string key      = "block." + std::to_string(layer);
+            blocks[key]                = std::shared_ptr<GGMLBlock>(new T5Block(model_dim, inner_dim, ff_dim, num_heads, owns_bias_table));
+        }
+
+        blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
+    }
+
+    // x: [N, n_token, model_dim]
+    // Runs every block in order, passing the position bias returned by one
+    // block into the next, then applies the final layer norm.
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* past_bias                = NULL,
+                                struct ggml_tensor* attention_mask           = NULL,
+                                struct ggml_tensor* relative_position_bucket = NULL) {
+        struct ggml_tensor* hidden = x;
+        for (int layer = 0; layer < num_layers; layer++) {
+            auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(layer)]);
+
+            auto step = block->forward(ctx, hidden, past_bias, attention_mask, relative_position_bucket);
+            hidden    = step.first;
+            past_bias = step.second;
+        }
+
+        auto final_norm = std::dynamic_pointer_cast<T5LayerNorm>(blocks["final_layer_norm"]);
+        return final_norm->forward(ctx, hidden);
+    }
+};
+
+// T5 encoder model: a shared token embedding feeding a T5Stack.
+// The stack is built with inner_dim equal to model_dim.
+struct T5 : public GGMLBlock {
+public:
+    T5(int64_t num_layers,
+       int64_t model_dim,
+       int64_t ff_dim,
+       int64_t num_heads,
+       int64_t vocab_size) {
+        blocks["encoder"] = std::shared_ptr<GGMLBlock>(new T5Stack(num_layers, model_dim, model_dim, ff_dim, num_heads));
+        blocks["shared"]  = std::shared_ptr<GGMLBlock>(new Embedding(vocab_size, model_dim));
+    }
+
+    // input_ids: [N, n_token]
+    // Embeds the token ids and runs the result through the encoder stack; the
+    // remaining arguments are forwarded to the stack unchanged.
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* input_ids,
+                                struct ggml_tensor* past_bias                = NULL,
+                                struct ggml_tensor* attention_mask           = NULL,
+                                struct ggml_tensor* relative_position_bucket = NULL) {
+        auto token_embedding = std::dynamic_pointer_cast<Embedding>(blocks["shared"]);
+        auto stack           = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);
+
+        struct ggml_tensor* embedded = token_embedding->forward(ctx, input_ids);
+        return stack->forward(ctx, embedded, past_bias, attention_mask, relative_position_bucket);
+    }
+};
+
+// Owns a T5 encoder model and builds / executes its ggml compute graph.
+struct T5Runner : public GGMLRunner {
+    T5 model;
+    // Host-side bucket indices. Stored as a member because build_graph() passes
+    // its data pointer to set_backend_tensor_data(); presumably the buffer has
+    // to stay alive until the graph has been computed -- confirm in GGMLRunner.
+    std::vector<int> relative_position_bucket_vec;
+
+    T5Runner(ggml_backend_t backend,
+             ggml_type wtype,
+             int64_t num_layers = 24,
+             int64_t model_dim  = 4096,
+             int64_t ff_dim     = 10240,
+             int64_t num_heads  = 64,
+             int64_t vocab_size = 32128)
+        : GGMLRunner(backend, wtype), model(num_layers, model_dim, ff_dim, num_heads, vocab_size) {
+        model.init(params_ctx, wtype);
+    }
+
+    std::string get_desc() {
+        return "t5";
+    }
+
+    // Collects the model's parameter tensors into `tensors`, with names prefixed by `prefix`.
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+        model.get_param_tensors(tensors, prefix);
+    }
+
+    // input_ids: [N, n_token]
+    // Runs the encoder with no incoming position bias and no attention mask.
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* input_ids,
+                                struct ggml_tensor* relative_position_bucket) {
+        auto hidden_states = model.forward(ctx, input_ids, NULL, NULL, relative_position_bucket);  // [N, n_token, model_dim]
+        return hidden_states;
+    }
+
+    // Builds the graph for one encoder pass over `input_ids`. The relative
+    // position buckets are recomputed on every call from the token count
+    // (input_ids->ne[0]) and uploaded as an I32 tensor of that size squared.
+    struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids) {
+        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+
+        input_ids = to_backend(input_ids);
+
+        relative_position_bucket_vec = compute_relative_position_bucket(input_ids->ne[0], input_ids->ne[0]);
+
+        // for (int i = 0; i < relative_position_bucket_vec.size(); i++) {
+        //     if (i % 77 == 0) {
+        //         printf("\n");
+        //     }
+        //     printf("%d ", relative_position_bucket_vec[i]);
+        // }
+
+        auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx,
+                                                           GGML_TYPE_I32,
+                                                           input_ids->ne[0],
+                                                           input_ids->ne[0]);
+        set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());
+
+        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, relative_position_bucket);
+
+        ggml_build_forward_expand(gf, hidden_states);
+
+        return gf;
+    }
+
+    // Computes the encoder output for `input_ids`. The result is handed back
+    // through `output` (and `output_ctx`) by GGMLRunner::compute.
+    void compute(const int n_threads,
+                 struct ggml_tensor* input_ids,
+                 ggml_tensor** output,
+                 ggml_context* output_ctx = NULL) {
+        auto get_graph = [&]() -> struct ggml_cgraph* {
+            return build_graph(input_ids);
+        };
+        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+    }
+
+    // Maps each relative position (key index minus query index) to a bucket id.
+    //
+    // bidirectional: when true, half of the buckets are reserved for positive
+    //                offsets and the absolute offset is bucketed; when false,
+    //                positive offsets are clamped to distance 0.
+    // num_buckets:   total number of buckets.
+    // max_distance:  offsets at or beyond this distance share the last bucket.
+    //
+    // Distances below max_exact (half of the per-direction bucket count) get
+    // one bucket each; larger distances are spread logarithmically over the
+    // remaining buckets and capped at the last one.
+    static std::vector<int> _relative_position_bucket(const std::vector<int>& relative_position,
+                                                      bool bidirectional = true,
+                                                      int num_buckets    = 32,
+                                                      int max_distance   = 128) {
+        std::vector<int> relative_buckets(relative_position.size(), 0);
+        std::vector<int> abs_relative_position = relative_position;
+
+        if (bidirectional) {
+            num_buckets = num_buckets / 2;
+            for (size_t i = 0; i < relative_position.size(); ++i) {
+                if (relative_position[i] > 0) {
+                    relative_buckets[i] += num_buckets;
+                }
+                abs_relative_position[i] = std::abs(relative_position[i]);
+            }
+        } else {
+            for (size_t i = 0; i < relative_position.size(); ++i) {
+                abs_relative_position[i] = std::max(-relative_position[i], 0);
+            }
+        }
+
+        int max_exact = num_buckets / 2;
+        // Loop-invariant denominator of the logarithmic mapping, computed once.
+        const float log_base = std::log(static_cast<float>(max_distance) / max_exact);
+
+        for (size_t i = 0; i < relative_position.size(); ++i) {
+            if (abs_relative_position[i] < max_exact) {
+                relative_buckets[i] += abs_relative_position[i];
+            } else {
+                float log_pos    = std::log(static_cast<float>(abs_relative_position[i]) / max_exact);
+                int large_bucket = max_exact + static_cast<int>((log_pos / log_base) * (num_buckets - max_exact));
+                relative_buckets[i] += std::min(large_bucket, num_buckets - 1);
+            }
+        }
+
+        return relative_buckets;
+    }
+
+    // Returns the bucket ids for every (query, key) pair in row-major order:
+    // element [i * key_length + j] is the bucket of offset (j - i).
+    // Returns an empty vector when either length is not positive.
+    std::vector<int> compute_relative_position_bucket(int query_length,
+                                                      int key_length) {
+        if (query_length <= 0 || key_length <= 0) {
+            return {};
+        }
+
+        std::vector<int> relative_position;
+        relative_position.reserve(static_cast<size_t>(query_length) * static_cast<size_t>(key_length));
+        for (int i = 0; i < query_length; ++i) {
+            for (int j = 0; j < key_length; ++j) {
+                relative_position.push_back(j - i);
+            }
+        }
+
+        // Bucketing is element-wise, so the flattened matrix can be mapped in one call.
+        return _relative_position_bucket(relative_position, true);
+    }
+};
+
+// Bundles the T5 unigram tokenizer with a T5Runner so that a prompt string can
+// be turned into token ids / weights and then encoded.
+struct T5Embedder {
+    T5UniGramTokenizer tokenizer;
+    T5Runner model;
+
+    T5Embedder(ggml_backend_t backend,
+               ggml_type wtype,
+               int64_t num_layers = 24,
+               int64_t model_dim  = 4096,
+               int64_t ff_dim     = 10240,
+               int64_t num_heads  = 64,
+               int64_t vocab_size = 32128)
+        : model(backend, wtype, num_layers, model_dim, ff_dim, num_heads, vocab_size) {
+    }
+
+    // Collects the runner's parameter tensors into `tensors`, with names prefixed by `prefix`.
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+        model.get_param_tensors(tensors, prefix);
+    }
+
+    void alloc_params_buffer() {
+        model.alloc_params_buffer();
+    }
+
+    // Splits `text` into weighted segments with parse_prompt_attention(),
+    // encodes each segment, appends the EOS token (id 1, weight 1.0) and
+    // finally lets the tokenizer pad the sequences.
+    //
+    // Returns {token ids, per-token weights}; every token of a segment carries
+    // that segment's weight. `max_length` and `padding` are passed straight to
+    // tokenizer.pad_tokens().
+    std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
+                                                             size_t max_length = 0,
+                                                             bool padding      = false) {
+        auto parsed_attention = parse_prompt_attention(text);
+
+        {
+            std::stringstream ss;
+            ss << "[";
+            for (const auto& item : parsed_attention) {
+                ss << "['" << item.first << "', " << item.second << "], ";
+            }
+            ss << "]";
+            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+        }
+
+        std::vector<int> tokens;
+        std::vector<float> weights;
+        for (const auto& item : parsed_attention) {
+            const std::string& curr_text = item.first;
+            float curr_weight            = item.second;
+            std::vector<int> curr_tokens = tokenizer.Encode(curr_text, false);
+            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
+            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
+        }
+
+        int EOS_TOKEN_ID = 1;
+        tokens.push_back(EOS_TOKEN_ID);
+        weights.push_back(1.0);
+
+        tokenizer.pad_tokens(tokens, weights, max_length, padding);
+
+        // for (int i = 0; i < tokens.size(); i++) {
+        //     std::cout << tokens[i] << ":" << weights[i] << ", ";
+        // }
+        // std::cout << std::endl;
+
+        return {tokens, weights};
+    }
+
+    // Smoke test: tokenizes a fixed prompt padded to 77 tokens, runs the
+    // encoder with 8 threads and prints the resulting tensor and the elapsed time.
+    void test() {
+        struct ggml_init_params params;
+        params.mem_size   = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
+        params.mem_buffer = NULL;
+        params.no_alloc   = false;
+
+        struct ggml_context* work_ctx = ggml_init(params);
+        GGML_ASSERT(work_ctx != NULL);
+
+        {
+            // cpu f16: pass
+            // cpu f32: pass
+            // cuda f16: nan
+            // cuda f32: pass
+            // cuda q8_0: nan
+            // TODO: fix cuda nan
+            std::string text("a lovely cat");
+            auto tokens_and_weights  = tokenize(text, 77, true);
+            std::vector<int>& tokens = tokens_and_weights.first;
+            for (auto token : tokens) {
+                printf("%d ", token);
+            }
+            printf("\n");
+            auto input_ids          = vector_to_ggml_tensor_i32(work_ctx, tokens);
+            struct ggml_tensor* out = NULL;
+
+            int t0 = ggml_time_ms();
+            model.compute(8, input_ids, &out, work_ctx);
+            int t1 = ggml_time_ms();
+
+            print_ggml_tensor(out);
+            LOG_DEBUG("t5 test done in %dms", t1 - t0);
+        }
+
+        // The scratch context was created by this function and nothing
+        // allocated from it is used past this point, so release it here.
+        ggml_free(work_ctx);
+    }
+
+    // Creates a CPU-backed F32 embedder, loads its weights from `file_path`
+    // and runs test(). Logs an error and returns early if loading fails.
+    // NOTE(review): `backend` is never released in this function -- confirm
+    // whether GGMLRunner takes ownership of it before adding a free here.
+    static void load_from_file_and_test(const std::string& file_path) {
+        // ggml_backend_t backend    = ggml_backend_cuda_init(0);
+        ggml_backend_t backend         = ggml_backend_cpu_init();
+        ggml_type model_data_type      = GGML_TYPE_F32;
+        std::shared_ptr<T5Embedder> t5 = std::shared_ptr<T5Embedder>(new T5Embedder(backend, model_data_type));
+        {
+            LOG_INFO("loading from '%s'", file_path.c_str());
+
+            t5->alloc_params_buffer();
+            std::map<std::string, ggml_tensor*> tensors;
+            t5->get_param_tensors(tensors, "");
+
+            ModelLoader model_loader;
+            if (!model_loader.init_from_file(file_path)) {
+                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
+                return;
+            }
+
+            bool success = model_loader.load_tensors(tensors, backend);
+
+            if (!success) {
+                LOG_ERROR("load tensors from model loader failed");
+                return;
+            }
+
+            LOG_INFO("t5 model loaded");
+        }
+        t5->test();
+    }
+};
+
+#endif  // __T5_HPP__
--- a/otherarch/sdcpp/tae.hpp
+++ b/otherarch/sdcpp/tae.hpp
@ -183,7 +183,7 @@ public:
    }
 };

-struct TinyAutoEncoder : public GGMLModule {
+struct TinyAutoEncoder : public GGMLRunner {
    TAESD taesd;
    bool decode_only = false;

@ -192,7 +192,7 @@ struct TinyAutoEncoder : public GGMLModule {
                    bool decoder_only = true)
        : decode_only(decoder_only),
          taesd(decode_only),
-          GGMLModule(backend, wtype) {
+          GGMLRunner(backend, wtype) {
        taesd.init(params_ctx, wtype);
    }

@ -200,16 +200,8 @@ struct TinyAutoEncoder : public GGMLModule {
        return "taesd";
    }

-    size_t get_params_mem_size() {
-        return taesd.get_params_mem_size();
-    }
-
-    size_t get_params_num() {
-        return taesd.get_params_num();
-    }
-
    bool load_from_file(const std::string& file_path) {
-        LOG_INFO("loading taesd from '%s'", file_path.c_str());
+        LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
        alloc_params_buffer();
        std::map<std::string, ggml_tensor*> taesd_tensors;
        taesd.get_param_tensors(taesd_tensors);
@ -252,7 +244,7 @@ struct TinyAutoEncoder : public GGMLModule {
            return build_graph(z, decode_graph);
        };

-        GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
+        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }
 };

--- a/otherarch/sdcpp/thirdparty/LICENSE.darts_clone.txt
+++ b/otherarch/sdcpp/thirdparty/LICENSE.darts_clone.txt
@ -0,0 +1,10 @@
+Copyright (c) 2008-2011, Susumu Yata
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
+- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
+- Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/otherarch/sdcpp/thirdparty/darts.h
+++ b/otherarch/sdcpp/thirdparty/darts.h
--- a/otherarch/sdcpp/thirdparty/zip.c
+++ b/otherarch/sdcpp/thirdparty/zip.c
@ -36,6 +36,7 @@
 #include <unistd.h>
 #endif

+#define USE_EXTERNAL_MZCRC
 #include "miniz.h"
 #include "zip.h"

@ -1834,3 +1835,234 @@ int zip_extract(const char *zipname, const char *dir,

  return zip_archive_extract(&zip_archive, dir, on_extract, arg);
 }
+
+#if defined(__SSE4_2__) || defined(__AVX512F__)
+#include <immintrin.h>
+#endif
+
+// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
+// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
+// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+//  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+//
+// Updates a CRC-32 with `len` bytes read from `buf`.
+//
+// init: CRC value to continue from. It is bit-inverted on entry and the
+//       result is bit-inverted again on exit, so a value returned by this
+//       function can be passed back in as `init` for the next chunk.
+// buf:  input bytes; not checked for NULL, it is read whenever len > 0.
+// len:  number of bytes to process.
+// Returns the updated CRC, masked to 32 bits.
+//
+// The input is consumed in up to three stages, each one picking up where the
+// previous stage stopped:
+//   1. AVX-512 + VPCLMULQDQ, compiled in only when those targets are enabled
+//      and entered only when at least 256 bytes remain;
+//   2. SSE4.2 + PCLMUL, entered only when at least 64 bytes remain;
+//   3. a byte-at-a-time table lookup for whatever is left.
+mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
+  uint32_t crc = ~init;
+#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
+  if (len >= 256) {
+    // Carry-less multiplication constants for the folding steps below;
+    // presumably derived as described in the paper cited above.
+    _Alignas(__m512) static const uint64_t k1k2[] = {
+        0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
+        0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
+    };
+    _Alignas(__m512) static const uint64_t k3k4[] = {
+        0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
+        0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
+    };
+    _Alignas(__m512) static const uint64_t k5k6[] = {
+        0x01751997d0,
+        0x00ccaa009e,
+    };
+    _Alignas(__m512) static const uint64_t k7k8[] = {
+        0x0163cd6124,
+        0x0000000000,
+    };
+    _Alignas(__m512) static const uint64_t poly[] = {
+        0x01db710641,
+        0x01f7011641,
+    };
+    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+    __m128i a0, a1, a2, a3;
+    // Load the first 256 bytes into four 512-bit accumulators and mix the
+    // running crc into the lowest 32 bits of the first one.
+    x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
+    x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
+    x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
+    x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
+    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+    x0 = _mm512_load_si512((__m512i *)k1k2);
+    buf += 256;
+    len -= 256;
+    // Main loop: fold each accumulator with k1k2 and xor in the next 256 bytes.
+    while (len >= 256) {
+      x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+      x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+      x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+      x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
+      x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+      x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+      x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+      x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
+      y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
+      y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
+      y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
+      y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
+      x1 = _mm512_xor_si512(x1, x5);
+      x2 = _mm512_xor_si512(x2, x6);
+      x3 = _mm512_xor_si512(x3, x7);
+      x4 = _mm512_xor_si512(x4, x8);
+      x1 = _mm512_xor_si512(x1, y5);
+      x2 = _mm512_xor_si512(x2, y6);
+      x3 = _mm512_xor_si512(x3, y7);
+      x4 = _mm512_xor_si512(x4, y8);
+      buf += 256;
+      len -= 256;
+    }
+    // Collapse the four accumulators into x1, folding with k3k4 each time.
+    x0 = _mm512_load_si512((__m512i *)k3k4);
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x1 = _mm512_xor_si512(x1, x2);
+    x1 = _mm512_xor_si512(x1, x5);
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x1 = _mm512_xor_si512(x1, x3);
+    x1 = _mm512_xor_si512(x1, x5);
+    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+    x1 = _mm512_xor_si512(x1, x4);
+    x1 = _mm512_xor_si512(x1, x5);
+    // Keep folding single 64-byte chunks while any remain.
+    while (len >= 64) {
+      x2 = _mm512_loadu_si512((__m512i *)buf);
+      x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+      x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+      x1 = _mm512_xor_si512(x1, x2);
+      x1 = _mm512_xor_si512(x1, x5);
+      buf += 64;
+      len -= 64;
+    }
+    // Reduce the 512-bit accumulator to 128 bits by folding its four
+    // 128-bit lanes together with k5k6.
+    a0 = _mm_load_si128((__m128i *)k5k6);
+    a1 = _mm512_extracti32x4_epi32(x1, 0);
+    a2 = _mm512_extracti32x4_epi32(x1, 1);
+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
+    a1 = _mm_xor_si128(a1, a3);
+    a1 = _mm_xor_si128(a1, a2);
+    a2 = _mm512_extracti32x4_epi32(x1, 2);
+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
+    a1 = _mm_xor_si128(a1, a3);
+    a1 = _mm_xor_si128(a1, a2);
+    a2 = _mm512_extracti32x4_epi32(x1, 3);
+    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
+    a1 = _mm_xor_si128(a1, a3);
+    a1 = _mm_xor_si128(a1, a2);
+    // Final reduction of the 128-bit value down to the 32-bit crc, using
+    // k7k8 and then the `poly` constants; the result is dword 1 of a1.
+    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
+    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
+    a1 = _mm_srli_si128(a1, 8);
+    a1 = _mm_xor_si128(a1, a2);
+    a0 = _mm_loadl_epi64((__m128i *)k7k8);
+    a2 = _mm_srli_si128(a1, 4);
+    a1 = _mm_and_si128(a1, a3);
+    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
+    a1 = _mm_xor_si128(a1, a2);
+    a0 = _mm_load_si128((__m128i *)poly);
+    a2 = _mm_and_si128(a1, a3);
+    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
+    a2 = _mm_and_si128(a2, a3);
+    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
+    a1 = _mm_xor_si128(a1, a2);
+    crc = _mm_extract_epi32(a1, 1);
+  }
+#endif
+#if defined(__SSE4_2__) && defined(__PCLMUL__)
+  // 128-bit variant of the same scheme. After the AVX-512 stage fewer than
+  // 64 bytes remain, so this stage is then skipped.
+  if (len >= 64) {
+    _Alignas(__m128) static const uint64_t k1k2[] = {
+        0x0154442bd4,
+        0x01c6e41596,
+    };
+    _Alignas(__m128) static const uint64_t k3k4[] = {
+        0x01751997d0,
+        0x00ccaa009e,
+    };
+    _Alignas(__m128) static const uint64_t k5k0[] = {
+        0x0163cd6124,
+        0x0000000000,
+    };
+    _Alignas(__m128) static const uint64_t poly[] = {
+        0x01db710641,
+        0x01f7011641,
+    };
+    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+    // Load the first 64 bytes into four 128-bit accumulators and mix the
+    // running crc into the lowest 32 bits of the first one.
+    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
+    x0 = _mm_load_si128((__m128i *)k1k2);
+    buf += 64;
+    len -= 64;
+    // Main loop: fold each accumulator with k1k2 and xor in the next 64 bytes.
+    while (len >= 64) {
+      x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+      x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
+      x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
+      x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
+      x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+      x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
+      x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
+      x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
+      y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
+      y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
+      y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
+      y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
+      x1 = _mm_xor_si128(x1, x5);
+      x2 = _mm_xor_si128(x2, x6);
+      x3 = _mm_xor_si128(x3, x7);
+      x4 = _mm_xor_si128(x4, x8);
+      x1 = _mm_xor_si128(x1, y5);
+      x2 = _mm_xor_si128(x2, y6);
+      x3 = _mm_xor_si128(x3, y7);
+      x4 = _mm_xor_si128(x4, y8);
+      buf += 64;
+      len -= 64;
+    }
+    // Collapse the four accumulators into x1, folding with k3k4 each time.
+    x0 = _mm_load_si128((__m128i *)k3k4);
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x2);
+    x1 = _mm_xor_si128(x1, x5);
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x3);
+    x1 = _mm_xor_si128(x1, x5);
+    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+    x1 = _mm_xor_si128(x1, x4);
+    x1 = _mm_xor_si128(x1, x5);
+    // Keep folding single 16-byte chunks while any remain.
+    while (len >= 16) {
+      x2 = _mm_loadu_si128((__m128i *)buf);
+      x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
+      x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
+      x1 = _mm_xor_si128(x1, x2);
+      x1 = _mm_xor_si128(x1, x5);
+      buf += 16;
+      len -= 16;
+    }
+    // Final reduction of the 128-bit value down to the 32-bit crc, using
+    // k5k0 and then the `poly` constants; the result is dword 1 of x1.
+    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
+    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
+    x1 = _mm_srli_si128(x1, 8);
+    x1 = _mm_xor_si128(x1, x2);
+    x0 = _mm_loadl_epi64((__m128i *)k5k0);
+    x2 = _mm_srli_si128(x1, 4);
+    x1 = _mm_and_si128(x1, x3);
+    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
+    x1 = _mm_xor_si128(x1, x2);
+    x0 = _mm_load_si128((__m128i *)poly);
+    x2 = _mm_and_si128(x1, x3);
+    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
+    x2 = _mm_and_si128(x2, x3);
+    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
+    x1 = _mm_xor_si128(x1, x2);
+    crc = _mm_extract_epi32(x1, 1);
+  }
+#endif
+  // Byte-wise fallback for the remaining bytes (or for all of them when no
+  // SIMD stage ran). The table is filled on first use; tab[255] doubles as
+  // the "already initialised" flag.
+  // NOTE(review): this lazy initialisation is not synchronised -- confirm
+  // that concurrent first calls are acceptable for the callers of this file.
+  static uint32_t tab[256];
+  if (!tab[255]) {
+    // generates table for byte-wise crc calculation on the polynomial
+    // x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
+    uint32_t polynomial = 0xedb88320;  // bits are reversed
+    for (int d = 0; d < 256; ++d) {
+      uint32_t r = d;
+      for (int i = 0; i < 8; ++i)
+        r = r >> 1 ^ (r & 1 ? polynomial : 0);
+      tab[d] = r;
+    }
+  }
+  for (size_t i = 0; i < len; ++i)
+    crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
+  // Undo the initial inversion and keep only the low 32 bits.
+  return ~crc & 0xffffffff;
+}
--- a/otherarch/sdcpp/unet.hpp
+++ b/otherarch/sdcpp/unet.hpp
@ -166,7 +166,7 @@ public:
 // ldm.modules.diffusionmodules.openaimodel.UNetModel
 class UnetModelBlock : public GGMLBlock {
 protected:
-    SDVersion version = VERSION_1_x;
+    SDVersion version = VERSION_SD1;
    // network hparams
    int in_channels                        = 4;
    int out_channels                       = 4;
@ -177,19 +177,19 @@ protected:
    int time_embed_dim                     = 1280;  // model_channels*4
    int num_heads                          = 8;
    int num_head_channels                  = -1;   // channels // num_heads
-    int context_dim                        = 768;  // 1024 for VERSION_2_x, 2048 for VERSION_XL
+    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL

 public:
    int model_channels  = 320;
-    int adm_in_channels = 2816;  // only for VERSION_XL/SVD
+    int adm_in_channels = 2816;  // only for VERSION_SDXL/SVD

-    UnetModelBlock(SDVersion version = VERSION_1_x)
+    UnetModelBlock(SDVersion version = VERSION_SD1)
        : version(version) {
-        if (version == VERSION_2_x) {
+        if (version == VERSION_SD2) {
            context_dim       = 1024;
            num_head_channels = 64;
            num_heads         = -1;
-        } else if (version == VERSION_XL) {
+        } else if (version == VERSION_SDXL) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
            channel_mult          = {1, 2, 4};
@ -211,7 +211,7 @@ public:
        // time_embed_1 is nn.SiLU()
        blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

-        if (version == VERSION_XL || version == VERSION_SVD) {
+        if (version == VERSION_SDXL || version == VERSION_SVD) {
            blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
            // label_emb_1 is nn.SiLU()
            blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
@ -528,14 +528,13 @@ public:
    }
 };

-struct UNetModel : public GGMLModule {
-    SDVersion version = VERSION_1_x;
+struct UNetModelRunner : public GGMLRunner {
    UnetModelBlock unet;

-    UNetModel(ggml_backend_t backend,
-              ggml_type wtype,
-              SDVersion version = VERSION_1_x)
-        : GGMLModule(backend, wtype), unet(version) {
+    UNetModelRunner(ggml_backend_t backend,
+                    ggml_type wtype,
+                    SDVersion version = VERSION_SD1)
+        : GGMLRunner(backend, wtype), unet(version) {
        unet.init(params_ctx, wtype);
    }

@ -543,14 +542,6 @@ struct UNetModel : public GGMLModule {
        return "unet";
    }

-    size_t get_params_mem_size() {
-        return unet.get_params_mem_size();
-    }
-
-    size_t get_params_num() {
-        return unet.get_params_num();
-    }
-
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        unet.get_param_tensors(tensors, prefix);
    }
@ -613,7 +604,7 @@ struct UNetModel : public GGMLModule {
            return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
        };

-        GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
+        GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
    }

    void test() {
@ -655,7 +646,7 @@ struct UNetModel : public GGMLModule {
            print_ggml_tensor(out);
            LOG_DEBUG("unet test done in %dms", t1 - t0);
        }
-    };
+    }
 };

 #endif  // __UNET_HPP__
--- a/otherarch/sdcpp/upscaler.cpp
+++ b/otherarch/sdcpp/upscaler.cpp
@ -21,12 +21,17 @@ struct UpscalerGGML {
 #endif
 #ifdef SD_USE_METAL
        LOG_DEBUG("Using Metal backend");
+        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
        backend = ggml_backend_metal_init();
 #endif
 #ifdef SD_USE_VULKAN
        LOG_DEBUG("Using Vulkan backend");
        backend = ggml_backend_vk_init(0);
 #endif
+#ifdef SD_USE_SYCL
+        LOG_DEBUG("Using SYCL backend");
+        backend = ggml_backend_sycl_init(0);
+#endif

        if (!backend) {
            LOG_DEBUG("Using CPU backend");
--- a/otherarch/sdcpp/util.cpp
+++ b/otherarch/sdcpp/util.cpp
@ -28,6 +28,9 @@
 #include "ggml.h"
 #include "stable-diffusion.h"

+// #define STB_IMAGE_RESIZE_IMPLEMENTATION //already defined
+#include "stb_image_resize.h"
+
 bool ends_with(const std::string& str, const std::string& ending) {
    if (str.length() >= ending.length()) {
        return (str.compare(str.length() - ending.length(), ending.length(), ending) == 0);
@ -98,6 +101,43 @@ std::string get_full_path(const std::string& dir, const std::string& filename) {
    }
 }

+// Lists the regular files (sub-directories are skipped) found directly inside `dir`
+// and returns their paths sorted in ascending order.
+//
+// `dir` is always appended to the current working directory, so every returned path
+// has the form "<cwd>\<dir>\<file name>". An empty vector is returned when the working
+// directory cannot be queried or when nothing matches "<cwd>\<dir>\*".
+// NOTE(review): an absolute `dir` is still prefixed with the working directory — confirm
+// that callers only ever pass relative paths.
+std::vector<std::string> get_files_from_dir(const std::string& dir) {
+    std::vector<std::string> files;
+
+    char currentDirectory[MAX_PATH];
+    const DWORD cwdLength = GetCurrentDirectory(MAX_PATH, currentDirectory);
+    if (cwdLength == 0 || cwdLength >= MAX_PATH) {
+        // 0 signals failure, a value >= MAX_PATH means the buffer was too small;
+        // in both cases the buffer content must not be used.
+        printf("Unable to find directory.\n");
+        return files;
+    }
+
+    // Built with std::string so that a long `dir` cannot overflow a fixed-size buffer.
+    const std::string directoryPath = std::string(currentDirectory) + "\\" + dir;
+    const std::string searchPattern = directoryPath + "\\*";
+
+    // Find the first entry in the directory
+    WIN32_FIND_DATA findFileData;
+    HANDLE hFind = FindFirstFile(searchPattern.c_str(), &findFileData);
+
+    // Check if the directory was found
+    if (hFind == INVALID_HANDLE_VALUE) {
+        printf("Unable to find directory.\n");
+        return files;
+    }
+
+    // Loop through all entries, keeping only those that are not directories
+    do {
+        if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
+            files.push_back(directoryPath + "\\" + std::string(findFileData.cFileName));
+        }
+    } while (FindNextFile(hFind, &findFileData) != 0);
+
+    // Close the search handle
+    FindClose(hFind);
+
+    sort(files.begin(), files.end());
+
+    return files;
+}
+
 #else  // Unix
 #include <dirent.h>
 #include <sys/stat.h>
@ -112,6 +152,7 @@ bool is_directory(const std::string& path) {
    return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
 }

+// TODO: add windows version
 std::string get_full_path(const std::string& dir, const std::string& filename) {
    DIR* dp = opendir(dir.c_str());

@ -131,6 +172,27 @@ std::string get_full_path(const std::string& dir, const std::string& filename) {
    return "";
 }

+// Collects "<dir>/<entry>" for every entry of `dir` that is_directory() does not
+// report as a directory, and returns the paths sorted in ascending order.
+// Returns an empty vector when `dir` cannot be opened.
+std::vector<std::string> get_files_from_dir(const std::string& dir) {
+    std::vector<std::string> result;
+
+    DIR* handle = opendir(dir.c_str());
+    if (handle == nullptr) {
+        return result;
+    }
+
+    for (struct dirent* item = readdir(handle); item != nullptr; item = readdir(handle)) {
+        const std::string candidate = dir + "/" + item->d_name;
+        if (is_directory(candidate)) {
+            continue;  // skip sub-directories (this also drops "." and "..")
+        }
+        result.push_back(candidate);
+    }
+    closedir(handle);
+
+    sort(result.begin(), result.end());
+    return result;
+}
+
 #endif

 // get_num_physical_cores is copy from
@ -171,6 +233,9 @@ int32_t sd_get_num_physical_cores() {
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

+// Optional progress callback and the opaque pointer that is passed back to it.
+// Both start out as NULL and are assigned by sd_set_progress_callback(); when the
+// callback is set, pretty_progress() forwards (step, steps, time, data) to it and
+// returns early.
+// NOTE(review): sd_progress_cb_data is not `static`, unlike sd_progress_cb — confirm
+// whether external linkage is intended.
+static sd_progress_cb_t sd_progress_cb = NULL;
+void* sd_progress_cb_data              = NULL;
+
 std::u32string utf8_to_utf32(const std::string& utf8_str) {
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return converter.from_bytes(utf8_str);
@ -214,9 +279,46 @@ std::string path_join(const std::string& p1, const std::string& p2) {
    return p1 + "/" + p2;
 }

+// Resizes `img` to a 224x224, 3-channel image with stbir_resize_uint8 and returns it
+// as a newly allocated sd_image_t.
+//
+// Returns NULL (after printing a message to stderr) when `img` or its pixel data is
+// NULL, when the input does not have exactly 3 channels, when the output buffer cannot
+// be allocated, or when the resize itself fails.
+//
+// The returned struct is allocated with `new` and its pixel buffer with `malloc`.
+// NOTE(review): presumably the caller releases both — confirm against the call sites.
+sd_image_t* preprocess_id_image(sd_image_t* img) {
+    const int shortest_edge     = 224;
+    const int size              = shortest_edge;
+    const uint32_t out_channels = 3;
+
+    if (img == NULL || img->data == NULL) {
+        fprintf(stderr, "%s: invalid input image \n ", __func__);
+        return NULL;
+    }
+
+    const uint32_t w = img->width;
+    const uint32_t h = img->height;
+    const uint32_t c = img->channel;
+
+    // The output buffer and the returned descriptor are fixed at 3 channels. Resizing
+    // with any other channel count would write past the buffer (c > 3) or leave part
+    // of it uninitialized (c < 3).
+    if (c != out_channels) {
+        fprintf(stderr, "%s: unsupported channel count %u (expected 3) \n ", __func__, (unsigned)c);
+        return NULL;
+    }
+
+    // 1. do resize using stb_resize functions
+
+    unsigned char* buf = (unsigned char*)malloc(sizeof(unsigned char) * out_channels * size * size);
+    if (buf == NULL) {
+        fprintf(stderr, "%s: failed to allocate the resize buffer \n ", __func__);
+        return NULL;
+    }
+    if (!stbir_resize_uint8(img->data, w, h, 0,
+                            buf, size, size, 0,
+                            c)) {
+        fprintf(stderr, "%s: resize operation failed \n ", __func__);
+        free(buf);  // do not leak the output buffer on failure
+        return NULL;
+    }
+
+    // 2. do center crop (likely unnecessary due to step 1)
+
+    // 3. do rescale
+
+    // 4. do normalize
+
+    // 3 and 4 will need to be done in float format.
+
+    return new sd_image_t{(uint32_t)shortest_edge,
+                          (uint32_t)shortest_edge,
+                          out_channels,
+                          buf};
+}
+
 static int sdloglevel = 0; //-1 = hide all, 0 = normal, 1 = showall
 static bool sdquiet = false;
 void pretty_progress(int step, int steps, float time) {
+    if (sd_progress_cb) {
+        sd_progress_cb(step, steps, time, sd_progress_cb_data);
+        return;
+    }
    if (step == 0) {
        return;
    }
@ -296,23 +398,13 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo
    va_list args;
    va_start(args, format);

-    const char* level_str = "DEBUG";
-    if (level == SD_LOG_INFO) {
-        level_str = "INFO ";
-    } else if (level == SD_LOG_WARN) {
-        level_str = "WARN ";
-    } else if (level == SD_LOG_ERROR) {
-        level_str = "ERROR";
-    }
-
-    static char log_buffer[LOG_BUFFER_SIZE];
-
-    int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "[%s] %s:%-4d - ", level_str, sd_basename(file).c_str(), line);
+    static char log_buffer[LOG_BUFFER_SIZE + 1];
+    int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "%s:%-4d - ", sd_basename(file).c_str(), line);

    if (written >= 0 && written < LOG_BUFFER_SIZE) {
        vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
-        strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer) - 1);
    }
+    strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer));

    if (sd_log_cb) {
        sd_log_cb(level, log_buffer, sd_log_cb_data);
@ -325,7 +417,10 @@ void sd_set_log_callback(sd_log_cb_t cb, void* data) {
    sd_log_cb      = cb;
    sd_log_cb_data = data;
 }
-
+// Registers `cb` as the progress callback together with `data`, the opaque pointer
+// that pretty_progress() hands back to it. Passing a NULL `cb` makes
+// pretty_progress() fall back to its built-in reporting.
+void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
+    sd_progress_cb_data = data;
+    sd_progress_cb      = cb;
+}
 const char* sd_get_system_info() {
    static char buffer[1024];
    std::stringstream ss;
@ -499,4 +594,111 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
    }

    return result;
+}
+
+// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
+//
+// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
+// Accepted tokens are:
+//   (abc) - increases attention to abc by a multiplier of 1.1
+//   (abc:3.12) - increases attention to abc by a multiplier of 3.12
+//   [abc] - decreases attention to abc by a multiplier of 1.1
+//   \( - literal character '('
+//   \[ - literal character '['
+//   \) - literal character ')'
+//   \] - literal character ']'
+//   \\ - literal character '\'
+//   anything else - just text
+//
+// >>> parse_prompt_attention('normal text')
+// [['normal text', 1.0]]
+// >>> parse_prompt_attention('an (important) word')
+// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+// >>> parse_prompt_attention('(unbalanced')
+// [['unbalanced', 1.1]]
+// >>> parse_prompt_attention('\(literal\]')
+// [['(literal]', 1.0]]
+// >>> parse_prompt_attention('(unnecessary)(parens)')
+// [['unnecessaryparens', 1.1]]
+// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+// [['a ', 1.0],
+//  ['house', 1.5730000000000004],
+//  [' ', 1.1],
+//  ['on', 1.0],
+//  [' a ', 1.1],
+//  ['hill', 0.55],
+//  [', sun, ', 1.1],
+//  ['sky', 1.4641000000000006],
+//  ['.', 1.1]]
+// Splits `text` into (fragment, weight) pairs according to the attention syntax
+// described above. Adjacent fragments with equal weight are merged, and the result
+// always contains at least one pair ({"", 1.0} for input without any text).
+//
+// Notes on edge cases:
+//   - every two-character escape (\( \) \[ \] \\) yields the bare second character;
+//     a lone backslash that escapes nothing is kept as literal text
+//   - ":<weight>)" without an open "(" or with an unparsable weight (e.g. ":.)")
+//     is kept as literal text instead of being dropped or throwing
+//   - the BREAK keyword of the referenced Python implementation is not handled here
+std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
+    std::vector<std::pair<std::string, float>> res;
+    // Index into `res` at which each still-open bracket started.
+    std::vector<size_t> round_brackets;
+    std::vector<size_t> square_brackets;
+
+    const float round_bracket_multiplier  = 1.1f;
+    const float square_bracket_multiplier = 1 / 1.1f;
+
+    const std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
+
+    // Scales the weight of every fragment emitted since `start_position`.
+    auto multiply_range = [&](size_t start_position, float multiplier) {
+        for (size_t p = start_position; p < res.size(); ++p) {
+            res[p].second *= multiplier;
+        }
+    };
+
+    // Every alternative of the pattern consumes at least one character, so iterating
+    // over the matches walks the whole input without copying the remaining text.
+    const std::sregex_iterator matches_end;
+    for (std::sregex_iterator it(text.begin(), text.end(), re_attention); it != matches_end; ++it) {
+        const std::string token      = (*it)[0];
+        const std::string weight_str = (*it)[1];
+
+        // Parse the explicit weight up front; a malformed number downgrades the
+        // token to plain text instead of propagating an exception.
+        bool has_weight    = false;
+        float weight_value = 1.0f;
+        if (!weight_str.empty()) {
+            try {
+                weight_value = std::stof(weight_str);
+                has_weight   = true;
+            } catch (const std::exception&) {
+                has_weight = false;
+            }
+        }
+
+        if (token.size() == 2 && token.front() == '\\') {
+            // escaped literal: drop the backslash
+            res.push_back({token.substr(1), 1.0f});
+        } else if (token == "(") {
+            round_brackets.push_back(res.size());
+        } else if (token == "[") {
+            square_brackets.push_back(res.size());
+        } else if (has_weight && !round_brackets.empty()) {
+            multiply_range(round_brackets.back(), weight_value);
+            round_brackets.pop_back();
+        } else if (token == ")" && !round_brackets.empty()) {
+            multiply_range(round_brackets.back(), round_bracket_multiplier);
+            round_brackets.pop_back();
+        } else if (token == "]" && !square_brackets.empty()) {
+            multiply_range(square_brackets.back(), square_bracket_multiplier);
+            square_brackets.pop_back();
+        } else {
+            res.push_back({token, 1.0f});
+        }
+    }
+
+    // Brackets that were never closed still apply to everything after them.
+    for (size_t pos : round_brackets) {
+        multiply_range(pos, round_bracket_multiplier);
+    }
+
+    for (size_t pos : square_brackets) {
+        multiply_range(pos, square_bracket_multiplier);
+    }
+
+    if (res.empty()) {
+        res.push_back({"", 1.0f});
+    }
+
+    // Merge runs of fragments that ended up with exactly the same weight.
+    std::vector<std::pair<std::string, float>> merged;
+    for (const auto& item : res) {
+        if (!merged.empty() && merged.back().second == item.second) {
+            merged.back().first += item.first;
+        } else {
+            merged.push_back(item);
+        }
+    }
+
+    return merged;
 }
--- a/otherarch/sdcpp/util.h
+++ b/otherarch/sdcpp/util.h
@ -3,6 +3,7 @@

 #include <cstdint>
 #include <string>
+#include <vector>

 #include "stable-diffusion.h"

@ -18,10 +19,16 @@ bool file_exists(const std::string& filename);
 bool is_directory(const std::string& path);
 std::string get_full_path(const std::string& dir, const std::string& filename);

+std::vector<std::string> get_files_from_dir(const std::string& dir);
+
 std::u32string utf8_to_utf32(const std::string& utf8_str);
 std::string utf32_to_utf8(const std::u32string& utf32_str);
 std::u32string unicode_value_to_utf32(int unicode_value);

+sd_image_t* preprocess_id_image(sd_image_t* img);
+
+// std::string sd_basename(const std::string& path);
+
 typedef struct {
    uint32_t width;
    uint32_t height;
@ -45,6 +52,8 @@ void log_printf(sd_log_level_t level, const char* file, int line, const char* fo

 std::string trim(const std::string& s);

+std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
+
 void log_message(const char* format, ...);
 void set_sd_log_level(int log);
 bool get_sd_log_level();
--- a/otherarch/sdcpp/vae.hpp
+++ b/otherarch/sdcpp/vae.hpp
@ -6,7 +6,7 @@

 /*================================================== AutoEncoderKL ===================================================*/

-#define VAE_GRAPH_SIZE 10240
+#define VAE_GRAPH_SIZE 20480

 class ResnetBlock : public UnaryBlock {
 protected:
@ -439,6 +439,7 @@ class AutoencodingEngine : public GGMLBlock {
 protected:
    bool decode_only       = true;
    bool use_video_decoder = false;
+    bool use_quant         = true;
    int embed_dim          = 4;
    struct {
        int z_channels           = 4;
@ -453,15 +454,23 @@ protected:

 public:
    AutoencodingEngine(bool decode_only       = true,
-                       bool use_video_decoder = false)
+                       bool use_video_decoder = false,
+                       SDVersion version      = VERSION_SD1)
        : decode_only(decode_only), use_video_decoder(use_video_decoder) {
+        if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
+            dd_config.z_channels = 16;
+            use_quant            = false;
+        }
+        if (use_video_decoder) {
+            use_quant = false;
+        }
        blocks["decoder"] = std::shared_ptr<GGMLBlock>(new Decoder(dd_config.ch,
                                                                   dd_config.out_ch,
                                                                   dd_config.ch_mult,
                                                                   dd_config.num_res_blocks,
                                                                   dd_config.z_channels,
                                                                   use_video_decoder));
-        if (!use_video_decoder) {
+        if (use_quant) {
            blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
                                                                              embed_dim,
                                                                              {1, 1}));
@ -473,7 +482,7 @@ public:
                                                                       dd_config.in_channels,
                                                                       dd_config.z_channels,
                                                                       dd_config.double_z));
-            if (!use_video_decoder) {
+            if (use_quant) {
                int factor = dd_config.double_z ? 2 : 1;

                blocks["quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(embed_dim * factor,
@ -485,7 +494,7 @@ public:

    struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
        // z: [N, z_channels, h, w]
-        if (!use_video_decoder) {
+        if (use_quant) {
            auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
            z                    = post_quant_conv->forward(ctx, z);  // [N, z_channels, h, w]
        }
@ -502,7 +511,7 @@ public:
        auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);

        auto h = encoder->forward(ctx, x);  // [N, 2*z_channels, h/8, w/8]
-        if (!use_video_decoder) {
+        if (use_quant) {
            auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
            h               = quant_conv->forward(ctx, h);  // [N, 2*embed_dim, h/8, w/8]
        }
@ -510,15 +519,16 @@ public:
    }
 };

-struct AutoEncoderKL : public GGMLModule {
+struct AutoEncoderKL : public GGMLRunner {
    bool decode_only = true;
    AutoencodingEngine ae;

    AutoEncoderKL(ggml_backend_t backend,
                  ggml_type wtype,
                  bool decode_only       = false,
-                  bool use_video_decoder = false)
-        : decode_only(decode_only), ae(decode_only, use_video_decoder), GGMLModule(backend, wtype) {
+                  bool use_video_decoder = false,
+                  SDVersion version      = VERSION_SD1)
+        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend, wtype) {
        ae.init(params_ctx, wtype);
    }

@ -526,14 +536,6 @@ struct AutoEncoderKL : public GGMLModule {
        return "vae";
    }

-    size_t get_params_mem_size() {
-        return ae.get_params_mem_size();
-    }
-
-    size_t get_params_num() {
-        return ae.get_params_num();
-    }
-
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        ae.get_param_tensors(tensors, prefix);
    }
@ -560,7 +562,7 @@ struct AutoEncoderKL : public GGMLModule {
        };
        // ggml_set_f32(z, 0.5f);
        // print_ggml_tensor(z);
-        GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
+        GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
    }

    void test() {
--- a/otherarch/sdcpp/vocab.hpp
+++ b/otherarch/sdcpp/vocab.hpp