From 78bbe8995650551a4412a911a8f359575f01c8a3 Mon Sep 17 00:00:00 2001
From: Wagner Bruna <wbruna@users.noreply.github.com>
Date: Tue, 16 Dec 2025 05:16:48 -0300
Subject: [PATCH] sd: sync to master-417-43a70e8 (#1889)

* sd: sync to master-417-43a70e8

* fix sdmain build

* switch to upstream apply_loras()

* refactor u8 path conversions and add it to the gguf reader
---
 Makefile                             |    2 +-
 otherarch/sdcpp/clip.hpp             |  103 +-
 otherarch/sdcpp/common.hpp           |    4 +-
 otherarch/sdcpp/common/common.hpp    | 1804 ++++++++++++++++++++++++++
 otherarch/sdcpp/conditioner.hpp      |  128 +-
 otherarch/sdcpp/esrgan.hpp           |    3 +-
 otherarch/sdcpp/flux.hpp             |  134 +-
 otherarch/sdcpp/ggml_extend.hpp      |   76 +-
 otherarch/sdcpp/gguf_reader.hpp      |    2 +-
 otherarch/sdcpp/latent-preview.h     |   79 +-
 otherarch/sdcpp/llm.hpp              |   71 +-
 otherarch/sdcpp/main.cpp             | 1491 +--------------------
 otherarch/sdcpp/model.cpp            |   32 +-
 otherarch/sdcpp/model.h              |    2 +
 otherarch/sdcpp/rope.hpp             |   15 +-
 otherarch/sdcpp/sdtype_adapter.cpp   |   19 +-
 otherarch/sdcpp/stable-diffusion.cpp |  237 ++--
 otherarch/sdcpp/stable-diffusion.h   |   26 +-
 otherarch/sdcpp/upscaler.cpp         |   16 +-
 otherarch/sdcpp/util.cpp             |   37 +-
 otherarch/sdcpp/util.h               |    3 +-
 otherarch/sdcpp/version.cpp          |   20 +
 22 files changed, 2462 insertions(+), 1842 deletions(-)
 create mode 100644 otherarch/sdcpp/common/common.hpp
 create mode 100644 otherarch/sdcpp/version.cpp

diff --git a/Makefile b/Makefile
index f355367aa..ce78dd4b6 100644
--- a/Makefile
+++ b/Makefile
@@ -770,7 +770,7 @@ main: tools/completion/completion.cpp common/arg.cpp common/download.cpp build-i
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 mainvk: tools/completion/completion.cpp common/arg.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/version.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
diff --git a/otherarch/sdcpp/clip.hpp b/otherarch/sdcpp/clip.hpp
index 1f983271f..24c94f1bb 100644
--- a/otherarch/sdcpp/clip.hpp
+++ b/otherarch/sdcpp/clip.hpp
@@ -3,34 +3,10 @@
 
 #include "ggml_extend.hpp"
 #include "model.h"
+#include "tokenize_util.h"
 
 /*================================================== CLIPTokenizer ===================================================*/
 
-__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
-    std::regex re("<lora:([^:]+):([^>]+)>");
-    std::smatch matches;
-    std::unordered_map<std::string, float> filename2multiplier;
-
-    while (std::regex_search(text, matches, re)) {
-        std::string filename = matches[1].str();
-        float multiplier     = std::stof(matches[2].str());
-
-        text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
-
-        if (multiplier == 0.f) {
-            continue;
-        }
-
-        if (filename2multiplier.find(filename) == filename2multiplier.end()) {
-            filename2multiplier[filename] = multiplier;
-        } else {
-            filename2multiplier[filename] += multiplier;
-        }
-    }
-
-    return std::make_pair(filename2multiplier, text);
-}
-
 __STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
     std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
     std::set<int> byte_set;
@@ -72,6 +48,8 @@ private:
     int encoder_len;
     int bpe_len;
 
+    std::vector<std::string> special_tokens;
+
 public:
     const std::string UNK_TOKEN = "<|endoftext|>";
     const std::string BOS_TOKEN = "<|startoftext|>";
@@ -117,6 +95,15 @@ private:
         return pairs;
     }
 
+    // True when `token` is byte-for-byte equal to an entry of `special_tokens`
+    // (the list that add_special_token() appends to).
+    bool is_special_token(const std::string& token) {
+        // special_tokens is an unsorted vector, hence the linear search.
+        const auto first = special_tokens.begin();
+        const auto last  = special_tokens.end();
+        return std::find(first, last, token) != last;
+    }
+
 public:
     CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "")
         : PAD_TOKEN_ID(pad_token_id) {
@@ -125,6 +112,8 @@ public:
         } else {
             load_from_merges(ModelLoader::load_merges());
         }
+        add_special_token("<|startoftext|>");
+        add_special_token("<|endoftext|>");
     }
 
     void load_from_merges(const std::string& merges_utf8_str) {
@@ -201,6 +190,10 @@ public:
         }
     }
 
+    void add_special_token(const std::string& token) {
+        special_tokens.emplace_back(token);  // stored verbatim; no duplicate check is done here
+    }
+
     std::u32string bpe(const std::u32string& token) {
         std::vector<std::u32string> word;
 
@@ -379,25 +372,54 @@ public:
         return trim(text);
     }
 
+    // Splits `text` into the pre-BPE word pieces: English contractions, runs of
+    // letters, single digits, and runs of other non-space characters.
+    std::vector<std::string> token_split(const std::string& text) {
+        // Compiling a std::regex is costly; build it once and reuse it on every call.
+        static const std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
+                                    std::regex::icase);
+
+        std::vector<std::string> result;
+        for (std::sregex_iterator it(text.begin(), text.end(), pat), last; it != last; ++it) {
+            result.emplace_back(it->str());
+        }
+        return result;
+    }
+
     std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
         std::string original_text = text;
         std::vector<int32_t> bpe_tokens;
         text = whitespace_clean(text);
         std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
 
-        std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
-                       std::regex::icase);
-
-        std::smatch matches;
         std::string str = text;
         std::vector<std::string> token_strs;
-        while (std::regex_search(str, matches, pat)) {
-            bool skip = on_new_token_cb(str, bpe_tokens);
-            if (skip) {
+
+        auto splited_texts = split_with_special_tokens(text, special_tokens);
+
+        for (auto& splited_text : splited_texts) {
+            LOG_DEBUG("token %s", splited_text.c_str());
+            if (is_special_token(splited_text)) {
+                LOG_DEBUG("special %s", splited_text.c_str());
+                bool skip = on_new_token_cb(splited_text, bpe_tokens);
+                if (skip) {
+                    token_strs.push_back(splited_text);
+                    continue;
+                }
                 continue;
             }
-            for (auto& token : matches) {
-                std::string token_str = token.str();
+
+            auto tokens = token_split(splited_text);
+            for (auto& token : tokens) {
+                if (on_new_token_cb != nullptr) {
+                    bool skip = on_new_token_cb(token, bpe_tokens);
+                    if (skip) {
+                        token_strs.push_back(token);
+                        continue;
+                    }
+                }
+
+                std::string token_str = token;
                 std::u32string utf32_token;
                 for (int i = 0; i < token_str.length(); i++) {
                     unsigned char b = token_str[i];
@@ -417,14 +439,13 @@ public:
                 bpe_tokens.push_back(encoder[bpe_str]);
                 token_strs.push_back(utf32_to_utf8(bpe_str));
             }
-            str = matches.suffix();
         }
-        std::stringstream ss;
-        ss << "[";
-        for (auto token : token_strs) {
-            ss << "\"" << token << "\", ";
-        }
-        ss << "]";
+        // std::stringstream ss;
+        // ss << "[";
+        // for (auto token : token_strs) {
+        //     ss << "\"" << token << "\", ";
+        // }
+        // ss << "]";
         // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
         // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str());
         return bpe_tokens;
diff --git a/otherarch/sdcpp/common.hpp b/otherarch/sdcpp/common.hpp
index 33d499fb1..74b218ab7 100644
--- a/otherarch/sdcpp/common.hpp
+++ b/otherarch/sdcpp/common.hpp
@@ -194,10 +194,12 @@ public:
         auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
 
         x          = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
-        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
+        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false);
         x          = x_vec[0];  // [ne3, ne2, ne1, dim_out]
         auto gate  = x_vec[1];  // [ne3, ne2, ne1, dim_out]
 
+        gate = ggml_cont(ctx->ggml_ctx, gate);
+
         gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);
 
         x = ggml_mul(ctx->ggml_ctx, x, gate);  // [ne3, ne2, ne1, dim_out]
diff --git a/otherarch/sdcpp/common/common.hpp b/otherarch/sdcpp/common/common.hpp
new file mode 100644
index 000000000..e9be436be
--- /dev/null
+++ b/otherarch/sdcpp/common/common.hpp
@@ -0,0 +1,1804 @@
+
+#include <filesystem>
+#include <iostream>
+#include <map>
+#include <random>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <vector>
+
+//kcpp
+#include <nlohmann/json.hpp>
+using json   = nlohmann::json;
+namespace fs = std::filesystem;
+
+#if defined(_WIN32)
+#define NOMINMAX
+#include <windows.h>
+#endif  // _WIN32
+
+#include "stable-diffusion.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+//#define STB_IMAGE_STATIC
+#include "stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+//#define STB_IMAGE_WRITE_STATIC
+#include "stb_image_write.h"
+
+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+//#define STB_IMAGE_RESIZE_STATIC
+#include "stb_image_resize.h"
+
+#define SAFE_STR(s) ((s) ? (s) : "")
+#define BOOL_STR(b) ((b) ? "true" : "false")
+
+const char* modes_str[] = {  // mode names, listed in the same order as the SDMode enumerators below
+    "img_gen",
+    "vid_gen",
+    "convert",
+    "upscale",
+};
+#define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale"
+
+enum SDMode {
+    IMG_GEN,
+    VID_GEN,
+    CONVERT,
+    UPSCALE,
+    MODE_COUNT  // equals the number of modes listed above
+};
+
+#if defined(_WIN32)
+// Converts a UTF-16 wide string to UTF-8; throws std::runtime_error when the size query fails.
+static std::string utf16_to_utf8(const std::wstring& wstr) {
+    if (wstr.empty())
+        return {};
+    const int src_len = static_cast<int>(wstr.size());
+    // Sizing pass: called with a null output buffer and a length of 0 to obtain the byte count.
+    const int dst_len = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), src_len, nullptr, 0, nullptr, nullptr);
+    if (dst_len <= 0)
+        throw std::runtime_error("UTF-16 to UTF-8 conversion failed");
+    std::string utf8(dst_len, 0);
+    WideCharToMultiByte(CP_UTF8, 0, wstr.data(), src_len, &utf8[0], dst_len, nullptr, nullptr);
+    return utf8;
+}
+
+static std::string argv_to_utf8(int index, const char** argv) {  // Windows build: `argv` is not read here
+    int argc;
+    wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc);  // re-parses the wide command line on every call
+    if (!argv_w)
+        throw std::runtime_error("Failed to parse command line");
+
+    std::string result;
+    if (index < argc) {  // an index at or beyond argc yields an empty string
+        result = utf16_to_utf8(argv_w[index]);
+    }
+    LocalFree(argv_w);  // NOTE(review): not reached if utf16_to_utf8 throws above, so argv_w leaks on that path
+    return result;
+}
+
+#else  // Linux / macOS
+static std::string argv_to_utf8(int index, const char** argv) {
+    return argv[index];  // non-Windows: the bytes are handed back untouched (no bounds check on index)
+}
+
+#endif
+
+struct StringOption {
+    std::string short_name;  // short flag such as "-m"; left empty when the option has none
+    std::string long_name;   // long flag such as "--model"
+    std::string desc;        // help text shown by ArgOptions::print()
+    std::string* target;     // parse_options() stores the argument that follows the flag here
+};
+
+struct IntOption {
+    std::string short_name;  // short flag such as "-t"; left empty when the option has none
+    std::string long_name;   // long flag such as "--threads"
+    std::string desc;        // help text shown by ArgOptions::print()
+    int* target;             // parse_options() stores std::stoi(<argument after the flag>) here
+};
+
+struct FloatOption {
+    std::string short_name;  // short flag; left empty when the option has none
+    std::string long_name;   // long flag such as "--flow-shift"
+    std::string desc;        // help text shown by ArgOptions::print()
+    float* target;           // parse_options() stores std::stof(<argument after the flag>) here
+};
+
+struct BoolOption {
+    std::string short_name;  // short flag; left empty when the option has none
+    std::string long_name;   // long flag such as "--vae-tiling"
+    std::string desc;        // help text shown by ArgOptions::print()
+    bool keep_true;          // the value written to *target when the flag is present
+    bool* target;            // the flag takes no value; parse_options() only assigns keep_true to it
+};
+
+struct ManualOption {
+    std::string short_name;  // short flag; left empty when the option has none
+    std::string long_name;   // long flag such as "--type"
+    std::string desc;        // help text shown by ArgOptions::print()
+    std::function<int(int argc, const char** argv, int index)> cb;  // gets the flag's own index; <0 = error, else the number of extra argv entries it used
+};
+
+// Option tables of one component, plus the help printer that formats them.
+struct ArgOptions {
+    std::vector<StringOption> string_options;
+    std::vector<IntOption> int_options;
+    std::vector<FloatOption> float_options;
+    std::vector<BoolOption> bool_options;
+    std::vector<ManualOption> manual_options;
+
+    // Word-wraps `text` for display.
+    //   width     - column at which a line counts as full
+    //   indent    - spaces emitted at the start of every continuation line
+    //   first_col - column where the first line of `text` will start; the default 0
+    //               is for text that begins at the start of a line
+    // Newlines already present in `text` are kept and followed by the indentation.
+    static std::string wrap_text(const std::string& text, size_t width, size_t indent, size_t first_col = 0) {
+        const std::string pad(indent, ' ');
+        std::string out;
+        size_t line_start = 0;          // offset in `out` where the current line's own text begins
+        size_t line_len   = first_col;  // column reached so far on the current line
+        auto start_new_line = [&]() {
+            out += '\n';
+            out += pad;
+            line_start = out.size();
+            line_len   = indent;
+        };
+
+        for (const char c : text) {
+            if (c == '\n') {  // preserve manual newlines
+                start_new_line();
+                continue;
+            }
+            out += c;
+            ++line_len;
+            if (line_len < width) {
+                continue;
+            }
+            // The line is full. Break at its last space, considering only text that belongs to
+            // this line so the search never lands inside the indentation or an earlier line.
+            const size_t space = out.rfind(' ');
+            if (space != std::string::npos && space > line_start) {
+                const std::string carried = out.substr(space + 1);
+                out.erase(space);  // drops the space and the partial word after it
+                start_new_line();
+                out += carried;
+                line_len += carried.size();  // the carried word already occupies the new line
+            } else {
+                // No usable space: hard break at the current position.
+                start_new_line();
+            }
+        }
+
+        return out;
+    }
+
+    // Prints every option as "  <names> <hint>    <description>", wrapped at 120 columns.
+    void print() const {
+        constexpr size_t max_line_width = 120;
+
+        struct Entry {
+            std::string names;
+            std::string desc;
+        };
+        std::vector<Entry> entries;
+
+        auto add_entry = [&](const std::string& s, const std::string& l,
+                             const std::string& desc, const std::string& hint = "") {
+            std::ostringstream ss;
+            if (!s.empty())
+                ss << s;
+            if (!s.empty() && !l.empty())
+                ss << ", ";
+            if (!l.empty())
+                ss << l;
+            if (!hint.empty())
+                ss << " " << hint;
+            entries.push_back({ss.str(), desc});
+        };
+
+        for (const auto& o : string_options)
+            add_entry(o.short_name, o.long_name, o.desc, "<string>");
+        for (const auto& o : int_options)
+            add_entry(o.short_name, o.long_name, o.desc, "<int>");
+        for (const auto& o : float_options)
+            add_entry(o.short_name, o.long_name, o.desc, "<float>");
+        for (const auto& o : bool_options)
+            add_entry(o.short_name, o.long_name, o.desc, "");
+        for (const auto& o : manual_options)
+            add_entry(o.short_name, o.long_name, o.desc);
+
+        size_t max_name_width = 0;
+        for (const auto& e : entries)
+            max_name_width = std::max(max_name_width, e.names.size());
+
+        // Two leading spaces plus the name column (padded by 4) put every description at this column.
+        const size_t indent = 2 + max_name_width + 4;
+        for (const auto& e : entries) {
+            // `indent` is also passed as first_col: the first description line starts there too.
+            const std::string wrapped_desc = wrap_text(e.desc, max_line_width, indent, indent);
+            std::cout << "  " << std::left << std::setw(static_cast<int>(max_name_width) + 4)
+                      << e.names << wrapped_desc << "\n";
+        }
+    }
+};
+
+// Parses argv[1..argc) against every table in `options_list`. Returns false, after printing to
+// stderr, on an unknown option, a missing value, a non-numeric value or a failing manual callback.
+static bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& options_list) {
+    bool invalid_arg = false;
+    std::string arg;
+    // Runs apply_fn on the first entry of `opts` whose short or long name equals `arg`.
+    auto match_and_apply = [&](auto& opts, auto&& apply_fn) -> bool {
+        for (auto& option : opts) {
+            if ((option.short_name.size() > 0 && arg == option.short_name) ||
+                (option.long_name.size() > 0 && arg == option.long_name)) {
+                apply_fn(option);
+                return true;
+            }
+        }
+        return false;
+    };
+
+    for (int i = 1; i < argc; i++) {
+        arg            = argv[i];
+        bool found_arg = false;
+        // Advances `i` to the option's value and runs `store`. A missing value, or a std::stoi /
+        // std::stof failure inside `store`, flags the argument as invalid instead of escaping.
+        auto take_value = [&](auto&& store) {
+            if (++i >= argc) {
+                invalid_arg = true;
+                return;
+            }
+            try {
+                store();
+                found_arg = true;
+            } catch (const std::logic_error&) {  // base of std::invalid_argument and std::out_of_range
+                invalid_arg = true;
+            }
+        };
+
+        for (auto& options : options_list) {
+            if (match_and_apply(options.string_options, [&](auto& option) {
+                    take_value([&] { *option.target = argv_to_utf8(i, argv); });
+                }))
+                break;
+
+            if (match_and_apply(options.int_options, [&](auto& option) {
+                    take_value([&] { *option.target = std::stoi(argv[i]); });
+                }))
+                break;
+
+            if (match_and_apply(options.float_options, [&](auto& option) {
+                    take_value([&] { *option.target = std::stof(argv[i]); });
+                }))
+                break;
+
+            if (match_and_apply(options.bool_options, [&](auto& option) {
+                    *option.target = option.keep_true;
+                    found_arg      = true;
+                }))
+                break;
+
+            if (match_and_apply(options.manual_options, [&](auto& option) {
+                    int ret = option.cb(argc, argv, i);
+                    if (ret < 0) {
+                        invalid_arg = true;
+                        return;
+                    }
+                    i += ret;
+                    found_arg = true;
+                }))
+                break;
+        }
+
+        if (invalid_arg) {
+            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+            return false;
+        }
+        if (!found_arg) {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return false;
+        }
+    }
+    return true;
+}
+
+struct SDContextParams {
+    int n_threads = -1;
+    std::string model_path;
+    std::string clip_l_path;
+    std::string clip_g_path;
+    std::string clip_vision_path;
+    std::string t5xxl_path;
+    std::string llm_path;
+    std::string llm_vision_path;
+    std::string diffusion_model_path;
+    std::string high_noise_diffusion_model_path;
+    std::string vae_path;
+    std::string taesd_path;
+    std::string esrgan_path;
+    std::string control_net_path;
+    std::string embedding_dir;
+    std::string photo_maker_path;
+    sd_type_t wtype = SD_TYPE_COUNT;
+    std::string tensor_type_rules;
+    std::string lora_model_dir;
+
+    std::map<std::string, std::string> embedding_map;
+    std::vector<sd_embedding_t> embedding_vec;
+
+    rng_type_t rng_type         = CUDA_RNG;
+    rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
+    bool offload_params_to_cpu  = false;
+    bool control_net_cpu        = false;
+    bool clip_on_cpu            = false;
+    bool vae_on_cpu             = false;
+    bool diffusion_flash_attn   = false;
+    bool diffusion_conv_direct  = false;
+    bool vae_conv_direct        = false;
+
+    bool chroma_use_dit_mask = true;
+    bool chroma_use_t5_mask  = false;
+    int chroma_t5_mask_pad   = 1;
+
+    prediction_t prediction           = PREDICTION_COUNT;
+    lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
+
+    sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
+    bool force_sdxl_vae_conv_scale       = false;
+
+    float flow_shift = INFINITY;
+
+    ArgOptions get_options() {
+        ArgOptions options;
+        options.string_options = {
+            {"-m",
+             "--model",
+             "path to full model",
+             &model_path},
+            {"",
+             "--clip_l",
+             "path to the clip-l text encoder", &clip_l_path},
+            {"", "--clip_g",
+             "path to the clip-g text encoder",
+             &clip_g_path},
+            {"",
+             "--clip_vision",
+             "path to the clip-vision encoder",
+             &clip_vision_path},
+            {"",
+             "--t5xxl",
+             "path to the t5xxl text encoder",
+             &t5xxl_path},
+            {"",
+             "--llm",
+             "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
+             &llm_path},
+            {"",
+             "--llm_vision",
+             "path to the llm vit",
+             &llm_vision_path},
+            {"",
+             "--qwen2vl",
+             "alias of --llm. Deprecated.",
+             &llm_path},
+            {"",
+             "--qwen2vl_vision",
+             "alias of --llm_vision. Deprecated.",
+             &llm_vision_path},
+            {"",
+             "--diffusion-model",
+             "path to the standalone diffusion model",
+             &diffusion_model_path},
+            {"",
+             "--high-noise-diffusion-model",
+             "path to the standalone high noise diffusion model",
+             &high_noise_diffusion_model_path},
+            {"",
+             "--vae",
+             "path to standalone vae model",
+             &vae_path},
+            {"",
+             "--taesd",
+             "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
+             &taesd_path},
+            {"",
+             "--control-net",
+             "path to control net model",
+             &control_net_path},
+            {"",
+             "--embd-dir",
+             "embeddings directory",
+             &embedding_dir},
+            {"",
+             "--lora-model-dir",
+             "lora model directory",
+             &lora_model_dir},
+
+            {"",
+             "--tensor-type-rules",
+             "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
+             &tensor_type_rules},
+            {"",
+             "--photo-maker",
+             "path to PHOTOMAKER model",
+             &photo_maker_path},
+            {"",
+             "--upscale-model",
+             "path to esrgan model.",
+             &esrgan_path},
+        };
+
+        options.int_options = {
+            {"-t",
+             "--threads",
+             "number of threads to use during computation (default: -1). "
+             "If threads <= 0, then threads will be set to the number of CPU physical cores",
+             &n_threads},
+            {"",
+             "--chroma-t5-mask-pad",
+             "t5 mask pad size of chroma",
+             &chroma_t5_mask_pad},
+        };
+
+        options.float_options = {
+            {"",
+             "--vae-tile-overlap",
+             "tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
+             &vae_tiling_params.target_overlap},
+            {"",
+             "--flow-shift",
+             "shift value for Flow models like SD3.x or WAN (default: auto)",
+             &flow_shift},
+        };
+
+        options.bool_options = {
+            {"",
+             "--vae-tiling",
+             "process vae in tiles to reduce memory usage",
+             true, &vae_tiling_params.enabled},
+            {"",
+             "--force-sdxl-vae-conv-scale",
+             "force use of conv scale on sdxl vae",
+             true, &force_sdxl_vae_conv_scale},
+            {"",
+             "--offload-to-cpu",
+             "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed",
+             true, &offload_params_to_cpu},
+            {"",
+             "--control-net-cpu",
+             "keep controlnet in cpu (for low vram)",
+             true, &control_net_cpu},
+            {"",
+             "--clip-on-cpu",
+             "keep clip in cpu (for low vram)",
+             true, &clip_on_cpu},
+            {"",
+             "--vae-on-cpu",
+             "keep vae in cpu (for low vram)",
+             true, &vae_on_cpu},
+            {"",
+             "--diffusion-fa",
+             "use flash attention in the diffusion model",
+             true, &diffusion_flash_attn},
+            {"",
+             "--diffusion-conv-direct",
+             "use ggml_conv2d_direct in the diffusion model",
+             true, &diffusion_conv_direct},
+            {"",
+             "--vae-conv-direct",
+             "use ggml_conv2d_direct in the vae model",
+             true, &vae_conv_direct},
+            {"",
+             "--chroma-disable-dit-mask",
+             "disable dit mask for chroma",
+             false, &chroma_use_dit_mask},
+            {"",
+             "--chroma-enable-t5-mask",
+             "enable t5 mask for chroma",
+             true, &chroma_use_t5_mask},
+        };
+
+        auto on_type_arg = [&](int argc, const char** argv, int index) {
+            if (++index >= argc) {
+                return -1;
+            }
+            const char* arg = argv[index];
+            wtype           = str_to_sd_type(arg);
+            if (wtype == SD_TYPE_COUNT) {
+                fprintf(stderr, "error: invalid weight format %s\n",
+                        arg);
+                return -1;
+            }
+            return 1;
+        };
+
+        auto on_rng_arg = [&](int argc, const char** argv, int index) {
+            if (++index >= argc) {
+                return -1;
+            }
+            const char* arg = argv[index];
+            rng_type        = str_to_rng_type(arg);
+            if (rng_type == RNG_TYPE_COUNT) {
+                fprintf(stderr, "error: invalid rng type %s\n",
+                        arg);
+                return -1;
+            }
+            return 1;
+        };
+
+        auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) {
+            if (++index >= argc) {
+                return -1;
+            }
+            const char* arg  = argv[index];
+            sampler_rng_type = str_to_rng_type(arg);
+            if (sampler_rng_type == RNG_TYPE_COUNT) {
+                fprintf(stderr, "error: invalid sampler rng type %s\n",
+                        arg);
+                return -1;
+            }
+            return 1;
+        };
+
+        auto on_prediction_arg = [&](int argc, const char** argv, int index) {
+            if (++index >= argc) {
+                return -1;
+            }
+            const char* arg = argv[index];
+            prediction      = str_to_prediction(arg);
+            if (prediction == PREDICTION_COUNT) {
+                fprintf(stderr, "error: invalid prediction type %s\n",
+                        arg);
+                return -1;
+            }
+            return 1;
+        };
+
+        auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) {
+            if (++index >= argc) {  // the option needs a value after it
+                return -1;
+            }
+            const char* arg = argv[index];
+            lora_apply_mode = str_to_lora_apply_mode(arg);
+            if (lora_apply_mode == LORA_APPLY_MODE_COUNT) {
+                // The COUNT value is treated here as "name not recognised".
+                fprintf(stderr, "error: invalid lora apply mode %s\n", arg);
+                return -1;
+            }
+            return 1;  // one extra argv entry (the value) was consumed
+        };
+
+        auto on_tile_size_arg = [&](int argc, const char** argv, int index) {
+            if (++index >= argc) {
+                return -1;
+            }
+            std::string tile_size_str = argv[index];
+            size_t x_pos              = tile_size_str.find('x');
+            try {
+                if (x_pos != std::string::npos) {
+                    std::string tile_x_str        = tile_size_str.substr(0, x_pos);
+                    std::string tile_y_str        = tile_size_str.substr(x_pos + 1);
+                    vae_tiling_params.tile_size_x = std::stoi(tile_x_str);
+                    vae_tiling_params.tile_size_y = std::stoi(tile_y_str);
+                } else {
+                    vae_tiling_params.tile_size_x = vae_tiling_params.tile_size_y = std::stoi(tile_size_str);
+                }
+            } catch (const std::invalid_argument&) {
+                return -1;
+            } catch (const std::out_of_range&) {
+                return -1;
+            }
+            return 1;
+        };
+
+        auto on_relative_tile_size_arg = [&](int argc, const char** argv, int index) {
+            if (++index >= argc) {
+                return -1;
+            }
+            std::string rel_size_str = argv[index];
+            size_t x_pos             = rel_size_str.find('x');
+            try {
+                if (x_pos != std::string::npos) {
+                    std::string rel_x_str        = rel_size_str.substr(0, x_pos);
+                    std::string rel_y_str        = rel_size_str.substr(x_pos + 1);
+                    vae_tiling_params.rel_size_x = std::stof(rel_x_str);
+                    vae_tiling_params.rel_size_y = std::stof(rel_y_str);
+                } else {
+                    vae_tiling_params.rel_size_x = vae_tiling_params.rel_size_y = std::stof(rel_size_str);
+                }
+            } catch (const std::invalid_argument&) {
+                return -1;
+            } catch (const std::out_of_range&) {
+                return -1;
+            }
+            return 1;
+        };
+
+        options.manual_options = {
+            {"",
+             "--type",
+             "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). "
+             "If not specified, the default is the type of the weight file",
+             on_type_arg},
+            {"",
+             "--rng",
+             "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)",
+             on_rng_arg},
+            {"",
+             "--sampler-rng",
+             "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng",
+             on_sampler_rng_arg},
+            {"",
+             "--prediction",
+             "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]",
+             on_prediction_arg},
+            {"",
+             "--lora-apply-mode",
+             "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. "
+             "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used. "
+             "The immediately mode may have precision and compatibility issues with quantized parameters, "
+             "but it usually offers faster inference speed and, in some cases, lower memory usage. "
+             "The at_runtime mode, on the other hand, is exactly the opposite.",
+             on_lora_apply_mode_arg},
+            {"",
+             "--vae-tile-size",
+             "tile size for vae tiling, format [X]x[Y] (default: 32x32)",
+             on_tile_size_arg},
+            {"",
+             "--vae-relative-tile-size",
+             "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
+             on_relative_tile_size_arg},
+        };
+
+        return options;
+    }
+
+    // Scans embedding_dir (top level only) and maps each regular file with a known
+    // extension to embedding_map[<file name without extension>] = <file path>.
+    // Silently does nothing if embedding_dir does not exist or is not a directory.
+    void build_embedding_map() {
+        static const std::vector<std::string> known_exts = {".pt", ".safetensors", ".gguf"};
+
+        const bool usable_dir = fs::exists(embedding_dir) && fs::is_directory(embedding_dir);
+        if (!usable_dir) {
+            return;
+        }
+
+        // The extension must equal one of known_exts exactly (the comparison is case-sensitive).
+        auto has_known_ext = [](const fs::path& file) {
+            const std::string ext = file.extension().string();
+            for (const auto& candidate : known_exts) {
+                if (candidate == ext) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        for (auto& entry : fs::directory_iterator(embedding_dir)) {
+            const fs::path file = entry.path();
+            // A later file with the same stem overwrites the earlier entry.
+            if (entry.is_regular_file() && has_known_ext(file)) {
+                embedding_map[file.stem().string()] = file.string();
+            }
+        }
+    }
+
+    // Checks that the model arguments required by `mode` were given (printing the reason to
+    // stderr and returning false otherwise), then resolves n_threads and builds the embedding map.
+    bool process_and_check(SDMode mode) {
+        if (mode == UPSCALE) {
+            if (esrgan_path.empty()) {
+                fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n");
+                return false;
+            }
+        } else if (model_path.empty() && diffusion_model_path.empty()) {
+            fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
+            return false;
+        }
+
+        // A non-positive n_threads falls back to sd_get_num_physical_cores().
+        if (n_threads <= 0) {
+            n_threads = sd_get_num_physical_cores();
+        }
+
+        build_embedding_map();
+        return true;
+    }
+
+    // Renders all fields as a multi-line "SDContextParams { ... }" dump, one
+    // "name: value," entry per line. Strings are quoted but not escaped.
+    std::string to_string() const {
+        // Embeddings go into a nested block of "name": "path" lines.
+        std::ostringstream emb_ss;
+        emb_ss << "{\n";
+        for (auto it = embedding_map.begin(); it != embedding_map.end(); ++it) {
+            const char* line_end = std::next(it) != embedding_map.end() ? ",\n" : "\n";
+            emb_ss << "    \"" << it->first << "\": \"" << it->second << "\"" << line_end;
+        }
+        emb_ss << "  }";
+
+        // Every entry except the last one is terminated by ",\n".
+        std::ostringstream oss;
+        oss << "SDContextParams {\n"
+            << "  n_threads: " << n_threads << ",\n"
+            << "  model_path: \"" << model_path << "\",\n"
+            << "  clip_l_path: \"" << clip_l_path << "\",\n"
+            << "  clip_g_path: \"" << clip_g_path << "\",\n"
+            << "  clip_vision_path: \"" << clip_vision_path << "\",\n"
+            << "  t5xxl_path: \"" << t5xxl_path << "\",\n"
+            << "  llm_path: \"" << llm_path << "\",\n"
+            << "  llm_vision_path: \"" << llm_vision_path << "\",\n"
+            << "  diffusion_model_path: \"" << diffusion_model_path << "\",\n"
+            << "  high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
+            << "  vae_path: \"" << vae_path << "\",\n"
+            << "  taesd_path: \"" << taesd_path << "\",\n"
+            << "  esrgan_path: \"" << esrgan_path << "\",\n"
+            << "  control_net_path: \"" << control_net_path << "\",\n"
+            << "  embedding_dir: \"" << embedding_dir << "\",\n"
+            << "  embeddings: " << emb_ss.str() << ",\n"
+            << "  wtype: " << sd_type_name(wtype) << ",\n"
+            << "  tensor_type_rules: \"" << tensor_type_rules << "\",\n"
+            << "  lora_model_dir: \"" << lora_model_dir << "\",\n"
+            << "  photo_maker_path: \"" << photo_maker_path << "\",\n"
+            << "  rng_type: " << sd_rng_type_name(rng_type) << ",\n"
+            << "  sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
+            << "  flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << ",\n"
+            << "  offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
+            << "  control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
+            << "  clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
+            << "  vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
+            << "  diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
+            << "  diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
+            << "  vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n"
+            << "  chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n"
+            << "  chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n"
+            << "  chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n"
+            << "  prediction: " << sd_prediction_name(prediction) << ",\n"
+            << "  lora_apply_mode: " << sd_lora_apply_mode_name(lora_apply_mode) << ",\n"
+            << "  vae_tiling_params: { "
+            << vae_tiling_params.enabled << ", "
+            << vae_tiling_params.tile_size_x << ", "
+            << vae_tiling_params.tile_size_y << ", "
+            << vae_tiling_params.target_overlap << ", "
+            << vae_tiling_params.rel_size_x << ", "
+            << vae_tiling_params.rel_size_y << " },\n"
+            << "  force_sdxl_vae_conv_scale: " << (force_sdxl_vae_conv_scale ? "true" : "false") << "\n"
+            << "}";
+        return oss.str();
+    }
+
+    sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) {  // packs this object's settings plus the three flags into an sd_ctx_params_t
+        embedding_vec.clear();  // rebuilt from embedding_map on every call
+        embedding_vec.reserve(embedding_map.size());
+        for (const auto& kv : embedding_map) {
+            sd_embedding_t item;
+            item.name = kv.first.c_str();   // borrows the map key's storage, no copy is made
+            item.path = kv.second.c_str();  // borrows the mapped string's storage
+            embedding_vec.emplace_back(item);
+        }
+
+        sd_ctx_params_t sd_ctx_params = {  // positional initialisation: the values below must stay in sd_ctx_params_t's field order
+            model_path.c_str(),  // every c_str() below points into a std::string member of this object
+            clip_l_path.c_str(),
+            clip_g_path.c_str(),
+            clip_vision_path.c_str(),
+            t5xxl_path.c_str(),
+            llm_path.c_str(),
+            llm_vision_path.c_str(),
+            diffusion_model_path.c_str(),
+            high_noise_diffusion_model_path.c_str(),
+            vae_path.c_str(),
+            taesd_path.c_str(),
+            control_net_path.c_str(),
+            lora_model_dir.c_str(),
+            embedding_vec.data(),  // pointer into embedding_vec; invalidated by the next call to this function
+            static_cast<uint32_t>(embedding_vec.size()),
+            photo_maker_path.c_str(),
+            tensor_type_rules.c_str(),
+            vae_decode_only,
+            free_params_immediately,
+            n_threads,
+            wtype,
+            rng_type,
+            sampler_rng_type,
+            prediction,
+            lora_apply_mode,
+            offload_params_to_cpu,
+            clip_on_cpu,
+            control_net_cpu,
+            vae_on_cpu,
+            diffusion_flash_attn,
+            taesd_preview,
+            diffusion_conv_direct,
+            vae_conv_direct,
+            force_sdxl_vae_conv_scale,
+            chroma_use_dit_mask,
+            chroma_use_t5_mask,
+            chroma_t5_mask_pad,
+            flow_shift,
+        };
+        return sd_ctx_params;  // holds borrowed pointers only: use it while this object is alive and unmodified
+    }
+};
+
+// Formats v as "[a, b, c]", streaming each element with operator<<.
+// An empty vector yields "[]".
+template <typename T>
+static std::string vec_to_string(const std::vector<T>& v) {
+    std::ostringstream out;
+    out << "[";
+    for (auto it = v.begin(); it != v.end(); ++it) {
+        out << (it == v.begin() ? "" : ", ") << *it;
+    }
+    out << "]";
+    return out.str();
+}
+
+// Formats v as ["a", "b"]: each element wrapped in double quotes (not escaped), joined by ", ".
+static std::string vec_str_to_string(const std::vector<std::string>& v) {
+    std::string joined = "[";
+    for (const std::string& s : v) {
+        if (joined.size() > 1) {  // something was already appended after the '['
+            joined += ", ";
+        }
+        joined += "\"" + s + "\"";
+    }
+    joined += "]";
+    return joined;
+}
+
+// True if p begins with a drive prefix such as "C:" (Windows builds) or with '/' (all other builds).
+static bool is_absolute_path(const std::string& p) {
+#ifdef _WIN32
+    return p.length() >= 2 && p[1] == ':' && std::isalpha(static_cast<unsigned char>(p[0])) != 0;
+#else
+    return p.rfind('/', 0) == 0;
+#endif
+}
+
+struct SDGenerationParams {  // per-generation settings; filled from command-line options and/or from_json_str()
+    std::string prompt;
+    std::string prompt_with_lora; // for metadata record only
+    std::string negative_prompt;
+    int clip_skip   = -1;  // <= 0 represents unspecified
+    int width       = 512;
+    int height      = 512;
+    int batch_count = 1;
+    std::string init_image_path;
+    std::string end_image_path;
+    std::string mask_image_path;
+    std::string control_image_path;
+    std::vector<std::string> ref_image_paths;  // one entry per --ref-image occurrence
+    std::string control_video_path;
+    bool auto_resize_ref_image = true;   // cleared by --disable-auto-resize-ref-image
+    bool increase_ref_index    = false;  // set by --increase-ref-index
+
+    std::vector<int> skip_layers = {7, 8, 9};  // layers to skip for SLG steps (--skip-layers)
+    sd_sample_params_t sample_params;  // initialised by sd_sample_params_init() in the constructor
+
+    std::vector<int> high_noise_skip_layers = {7, 8, 9};  // same, for --high-noise-skip-layers
+    sd_sample_params_t high_noise_sample_params;  // initialised by sd_sample_params_init() in the constructor
+
+    std::vector<float> custom_sigmas;  // values given with --sigmas; empty otherwise
+
+    std::string easycache_option;  // raw "threshold,start_percent,end_percent" text from --easycache
+    sd_easycache_params_t easycache_params;
+
+    float moe_boundary  = 0.875f;
+    int video_frames    = 1;
+    int fps             = 16;
+    float vace_strength = 1.f;
+
+    float strength         = 0.75f;  // process_and_check() requires a value in [0.0, 1.0]
+    float control_strength = 0.9f;
+
+    int64_t seed = 42;  // per the --seed help text, a value < 0 requests a random seed
+
+    // Photo Maker
+    std::string pm_id_images_dir;
+    std::string pm_id_embed_path;
+    float pm_style_strength = 20.f;
+
+    int upscale_repeats   = 1;
+    int upscale_tile_size = 128;
+
+    std::map<std::string, float> lora_map;             // resolved LoRA path -> summed multiplier (extract_and_remove_lora)
+    std::map<std::string, float> high_noise_lora_map;  // same, for tags carrying the "|high_noise|" prefix
+    std::vector<sd_lora_t> lora_vec;                   // built from both maps; each path points at a map key
+
+    SDGenerationParams() {
+        for (sd_sample_params_t* params : {&sample_params, &high_noise_sample_params})
+            sd_sample_params_init(params);  // both sampler settings start from whatever sd_sample_params_init sets
+    }
+
+    ArgOptions get_options() {
+        ArgOptions options;
+        options.string_options = {
+            {"-p",
+             "--prompt",
+             "the prompt to render",
+             &prompt},
+            {"-n",
+             "--negative-prompt",
+             "the negative prompt (default: \"\")",
+             &negative_prompt},
+            {"-i",
+             "--init-img",
+             "path to the init image",
+             &init_image_path},
+            {"",
+             "--end-img",
+             "path to the end image, required by flf2v",
+             &end_image_path},
+            {"",
+             "--mask",
+             "path to the mask image",
+             &mask_image_path},
+            {"",
+             "--control-image",
+             "path to control image, control net",
+             &control_image_path},
+            {"",
+             "--control-video",
+             "path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
+             "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
+             "such as 00.png, 01.png, ... etc.",
+             &control_video_path},
+            {"",
+             "--pm-id-images-dir",
+             "path to PHOTOMAKER input id images dir",
+             &pm_id_images_dir},
+            {"",
+             "--pm-id-embed-path",
+             "path to PHOTOMAKER v2 id embed",
+             &pm_id_embed_path},
+        };
+
+        options.int_options = {  // entries: {short flag, long flag, help text, destination}
+            {"-H",
+             "--height",
+             "image height, in pixel space (default: 512)",
+             &height},
+            {"-W",
+             "--width",
+             "image width, in pixel space (default: 512)",
+             &width},
+            {"",
+             "--steps",
+             "number of sample steps (default: 20)",
+             &sample_params.sample_steps},
+            {"",
+             "--high-noise-steps",
+             "(high noise) number of sample steps (default: -1 = auto)",
+             &high_noise_sample_params.sample_steps},
+            {"",
+             "--clip-skip",
+             "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). "
+             "<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x",
+             &clip_skip},
+            {"-b",
+             "--batch-count",
+             "batch count",
+             &batch_count},
+            {"",
+             "--video-frames",
+             "video frames (default: 1)",
+             &video_frames},
+            {"",
+             "--fps",
+             "fps (default: 16)",
+             &fps},
+            {"",
+             "--timestep-shift",
+             "shift timestep for NitroFusion models (default: 0). "
+             "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant",
+             &sample_params.shifted_timestep},
+            {"",
+             "--upscale-repeats",
+             "Run the ESRGAN upscaler this many times (default: 1)",
+             &upscale_repeats},
+            {"",
+             "--upscale-tile-size",
+             "tile size for ESRGAN upscaling (default: 128)",
+             &upscale_tile_size},
+        };
+
+        options.float_options = {
+            {"",
+             "--cfg-scale",
+             "unconditional guidance scale: (default: 7.0)",
+             &sample_params.guidance.txt_cfg},
+            {"",
+             "--img-cfg-scale",
+             "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)",
+             &sample_params.guidance.img_cfg},
+            {"",
+             "--guidance",
+             "distilled guidance scale for models with guidance input (default: 3.5)",
+             &sample_params.guidance.distilled_guidance},
+            {"",
+             "--slg-scale",
+             "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium",
+             &sample_params.guidance.slg.scale},
+            {"",
+             "--skip-layer-start",
+             "SLG enabling point (default: 0.01)",
+             &sample_params.guidance.slg.layer_start},
+            {"",
+             "--skip-layer-end",
+             "SLG disabling point (default: 0.2)",
+             &sample_params.guidance.slg.layer_end},
+            {"",
+             "--eta",
+             "eta in DDIM, only for DDIM and TCD (default: 0)",
+             &sample_params.eta},
+            {"",
+             "--high-noise-cfg-scale",
+             "(high noise) unconditional guidance scale: (default: 7.0)",
+             &high_noise_sample_params.guidance.txt_cfg},
+            {"",
+             "--high-noise-img-cfg-scale",
+             "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)",
+             &high_noise_sample_params.guidance.img_cfg},
+            {"",
+             "--high-noise-guidance",
+             "(high noise) distilled guidance scale for models with guidance input (default: 3.5)",
+             &high_noise_sample_params.guidance.distilled_guidance},
+            {"",
+             "--high-noise-slg-scale",
+             "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)",
+             &high_noise_sample_params.guidance.slg.scale},
+            {"",
+             "--high-noise-skip-layer-start",
+             "(high noise) SLG enabling point (default: 0.01)",
+             &high_noise_sample_params.guidance.slg.layer_start},
+            {"",
+             "--high-noise-skip-layer-end",
+             "(high noise) SLG disabling point (default: 0.2)",
+             &high_noise_sample_params.guidance.slg.layer_end},
+            {"",
+             "--high-noise-eta",
+             "(high noise) eta in DDIM, only for DDIM and TCD (default: 0)",
+             &high_noise_sample_params.eta},
+            {"",
+             "--strength",
+             "strength for noising/unnoising (default: 0.75)",
+             &strength},
+            {"",
+             "--pm-style-strength",
+             "",
+             &pm_style_strength},
+            {"",
+             "--control-strength",
+             "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
+             &control_strength},
+            {"",
+             "--moe-boundary",
+             "timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1",
+             &moe_boundary},
+            {"",
+             "--vace-strength",
+             "wan vace strength",
+             &vace_strength},
+        };
+
+        options.bool_options = {
+            {"",
+             "--increase-ref-index",
+             "automatically increase the indices of references images based on the order they are listed (starting with 1).",
+             true,
+             &increase_ref_index},
+            {"",
+             "--disable-auto-resize-ref-image",
+             "disable auto resize of ref images",
+             false,
+             &auto_resize_ref_image},
+        };
+
+        // --seed handler: like its siblings, reports a bad value as -1 instead of throwing.
+        auto on_seed_arg = [&](int argc, const char** argv, int index) {
+            if (++index >= argc) return -1;
+            try { seed = std::stoll(argv[index]); }
+            catch (const std::logic_error&) { return -1; }  // std::invalid_argument or std::out_of_range
+            return 1;
+        };
+
+        // --sampling-method handler: SAMPLE_METHOD_COUNT from str_to_sample_method is treated as an invalid name.
+        auto on_sample_method_arg = [&](int argc, const char** argv, int index) {
+            if (index + 1 >= argc) {
+                return -1;
+            }
+            const char* name            = argv[index + 1];
+            sample_params.sample_method = str_to_sample_method(name);
+            if (sample_params.sample_method != SAMPLE_METHOD_COUNT) {
+                return 1;
+            }
+            fprintf(stderr, "error: invalid sample method %s\n", name);
+            return -1;
+        };
+
+        // --high-noise-sampling-method handler: SAMPLE_METHOD_COUNT is treated as an invalid name.
+        auto on_high_noise_sample_method_arg = [&](int argc, const char** argv, int index) {
+            if (index + 1 >= argc) {
+                return -1;
+            }
+            const char* name                       = argv[index + 1];
+            high_noise_sample_params.sample_method = str_to_sample_method(name);
+            if (high_noise_sample_params.sample_method != SAMPLE_METHOD_COUNT) {
+                return 1;
+            }
+            fprintf(stderr, "error: invalid high noise sample method %s\n", name);
+            return -1;
+        };
+
+        // --scheduler handler: SCHEDULER_COUNT from str_to_scheduler is treated as an invalid name.
+        auto on_scheduler_arg = [&](int argc, const char** argv, int index) {
+            if (index + 1 >= argc) {
+                return -1;
+            }
+            const char* name        = argv[index + 1];
+            sample_params.scheduler = str_to_scheduler(name);
+            if (sample_params.scheduler != SCHEDULER_COUNT) {
+                return 1;
+            }
+            fprintf(stderr, "error: invalid scheduler %s\n", name);
+            return -1;
+        };
+
+        // Parses a "[a,b,c]" layer list (entries separated by commas and/or spaces) into
+        // `out`. Returns false, leaving `out` untouched, if the brackets are missing or
+        // an entry is not an int that std::stoi accepts. Empty entries are skipped, so
+        // "[]" yields an empty list and "[ 7, 8 ]" parses like "[7,8]".
+        auto parse_layer_list = [](const std::string& text, std::vector<int>& out) {
+            // size() < 2 also rejects "", for which text[text.size() - 1] would be out of bounds.
+            if (text.size() < 2 || text.front() != '[' || text.back() != ']') {
+                return false;
+            }
+            const std::string inner = text.substr(1, text.size() - 2);
+
+            // Split on runs of commas/spaces; submatch index -1 selects the text between matches.
+            static const std::regex separators("[, ]+");
+            std::sregex_token_iterator token(inner.begin(), inner.end(), separators, -1);
+            const std::sregex_token_iterator end;
+            std::vector<int> layers;
+            for (; token != end; ++token) {
+                const std::string entry = token->str();
+                if (entry.empty()) {
+                    // e.g. the token in front of a leading separator, or the whole of "".
+                    continue;
+                }
+                try {
+                    layers.push_back(std::stoi(entry));
+                } catch (const std::invalid_argument&) {
+                    return false;
+                } catch (const std::out_of_range&) {
+                    return false;
+                }
+            }
+
+            // Commit only after every entry has been parsed.
+            out = layers;
+            return true;
+        };
+
+        // parse_layer_list is captured by value: the handlers are handed to the returned
+        // ArgOptions and must not refer to a local variable of get_options().
+        // --skip-layers handler: replaces skip_layers on success, returns -1 on any error.
+        auto on_skip_layers_arg = [&, parse_layer_list](int argc, const char** argv, int index) {
+            if (++index >= argc) {
+                return -1;
+            }
+            return parse_layer_list(argv[index], skip_layers) ? 1 : -1;
+        };
+
+        // --high-noise-skip-layers handler: same as above, for high_noise_skip_layers.
+        auto on_high_noise_skip_layers_arg = [&, parse_layer_list](int argc, const char** argv, int index) {
+            if (++index >= argc) {
+                return -1;
+            }
+            return parse_layer_list(argv[index], high_noise_skip_layers) ? 1 : -1;
+        };
+
+        // --sigmas handler: appends the comma-separated floats of the next argument to
+        // custom_sigmas; one leading '[' and one trailing ']' are stripped, blank entries skipped.
+        auto on_sigmas_arg = [&](int argc, const char** argv, int index) {
+            if (index + 1 >= argc) {
+                return -1;
+            }
+            std::string list = argv[index + 1];
+            if (!list.empty() && list[0] == '[') {
+                list = list.substr(1);
+            }
+            if (!list.empty() && list[list.size() - 1] == ']') {
+                list.resize(list.size() - 1);
+            }
+            const char* blanks = " \t\n\r\f\v";
+            std::stringstream entries(list);
+            for (std::string entry; std::getline(entries, entry, ',');) {
+                const size_t first = entry.find_first_not_of(blanks);
+                if (first == std::string::npos)
+                    continue;  // empty or whitespace-only entry
+                entry = entry.substr(first, entry.find_last_not_of(blanks) - first + 1);
+                try {
+                    custom_sigmas.push_back(std::stof(entry));
+                } catch (const std::invalid_argument&) {
+                    fprintf(stderr, "error: invalid float value '%s' in --sigmas\n", entry.c_str());
+                    return -1;
+                } catch (const std::out_of_range&) {
+                    fprintf(stderr, "error: float value '%s' out of range in --sigmas\n", entry.c_str());
+                    return -1;
+                }
+            }
+            if (custom_sigmas.empty() && !list.empty()) {
+                fprintf(stderr, "error: could not parse any sigma values from '%s'\n", argv[index + 1]);
+                return -1;
+            }
+            return 1;
+        };
+
+        // --ref-image handler: every occurrence appends one more path to ref_image_paths.
+        auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
+            if (index + 1 >= argc)
+                return -1;
+            ref_image_paths.emplace_back(argv[index + 1]);
+            return 1;
+        };
+
+        // --easycache handler. The value is optional: the following argument is taken only
+        // when it is non-empty and either does not start with '-' or continues with a digit
+        // or '.' (e.g. "-0.5"). Returns how many extra arguments were consumed (0 or 1).
+        auto on_easycache_arg = [&](int argc, const char** argv, int index) {
+            const std::string fallback = "0.2,0.15,0.95";
+            auto is_value_token        = [](const std::string& tok) {
+                if (tok.empty()) {
+                    return false;
+                }
+                if (tok.front() != '-') {
+                    return true;
+                }
+                return tok.size() > 1 &&
+                       (std::isdigit(static_cast<unsigned char>(tok[1])) || tok[1] == '.');
+            };
+
+            // Only peek at the next argument; a token that looks like an option is left alone.
+            int consumed = 0;
+            std::string value;
+            if (index + 1 < argc && is_value_token(argv[index + 1])) {
+                value    = argv_to_utf8(index + 1, argv);
+                consumed = 1;
+            }
+
+            // An empty converted value also falls back to the default triple.
+            if (value.empty()) {
+                value = fallback;
+            }
+            easycache_option = value;
+            return consumed;
+        };
+
+        options.manual_options = {
+            {"-s",
+             "--seed",
+             "RNG seed (default: 42, use random seed for < 0)",
+             on_seed_arg},
+            {"",
+             "--sampling-method",
+             "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] "
+             "(default: euler for Flux/SD3/Wan, euler_a otherwise)",
+             on_sample_method_arg},
+            {"",
+             "--high-noise-sampling-method",
+             "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]"
+             " default: euler for Flux/SD3/Wan, euler_a otherwise",
+             on_high_noise_sample_method_arg},
+            {"",
+             "--scheduler",
+             "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete",
+             on_scheduler_arg},
+            {"",
+             "--sigmas",
+             "custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").",
+             on_sigmas_arg},
+            {"",
+             "--skip-layers",
+             "layers to skip for SLG steps (default: [7,8,9])",
+             on_skip_layers_arg},
+            {"",
+             "--high-noise-skip-layers",
+             "(high noise) layers to skip for SLG steps (default: [7,8,9])",
+             on_high_noise_skip_layers_arg},
+            {"-r",
+             "--ref-image",
+             "reference image for Flux Kontext models (can be used multiple times)",
+             on_ref_image_arg},
+            {"",
+             "--easycache",
+             "enable EasyCache for DiT models with optional \"threshold,start_percent,end_percent\" (default: 0.2,0.15,0.95)",
+             on_easycache_arg},
+
+        };
+
+        return options;
+    }
+
+    // Overrides fields from a JSON object given as text. Returns false only when the
+    // text is not valid JSON; missing keys and values of the wrong type are skipped.
+    bool from_json_str(const std::string& json_str) {
+        json j;
+        try {
+            j = json::parse(json_str);
+        } catch (...) {
+            fprintf(stderr, "json parse failed %s\n", json_str.c_str());
+            return false;
+        }
+
+        // Copies j[key] into `out` when the key exists and its JSON type fits out's C++ type.
+        // Converting array elements can still throw (e.g. a string inside "skip_layers");
+        // that is reported and the field skipped instead of letting the exception escape.
+        auto load_if_exists = [&](const char* key, auto& out) {
+            using T = std::decay_t<decltype(out)>;
+            if (!j.contains(key))
+                return;
+            const json& value = j[key];
+            const bool type_matches =
+                std::is_same_v<T, std::string>                           ? value.is_string()
+                : (std::is_same_v<T, int> || std::is_same_v<T, int64_t>) ? value.is_number_integer()
+                : std::is_same_v<T, float>                               ? value.is_number()
+                : std::is_same_v<T, bool>                                ? value.is_boolean()
+                                                                         : value.is_array();  // the vector fields
+            if (!type_matches)
+                return;
+            try {
+                out = value.get<T>();
+            } catch (const std::exception& e) {
+                fprintf(stderr, "ignoring json field \"%s\": %s\n", key, e.what());
+            }
+        };
+
+        load_if_exists("prompt", prompt);
+        load_if_exists("negative_prompt", negative_prompt);
+        load_if_exists("easycache_option", easycache_option);
+
+        load_if_exists("clip_skip", clip_skip);
+        load_if_exists("width", width);
+        load_if_exists("height", height);
+        load_if_exists("batch_count", batch_count);
+        load_if_exists("video_frames", video_frames);
+        load_if_exists("fps", fps);
+        load_if_exists("upscale_repeats", upscale_repeats);
+        load_if_exists("seed", seed);
+
+        load_if_exists("strength", strength);
+        load_if_exists("control_strength", control_strength);
+        load_if_exists("pm_style_strength", pm_style_strength);
+        load_if_exists("moe_boundary", moe_boundary);
+        load_if_exists("vace_strength", vace_strength);
+
+        load_if_exists("auto_resize_ref_image", auto_resize_ref_image);
+        load_if_exists("increase_ref_index", increase_ref_index);
+
+        load_if_exists("skip_layers", skip_layers);
+        load_if_exists("high_noise_skip_layers", high_noise_skip_layers);
+
+        load_if_exists("cfg_scale", sample_params.guidance.txt_cfg);
+        load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg);
+        load_if_exists("guidance", sample_params.guidance.distilled_guidance);
+
+        return true;
+    }
+
+    // Strips every "<lora:NAME:MULTIPLIER>" tag from `prompt`. NAME is used as given if
+    // is_absolute_path(), else taken relative to lora_model_dir; if that file does not
+    // exist, ".pt", ".safetensors" and ".gguf" are tried as suffixes. The multiplier is
+    // added to the path's entry in lora_map (high_noise_lora_map for "|high_noise|NAME").
+    // Unusable tags are just dropped. Does nothing when lora_model_dir is empty.
+    // NOTE(review): NAME ends at the first ':', so a drive-letter path ("C:/x") seems to
+    // be split at the drive colon and its tag dropped — confirm whether that is intended.
+    void extract_and_remove_lora(const std::string& lora_model_dir) {
+        if (lora_model_dir.empty()) {
+            return;
+        }
+        static const std::regex re(R"(<lora:([^:>]+):([^>]+)>)");
+        static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};
+        static const std::string high_noise_prefix      = "|high_noise|";
+
+        std::string rest = prompt;  // part of the original prompt not scanned yet
+        std::smatch m;
+        while (std::regex_search(rest, m, re)) {
+            std::string raw_path      = m[1].str();
+            const std::string raw_mul = m[2].str();
+            const std::string tail    = m.suffix().str();
+
+            // Every tag is removed from the prompt whether or not it turns out to be
+            // usable. `m` refers into `rest`, so it must not be used after this point.
+            prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only);
+            rest   = tail;
+
+            float mul = 0.f;
+            try {
+                mul = std::stof(raw_mul);
+            } catch (const std::exception&) {
+                continue;  // multiplier is not a usable number
+            }
+
+            const bool is_high_noise = raw_path.rfind(high_noise_prefix, 0) == 0;
+            if (is_high_noise) {
+                raw_path.erase(0, high_noise_prefix.size());
+            }
+
+            fs::path final_path = is_absolute_path(raw_path) ? fs::path(raw_path)
+                                                             : fs::path(lora_model_dir) / raw_path;
+            if (!fs::exists(final_path)) {
+                // Retry with each known extension appended to the name.
+                bool found = false;
+                for (const auto& ext : valid_ext) {
+                    fs::path try_path = final_path;
+                    try_path += ext;
+                    if (fs::exists(try_path)) {
+                        final_path = try_path;
+                        found      = true;
+                        break;
+                    }
+                }
+                if (!found) {
+                    printf("can not found lora %s\n", final_path.lexically_normal().string().c_str());
+                    continue;
+                }
+            }
+
+            const std::string key = final_path.lexically_normal().string();
+            (is_high_noise ? high_noise_lora_map : lora_map)[key] += mul;
+        }
+
+        // Start from an empty vector so that calling this again cannot append duplicate
+        // entries (to_sd_ctx_params_t does the same for embedding_vec). item.path borrows
+        // the map key's storage.
+        lora_vec.clear();
+        lora_vec.reserve(lora_map.size() + high_noise_lora_map.size());
+        for (const auto& kv : lora_map) {
+            sd_lora_t item;
+            item.is_high_noise = false;
+            item.path          = kv.first.c_str();
+            item.multiplier    = kv.second;
+            lora_vec.emplace_back(item);
+        }
+        for (const auto& kv : high_noise_lora_map) {
+            sd_lora_t item;
+            item.is_high_noise = true;
+            item.path          = kv.first.c_str();
+            item.multiplier    = kv.second;
+            lora_vec.emplace_back(item);
+        }
+    }
+
+    // Validate the generation settings and derive the fields that depend on them.
+    // Prints the reason to stderr and returns false on the first invalid setting; returns true otherwise.
+    bool process_and_check(SDMode mode, const std::string& lora_model_dir) {
+        prompt_with_lora = prompt;  // snapshot taken before extract_and_remove_lora() edits `prompt`
+        if (width <= 0) {
+            fprintf(stderr, "error: the width must be greater than 0\n");
+            return false;
+        }
+
+        if (height <= 0) {
+            fprintf(stderr, "error: the height must be greater than 0\n");
+            return false;
+        }
+
+        if (sample_params.sample_steps <= 0) {
+            fprintf(stderr, "error: the sample_steps must be greater than 0\n");
+            return false;
+        }
+
+        if (high_noise_sample_params.sample_steps <= 0) {
+            high_noise_sample_params.sample_steps = -1;  // every non-positive value is normalized to -1
+        }
+
+        if (strength < 0.f || strength > 1.f) {
+            fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
+            return false;
+        }
+
+        if (!easycache_option.empty()) {
+            // Expected form: "threshold,start,end" -- three floats; whitespace around each one is ignored.
+            float values[3] = {0.0f, 0.0f, 0.0f};
+            std::stringstream ss(easycache_option);
+            std::string token;
+            int idx = 0;
+            while (std::getline(ss, token, ',')) {
+                const auto first = token.find_first_not_of(" \t\r\n");
+                const auto last  = token.find_last_not_of(" \t\r\n");
+                token            = (first == std::string::npos) ? std::string() : token.substr(first, last - first + 1);
+                if (token.empty()) {
+                    fprintf(stderr, "error: invalid easycache option '%s'\n", easycache_option.c_str());
+                    return false;
+                }
+                if (idx >= 3) {
+                    fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n");
+                    return false;
+                }
+                try {
+                    values[idx] = std::stof(token);
+                } catch (const std::exception&) {
+                    fprintf(stderr, "error: invalid easycache value '%s'\n", token.c_str());
+                    return false;
+                }
+                idx++;
+            }
+            if (idx != 3) {
+                fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n");
+                return false;
+            }
+            if (values[0] < 0.0f) {
+                fprintf(stderr, "error: easycache threshold must be non-negative\n");
+                return false;
+            }
+            if (values[1] < 0.0f || values[1] >= 1.0f || values[2] <= 0.0f || values[2] > 1.0f || values[1] >= values[2]) {
+                fprintf(stderr, "error: easycache start/end percents must satisfy 0.0 <= start < end <= 1.0\n");
+                return false;
+            }
+            easycache_params.enabled         = true;
+            easycache_params.reuse_threshold = values[0];
+            easycache_params.start_percent   = values[1];
+            easycache_params.end_percent     = values[2];
+        } else {
+            easycache_params.enabled = false;
+        }
+
+        // The C-style parameter structs only borrow these buffers: the vectors must not be resized afterwards.
+        sample_params.guidance.slg.layers                 = skip_layers.data();
+        sample_params.guidance.slg.layer_count            = skip_layers.size();
+        sample_params.custom_sigmas                       = custom_sigmas.data();
+        sample_params.custom_sigmas_count                 = static_cast<int>(custom_sigmas.size());
+        high_noise_sample_params.guidance.slg.layers      = high_noise_skip_layers.data();
+        high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
+
+        if (mode == VID_GEN && video_frames <= 0) {
+            fprintf(stderr, "error: the video_frames must be greater than 0\n");
+            return false;
+        }
+
+        if (mode == VID_GEN && fps <= 0) {
+            fprintf(stderr, "error: the fps must be greater than 0\n");
+            return false;
+        }
+
+        if (sample_params.shifted_timestep < 0 || sample_params.shifted_timestep > 1000) {
+            fprintf(stderr, "error: the shifted_timestep must be in [0, 1000]\n");
+            return false;
+        }
+
+        if (upscale_repeats < 1) {
+            fprintf(stderr, "error: the upscale_repeats must be at least 1\n");
+            return false;
+        }
+
+        if (upscale_tile_size < 1) {
+            fprintf(stderr, "error: the upscale_tile_size must be at least 1\n");
+            return false;
+        }
+
+        if (mode == UPSCALE && init_image_path.length() == 0) {
+            fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n");
+            return false;
+        }
+
+        // A negative seed requests a random one.
+        if (seed < 0) {
+            srand((int)time(nullptr));
+            seed = rand();
+        }
+
+        extract_and_remove_lora(lora_model_dir);
+
+        return true;
+    }
+
+    // Render the parameters as a multi-line text dump that starts with "SDGenerationParams {".
+    // The two LoRA maps are printed as brace-delimited "path": "multiplier" lists; the sample
+    // parameter blocks are rendered by sd_sample_params_to_str().
+    std::string to_string() const {
+        // Both buffers come from sd_sample_params_to_str() and are released with free() at the end.
+        char* sample_params_str            = sd_sample_params_to_str(&sample_params);
+        char* high_noise_sample_params_str = sd_sample_params_to_str(&high_noise_sample_params);
+
+        // Formats one LoRA map; every entry except the last is followed by a comma.
+        const auto dump_lora_map = [](const auto& loras) {
+            std::ostringstream buf;
+            buf << "{\n";
+            for (auto entry = loras.begin(); entry != loras.end(); ++entry) {
+                const bool is_last = std::next(entry) == loras.end();
+                buf << "    \"" << entry->first << "\": \"" << entry->second << "\"" << (is_last ? "" : ",") << "\n";
+            }
+            buf << "  }";
+            return buf.str();
+        };
+        const std::string loras_str            = dump_lora_map(lora_map);
+        const std::string high_noise_loras_str = dump_lora_map(high_noise_lora_map);
+
+        std::ostringstream oss;
+        // LoRA tables and prompts
+        oss << "SDGenerationParams {\n"
+            << "  loras: \"" << loras_str << "\",\n"
+            << "  high_noise_loras: \"" << high_noise_loras_str << "\",\n"
+            << "  prompt: \"" << prompt << "\",\n"
+            << "  negative_prompt: \"" << negative_prompt << "\",\n";
+        // geometry, input images and PhotoMaker settings
+        oss << "  clip_skip: " << clip_skip << ",\n"
+            << "  width: " << width << ",\n"
+            << "  height: " << height << ",\n"
+            << "  batch_count: " << batch_count << ",\n"
+            << "  init_image_path: \"" << init_image_path << "\",\n"
+            << "  end_image_path: \"" << end_image_path << "\",\n"
+            << "  mask_image_path: \"" << mask_image_path << "\",\n"
+            << "  control_image_path: \"" << control_image_path << "\",\n"
+            << "  ref_image_paths: " << vec_str_to_string(ref_image_paths) << ",\n"
+            << "  control_video_path: \"" << control_video_path << "\",\n"
+            << "  auto_resize_ref_image: " << (auto_resize_ref_image ? "true" : "false") << ",\n"
+            << "  increase_ref_index: " << (increase_ref_index ? "true" : "false") << ",\n"
+            << "  pm_id_images_dir: \"" << pm_id_images_dir << "\",\n"
+            << "  pm_id_embed_path: \"" << pm_id_embed_path << "\",\n"
+            << "  pm_style_strength: " << pm_style_strength << ",\n";
+        // sampling
+        oss << "  skip_layers: " << vec_to_string(skip_layers) << ",\n"
+            << "  sample_params: " << sample_params_str << ",\n"
+            << "  high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n"
+            << "  high_noise_sample_params: " << high_noise_sample_params_str << ",\n"
+            << "  custom_sigmas: " << vec_to_string(custom_sigmas) << ",\n";
+        // easycache
+        oss << "  easycache_option: \"" << easycache_option << "\",\n"
+            << "  easycache: "
+            << (easycache_params.enabled ? "enabled" : "disabled")
+            << " (threshold=" << easycache_params.reuse_threshold
+            << ", start=" << easycache_params.start_percent
+            << ", end=" << easycache_params.end_percent << "),\n";
+        // remaining scalars
+        oss << "  moe_boundary: " << moe_boundary << ",\n"
+            << "  video_frames: " << video_frames << ",\n"
+            << "  fps: " << fps << ",\n"
+            << "  vace_strength: " << vace_strength << ",\n"
+            << "  strength: " << strength << ",\n"
+            << "  control_strength: " << control_strength << ",\n"
+            << "  seed: " << seed << ",\n"
+            << "  upscale_repeats: " << upscale_repeats << ",\n"
+            << "  upscale_tile_size: " << upscale_tile_size << ",\n"
+            << "}";
+
+        // Everything has been copied into `oss`, so the C strings can go.
+        free(sample_params_str);
+        free(high_noise_sample_params_str);
+        return oss.str();
+    }
+};
+
+static std::string version_string() {  // "stable-diffusion.cpp version <sd_version()>, commit <sd_commit()>"
+    return std::string("stable-diffusion.cpp version ").append(sd_version()).append(", commit ").append(sd_commit());
+}
+
+// Decode an image from a file path, or from an encoded buffer of `len` bytes when `from_memory` is true,
+// into 8-bit pixels with `expected_channel` components each. When expected_width and expected_height are both
+// positive and differ from the decoded size, the image is center-cropped to the target aspect ratio and resized
+// to exactly that size. Returns nullptr (reason on stderr) on failure; otherwise the caller free()s the result.
+uint8_t* load_image_common(bool from_memory,
+                           const char* image_path_or_bytes,
+                           int len,
+                           int& width,
+                           int& height,
+                           int expected_width   = 0,
+                           int expected_height  = 0,
+                           int expected_channel = 3) {
+    int c                  = 0;  // channel count reported by stb_image for the source image
+    const char* image_path = from_memory ? "memory" : image_path_or_bytes;  // label used in log messages
+    uint8_t* image_buffer  = nullptr;
+    if (from_memory) {
+        image_buffer = (uint8_t*)stbi_load_from_memory((const stbi_uc*)image_path_or_bytes, len, &width, &height, &c, expected_channel);
+    } else {
+        image_buffer = (uint8_t*)stbi_load(image_path_or_bytes, &width, &height, &c, expected_channel);
+    }
+    if (image_buffer == nullptr) {
+        fprintf(stderr, "load image from '%s' failed\n", image_path);
+        return nullptr;
+    }
+    if (c < expected_channel) {
+        fprintf(stderr,
+                "the number of channels for the input image must be >= %d,"
+                "but got %d channels, image_path = %s\n", expected_channel, c, image_path);
+        free(image_buffer);
+        return nullptr;
+    }
+    if (width <= 0) {
+        fprintf(stderr, "error: the width of image must be greater than 0, image_path = %s\n", image_path);
+        free(image_buffer);
+        return nullptr;
+    }
+    if (height <= 0) {
+        fprintf(stderr, "error: the height of image must be greater than 0, image_path = %s\n", image_path);
+        free(image_buffer);
+        return nullptr;
+    }
+
+    // Resize input image ...
+    if ((expected_width > 0 && expected_height > 0) && (height != expected_height || width != expected_width)) {
+        const size_t channels = (size_t)expected_channel;  // byte math in size_t: int products overflow on large images
+        float dst_aspect = (float)expected_width / (float)expected_height;
+        float src_aspect = (float)width / (float)height;
+        int crop_x = 0, crop_y = 0;
+        int crop_w = width, crop_h = height;
+        if (src_aspect > dst_aspect) {
+            crop_w = (int)(height * dst_aspect);
+            crop_w = crop_w < 1 ? 1 : crop_w;  // extreme ratios would otherwise truncate to an empty crop
+            crop_x = (width - crop_w) / 2;
+        } else if (src_aspect < dst_aspect) {
+            crop_h = (int)(width / dst_aspect);
+            crop_h = crop_h < 1 ? 1 : crop_h;
+            crop_y = (height - crop_h) / 2;
+        }
+        if (crop_x != 0 || crop_y != 0) {
+            printf("crop input image from %dx%d to %dx%d, image_path = %s\n", width, height, crop_w, crop_h, image_path);
+            const size_t src_row_bytes    = (size_t)width * channels;
+            const size_t crop_row_bytes   = (size_t)crop_w * channels;
+            uint8_t* cropped_image_buffer = (uint8_t*)malloc(crop_row_bytes * (size_t)crop_h);
+            if (cropped_image_buffer == nullptr) {
+                fprintf(stderr, "error: allocate memory for crop\n");
+                free(image_buffer);
+                return nullptr;
+            }
+            for (int row = 0; row < crop_h; row++) {
+                const uint8_t* src = image_buffer + (size_t)(crop_y + row) * src_row_bytes + (size_t)crop_x * channels;
+                uint8_t* dst       = cropped_image_buffer + (size_t)row * crop_row_bytes;
+                memcpy(dst, src, crop_row_bytes);
+            }
+            width  = crop_w;
+            height = crop_h;
+            free(image_buffer);
+            image_buffer = cropped_image_buffer;
+        }
+
+        printf("resize input image from %dx%d to %dx%d\n", width, height, expected_width, expected_height);
+        uint8_t* resized_image_buffer = (uint8_t*)malloc((size_t)expected_height * (size_t)expected_width * channels);
+        if (resized_image_buffer == nullptr) {
+            fprintf(stderr, "error: allocate memory for resize input image\n");
+            free(image_buffer);
+            return nullptr;
+        }
+        const int resized_ok = stbir_resize(image_buffer, width, height, 0,
+                                            resized_image_buffer, expected_width, expected_height, 0, STBIR_TYPE_UINT8,
+                                            expected_channel, STBIR_ALPHA_CHANNEL_NONE, 0, STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
+                                            STBIR_FILTER_BOX, STBIR_FILTER_BOX, STBIR_COLORSPACE_SRGB, nullptr);
+        free(image_buffer);  // the source pixels are not needed whether or not the resize worked
+        if (!resized_ok) {
+            fprintf(stderr, "error: resize input image failed, image_path = %s\n", image_path);
+            free(resized_image_buffer);
+            return nullptr;
+        }
+        width        = expected_width;
+        height       = expected_height;
+        image_buffer = resized_image_buffer;
+    }
+    return image_buffer;
+}
+
+// Load the image stored at `image_path` by forwarding to load_image_common() in file mode.
+uint8_t* load_image_from_file(const char* image_path, int& width, int& height,
+                              int expected_width   = 0,
+                              int expected_height  = 0,
+                              int expected_channel = 3) {
+    const bool from_memory = false;  // the byte-length argument is only read in memory mode, so 0 is passed
+    return load_image_common(from_memory, image_path, 0, width, height, expected_width, expected_height, expected_channel);
+}
+
+// Decode an encoded image held in memory (`len` bytes at `image_bytes`) via load_image_common().
+uint8_t* load_image_from_memory(const char* image_bytes, int len,
+                                int& width, int& height,
+                                int expected_width   = 0,
+                                int expected_height  = 0,
+                                int expected_channel = 3) {
+    const bool from_memory = true;
+    return load_image_common(from_memory, image_bytes, len, width, height, expected_width, expected_height, expected_channel);
+}
diff --git a/otherarch/sdcpp/conditioner.hpp b/otherarch/sdcpp/conditioner.hpp
index 403120d9b..45db314b9 100644
--- a/otherarch/sdcpp/conditioner.hpp
+++ b/otherarch/sdcpp/conditioner.hpp
@@ -56,20 +56,26 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     std::shared_ptr<CLIPTextModelRunner> text_model2;
 
     std::string trigger_word = "img";  // should be user settable
-    std::string embd_dir;
+    std::map<std::string, std::string> embedding_map;
     int32_t num_custom_embeddings   = 0;
     int32_t num_custom_embeddings_2 = 0;
     std::vector<uint8_t> token_embed_custom;
-    std::vector<std::string> readed_embeddings;
+    std::map<std::string, std::pair<int, int>> embedding_pos_map;
 
     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
                                       bool offload_params_to_cpu,
                                       const String2TensorStorage& tensor_storage_map,
-                                      const std::string& embd_dir,
+                                      const std::map<std::string, std::string>& orig_embedding_map,
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv      = PM_VERSION_1)
-        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
-        bool force_clip_f32 = embd_dir.size() > 0;
+        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) {
+        for (const auto& kv : orig_embedding_map) {
+            std::string name = kv.first;
+            std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); });
+            embedding_map[name] = kv.second;
+            tokenizer.add_special_token(name);
+        }
+        bool force_clip_f32 = !embedding_map.empty();
         if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
@@ -117,14 +123,17 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     }
 
     bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
-        // the order matters
         ModelLoader model_loader;
         if (!model_loader.init_from_file_and_convert_name(embd_path)) {
             LOG_ERROR("embedding '%s' failed", embd_name.c_str());
             return false;
         }
-        if (std::find(readed_embeddings.begin(), readed_embeddings.end(), embd_name) != readed_embeddings.end()) {
+        auto iter = embedding_pos_map.find(embd_name);
+        if (iter != embedding_pos_map.end()) {
             LOG_DEBUG("embedding already read in: %s", embd_name.c_str());
+            for (int i = iter->second.first; i < iter->second.second; i++) {
+                bpe_tokens.push_back(text_model->model.vocab_size + i);
+            }
             return true;
         }
         struct ggml_init_params params;
@@ -155,7 +164,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             return true;
         };
         model_loader.load_tensors(on_load, 1);
-        readed_embeddings.push_back(embd_name);
+        int pos_start = num_custom_embeddings;
         if (embd) {
             int64_t hidden_size = text_model->model.hidden_size;
             token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
@@ -182,6 +191,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
             LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2);
         }
+        int pos_end = num_custom_embeddings;
+        if (pos_end == pos_start) {
+            return false;
+        }
+        embedding_pos_map[embd_name] = std::pair{pos_start, pos_end};
         return true;
     }
 
@@ -196,25 +210,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
     std::vector<int> convert_token_to_id(std::string text) {
         auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+            auto iter = embedding_map.find(str);
+            if (iter == embedding_map.end()) {
+                return false;
             }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
-            }
-            if (embd_path.size() > 0) {
-                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
-                    return true;
-                }
+            std::string embedding_path = iter->second;
+            if (load_embedding(str, embedding_path, bpe_tokens)) {
+                return true;
             }
             return false;
         };
@@ -245,25 +247,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         }
 
         auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+            auto iter = embedding_map.find(str);
+            if (iter == embedding_map.end()) {
+                return false;
             }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
-            }
-            if (embd_path.size() > 0) {
-                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
-                    return true;
-                }
+            std::string embedding_path = iter->second;
+            if (load_embedding(str, embedding_path, bpe_tokens)) {
+                return true;
             }
             return false;
         };
@@ -376,25 +366,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         }
 
         auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
-            size_t word_end       = str.find(",");
-            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
-            embd_name             = trim(embd_name);
-            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+            auto iter = embedding_map.find(str);
+            if (iter == embedding_map.end()) {
+                return false;
             }
-            if (embd_path.size() == 0) {
-                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
-            }
-            if (embd_path.size() > 0) {
-                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
-                    if (word_end != std::string::npos) {
-                        str = str.substr(word_end);
-                    } else {
-                        str = "";
-                    }
-                    return true;
-                }
+            std::string embedding_path = iter->second;
+            if (load_embedding(str, embedding_path, bpe_tokens)) {
+                return true;
             }
             return false;
         };
@@ -1638,7 +1616,7 @@ struct LLMEmbedder : public Conditioner {
         LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL;
         if (sd_version_is_flux2(version)) {
             arch = LLM::LLMArch::MISTRAL_SMALL_3_2;
-        } else if (sd_version_is_z_image(version)) {
+        } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE) {
             arch = LLM::LLMArch::QWEN3;
         }
         if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) {
@@ -1728,6 +1706,7 @@ struct LLMEmbedder : public Conditioner {
         std::vector<std::pair<int, ggml_tensor*>> image_embeds;
         std::pair<int, int> prompt_attn_range;
         int prompt_template_encode_start_idx = 34;
+        int max_length                       = 0;
         std::set<int> out_layers;
         if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
             LOG_INFO("QwenImageEditPlusPipeline");
@@ -1825,6 +1804,17 @@ struct LLMEmbedder : public Conditioner {
             prompt_attn_range.second = prompt.size();
 
             prompt += "[/INST]";
+        } else if (version == VERSION_OVIS_IMAGE) {
+            prompt_template_encode_start_idx = 28;
+            max_length                       = prompt_template_encode_start_idx + 256;
+
+            prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:";
+
+            prompt_attn_range.first = static_cast<int>(prompt.size());
+            prompt += " " + conditioner_params.text;
+            prompt_attn_range.second = static_cast<int>(prompt.size());
+
+            prompt += "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";
         } else {
             prompt_template_encode_start_idx = 34;
 
@@ -1837,7 +1827,7 @@ struct LLMEmbedder : public Conditioner {
             prompt += "<|im_end|>\n<|im_start|>assistant\n";
         }
 
-        auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false);
+        auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0);
         auto& tokens            = std::get<0>(tokens_and_weights);
         auto& weights           = std::get<1>(tokens_and_weights);
 
@@ -1870,9 +1860,13 @@ struct LLMEmbedder : public Conditioner {
 
         GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
 
-        int64_t zero_pad_len = 0;
+        int64_t min_length = 0;
         if (sd_version_is_flux2(version)) {
-            int64_t min_length = 512;
+            min_length = 512;
+        }
+
+        int64_t zero_pad_len = 0;
+        if (min_length > 0) {
             if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) {
                 zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx;
             }
@@ -1892,6 +1886,8 @@ struct LLMEmbedder : public Conditioner {
             ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
         });
 
+        // print_ggml_tensor(new_hidden_states);
+
         int64_t t1 = ggml_time_ms();
         LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
         return {new_hidden_states, nullptr, nullptr};
diff --git a/otherarch/sdcpp/esrgan.hpp b/otherarch/sdcpp/esrgan.hpp
index 4cac95686..961e84f89 100644
--- a/otherarch/sdcpp/esrgan.hpp
+++ b/otherarch/sdcpp/esrgan.hpp
@@ -156,9 +156,10 @@ struct ESRGAN : public GGMLRunner {
 
     ESRGAN(ggml_backend_t backend,
            bool offload_params_to_cpu,
+           int tile_size                                  = 128,
            const String2TensorStorage& tensor_storage_map = {})
         : GGMLRunner(backend, offload_params_to_cpu) {
-        // rrdb_net will be created in load_from_file
+        this->tile_size = tile_size;
     }
 
     std::string get_desc() override {
diff --git a/otherarch/sdcpp/flux.hpp b/otherarch/sdcpp/flux.hpp
index f0c65e3d7..1df2874ae 100644
--- a/otherarch/sdcpp/flux.hpp
+++ b/otherarch/sdcpp/flux.hpp
@@ -134,6 +134,54 @@ namespace Flux {
         }
     };
 
+    // Two-layer feed-forward block: Linear "0" -> activation -> Linear "2".
+    // With use_mlp_silu_act the first Linear is twice as wide and ggml_ext_silu_act is applied;
+    // otherwise the activation is ggml_gelu_inplace.
+    struct MLP : public UnaryBlock {
+        bool use_mlp_silu_act;
+
+    public:
+        MLP(int64_t hidden_size, int64_t intermediate_size, bool use_mlp_silu_act = false, bool bias = false)
+            : use_mlp_silu_act(use_mlp_silu_act) {
+            // The second Linear always takes intermediate_size inputs, so only the first one is widened.
+            const int64_t first_out = use_mlp_silu_act ? intermediate_size * 2 : intermediate_size;
+            blocks["0"]             = std::make_shared<Linear>(hidden_size, first_out, bias);
+            blocks["2"]             = std::make_shared<Linear>(intermediate_size, hidden_size, bias);
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+            auto fc_in  = std::dynamic_pointer_cast<Linear>(blocks["0"]);
+            auto fc_out = std::dynamic_pointer_cast<Linear>(blocks["2"]);
+
+            struct ggml_tensor* hidden = fc_in->forward(ctx, x);
+            hidden = use_mlp_silu_act ? ggml_ext_silu_act(ctx->ggml_ctx, hidden)
+                                      : ggml_gelu_inplace(ctx->ggml_ctx, hidden);
+            return fc_out->forward(ctx, hidden);
+        }
+    };
+
+    // Gated MLP: down_proj(up_proj(x) * ggml_silu_inplace(gate_proj(x))); all three projections are Linear blocks.
+    struct YakMLP : public UnaryBlock {
+    public:
+        YakMLP(int64_t hidden_size, int64_t intermediate_size, bool bias = true) {
+            blocks["gate_proj"] = std::make_shared<Linear>(hidden_size, intermediate_size, bias);
+            blocks["up_proj"]   = std::make_shared<Linear>(hidden_size, intermediate_size, bias);
+            blocks["down_proj"] = std::make_shared<Linear>(intermediate_size, hidden_size, bias);
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+            auto gate_layer = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]);
+            auto up_layer   = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]);
+            auto down_layer = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);
+
+            // Gate branch first, then the up projection; their element-wise product feeds down_proj.
+            struct ggml_tensor* gate_act = ggml_silu_inplace(ctx->ggml_ctx, gate_layer->forward(ctx, x));
+            struct ggml_tensor* up_out   = up_layer->forward(ctx, x);
+            struct ggml_tensor* gated    = ggml_mul(ctx->ggml_ctx, up_out, gate_act);
+            return down_layer->forward(ctx, gated);
+        }
+    };
+
     struct ModulationOut {
         ggml_tensor* shift = nullptr;
         ggml_tensor* scale = nullptr;
@@ -199,7 +247,6 @@ namespace Flux {
     struct DoubleStreamBlock : public GGMLBlock {
         bool prune_mod;
         int idx = 0;
-        bool use_mlp_silu_act;
 
     public:
         DoubleStreamBlock(int64_t hidden_size,
@@ -210,10 +257,10 @@ namespace Flux {
                           bool prune_mod        = false,
                           bool share_modulation = false,
                           bool mlp_proj_bias    = true,
+                          bool use_yak_mlp      = false,
                           bool use_mlp_silu_act = false)
-            : idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) {
-            int64_t mlp_hidden_dim  = hidden_size * mlp_ratio;
-            int64_t mlp_mult_factor = use_mlp_silu_act ? 2 : 1;
+            : idx(idx), prune_mod(prune_mod) {
+            int64_t mlp_hidden_dim = hidden_size * mlp_ratio;
 
             if (!prune_mod && !share_modulation) {
                 blocks["img_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
@@ -222,9 +269,11 @@ namespace Flux {
             blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));
 
             blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["img_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
-            // img_mlp.1 is nn.GELU(approximate="tanh")
-            blocks["img_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size, mlp_proj_bias));
+            if (use_yak_mlp) {
+                blocks["img_mlp"] = std::shared_ptr<GGMLBlock>(new YakMLP(hidden_size, mlp_hidden_dim, mlp_proj_bias));
+            } else {
+                blocks["img_mlp"] = std::shared_ptr<GGMLBlock>(new MLP(hidden_size, mlp_hidden_dim, use_mlp_silu_act, mlp_proj_bias));
+            }
 
             if (!prune_mod && !share_modulation) {
                 blocks["txt_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
@@ -233,9 +282,11 @@ namespace Flux {
             blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias));
 
             blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["txt_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias));
-            // img_mlp.1 is nn.GELU(approximate="tanh")
-            blocks["txt_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size, mlp_proj_bias));
+            if (use_yak_mlp) {
+                blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new YakMLP(hidden_size, mlp_hidden_dim, mlp_proj_bias));
+            } else {
+                blocks["txt_mlp"] = std::shared_ptr<GGMLBlock>(new MLP(hidden_size, mlp_hidden_dim, use_mlp_silu_act, mlp_proj_bias));
+            }
         }
 
         std::vector<ModulationOut> get_distil_img_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
@@ -272,15 +323,13 @@ namespace Flux {
             auto img_attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["img_attn"]);
 
             auto img_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["img_norm2"]);
-            auto img_mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.0"]);
-            auto img_mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["img_mlp.2"]);
+            auto img_mlp   = std::dynamic_pointer_cast<UnaryBlock>(blocks["img_mlp"]);
 
             auto txt_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm1"]);
             auto txt_attn  = std::dynamic_pointer_cast<SelfAttention>(blocks["txt_attn"]);
 
             auto txt_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["txt_norm2"]);
-            auto txt_mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.0"]);
-            auto txt_mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["txt_mlp.2"]);
+            auto txt_mlp   = std::dynamic_pointer_cast<UnaryBlock>(blocks["txt_mlp"]);
 
             if (img_mods.empty()) {
                 if (prune_mod) {
@@ -348,27 +397,15 @@ namespace Flux {
             // calculate the img bloks
             img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn->post_attention(ctx, img_attn_out), img_mod1.gate));
 
-            auto img_mlp_out = img_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale));
-            if (use_mlp_silu_act) {
-                img_mlp_out = ggml_ext_silu_act(ctx->ggml_ctx, img_mlp_out);
-            } else {
-                img_mlp_out = ggml_gelu_inplace(ctx->ggml_ctx, img_mlp_out);
-            }
-            img_mlp_out = img_mlp_2->forward(ctx, img_mlp_out);
+            auto img_mlp_out = img_mlp->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale));
 
             img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_mod2.gate));
 
             // calculate the txt bloks
             txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn->post_attention(ctx, txt_attn_out), txt_mod1.gate));
 
-            auto txt_mlp_out = txt_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale));
-            if (use_mlp_silu_act) {
-                txt_mlp_out = ggml_ext_silu_act(ctx->ggml_ctx, txt_mlp_out);
-            } else {
-                txt_mlp_out = ggml_gelu_inplace(ctx->ggml_ctx, txt_mlp_out);
-            }
-            txt_mlp_out = txt_mlp_2->forward(ctx, txt_mlp_out);
-            txt         = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate));
+            auto txt_mlp_out = txt_mlp->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale));
+            txt              = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate));
 
             return {img, txt};
         }
@@ -381,6 +418,7 @@ namespace Flux {
         int64_t mlp_hidden_dim;
         bool prune_mod;
         int idx = 0;
+        bool use_yak_mlp;
         bool use_mlp_silu_act;
         int64_t mlp_mult_factor;
 
@@ -393,8 +431,9 @@ namespace Flux {
                           bool prune_mod        = false,
                           bool share_modulation = false,
                           bool mlp_proj_bias    = true,
+                          bool use_yak_mlp      = false,
                           bool use_mlp_silu_act = false)
-            : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) {
+            : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_yak_mlp(use_yak_mlp), use_mlp_silu_act(use_mlp_silu_act) {
             int64_t head_dim = hidden_size / num_heads;
             float scale      = qk_scale;
             if (scale <= 0.f) {
@@ -402,7 +441,7 @@ namespace Flux {
             }
             mlp_hidden_dim  = hidden_size * mlp_ratio;
             mlp_mult_factor = 1;
-            if (use_mlp_silu_act) {
+            if (use_yak_mlp || use_mlp_silu_act) {
                 mlp_mult_factor = 2;
             }
 
@@ -481,7 +520,9 @@ namespace Flux {
             k                = norm->key_norm(ctx, k);
             auto attn        = Rope::attention(ctx, q, k, v, pe, mask);  // [N, n_token, hidden_size]
 
-            if (use_mlp_silu_act) {
+            if (use_yak_mlp) {
+                mlp = ggml_ext_silu_act(ctx->ggml_ctx, mlp, false);
+            } else if (use_mlp_silu_act) {
                 mlp = ggml_ext_silu_act(ctx->ggml_ctx, mlp);
             } else {
                 mlp = ggml_gelu_inplace(ctx->ggml_ctx, mlp);
@@ -726,6 +767,8 @@ namespace Flux {
         int64_t in_dim              = 64;
         bool disable_bias           = false;
         bool share_modulation       = false;
+        bool semantic_txt_norm      = false;
+        bool use_yak_mlp            = false;
         bool use_mlp_silu_act       = false;
         float ref_index_scale       = 1.f;
         ChromaRadianceParams chroma_radiance_params;
@@ -759,6 +802,9 @@ namespace Flux {
                     blocks["guidance_in"] = std::make_shared<MLPEmbedder>(256, params.hidden_size, !params.disable_bias);
                 }
             }
+            if (params.semantic_txt_norm) {
+                blocks["txt_norm"] = std::make_shared<RMSNorm>(params.context_in_dim);
+            }
             blocks["txt_in"] = std::make_shared<Linear>(params.context_in_dim, params.hidden_size, !params.disable_bias);
 
             for (int i = 0; i < params.depth; i++) {
@@ -770,6 +816,7 @@ namespace Flux {
                                                                                                    params.is_chroma,
                                                                                                    params.share_modulation,
                                                                                                    !params.disable_bias,
+                                                                                                   params.use_yak_mlp,
                                                                                                    params.use_mlp_silu_act);
             }
 
@@ -782,6 +829,7 @@ namespace Flux {
                                                                                                    params.is_chroma,
                                                                                                    params.share_modulation,
                                                                                                    !params.disable_bias,
+                                                                                                   params.use_yak_mlp,
                                                                                                    params.use_mlp_silu_act);
             }
 
@@ -948,6 +996,12 @@ namespace Flux {
                 ss_mods     = single_stream_modulation->forward(ctx, vec);
             }
 
+            if (params.semantic_txt_norm) {
+                auto semantic_txt_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["txt_norm"]);
+
+                txt = semantic_txt_norm->forward(ctx, txt);
+            }
+
             txt = txt_in->forward(ctx, txt);
 
             for (int i = 0; i < params.depth; i++) {
@@ -1206,6 +1260,11 @@ namespace Flux {
             } else if (version == VERSION_CHROMA_RADIANCE) {
                 flux_params.in_channels = 3;
                 flux_params.patch_size  = 16;
+            } else if (version == VERSION_OVIS_IMAGE) {
+                flux_params.semantic_txt_norm = true;
+                flux_params.use_yak_mlp       = true;
+                flux_params.context_in_dim    = 2048;
+                flux_params.vec_in_dim        = 0;
             } else if (sd_version_is_flux2(version)) {
                 flux_params.context_in_dim   = 15360;
                 flux_params.in_channels      = 128;
@@ -1364,13 +1423,22 @@ namespace Flux {
                 ref_latents[i] = to_backend(ref_latents[i]);
             }
 
+            std::set<int> txt_arange_dims;
+            if (sd_version_is_flux2(version)) {
+                txt_arange_dims    = {3};
+                increase_ref_index = true;
+            } else if (version == VERSION_OVIS_IMAGE) {
+                txt_arange_dims = {1, 2};
+            }
+
             pe_vec      = Rope::gen_flux_pe(x->ne[1],
                                             x->ne[0],
                                             flux_params.patch_size,
                                             x->ne[3],
                                             context->ne[1],
+                                            txt_arange_dims,
                                             ref_latents,
-                                       sd_version_is_flux2(version) ? true : increase_ref_index,
+                                            increase_ref_index,
                                             flux_params.ref_index_scale,
                                             flux_params.theta,
                                             flux_params.axes_dim);
diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp
index 3669b17ba..f76aaef42 100644
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@@ -19,7 +19,6 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
-#include <filesystem>
 
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -61,6 +60,14 @@
 #define SD_UNUSED(x) (void)(x)
 #endif
 
+__STATIC_INLINE__ int align_up_offset(int n, int multiple) {  // padding to add to n to reach a multiple of `multiple`; divides by `multiple`, so it must be non-zero
+    return (multiple - n % multiple) % multiple;  // outer % turns the already-aligned case (multiple - 0) into 0
+}
+
+__STATIC_INLINE__ int align_up(int n, int multiple) {  // n plus the padding computed by align_up_offset
+    return n + align_up_offset(n, multiple);  // unchanged when n % multiple == 0
+}
+
 __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG:
@@ -289,12 +296,7 @@ __STATIC_INLINE__ void ggml_ext_tensor_diff(
 }
 
 __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_path) {
-    #ifdef _WIN32
-        std::filesystem::path fpath = std::filesystem::u8path(file_path);
-    #else
-        std::filesystem::path fpath = std::filesystem::path(file_path);
-    #endif
-    std::ifstream file(fpath, std::ios::binary);
+    std::ifstream file(sd_get_u8path(file_path), std::ios::binary);
     if (!file.is_open()) {
         LOG_ERROR("failed to open '%s'", file_path.c_str());
         return nullptr;
@@ -730,34 +732,22 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_slice(struct ggml_context* ctx,
 __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_ext_chunk(struct ggml_context* ctx,
                                                                   struct ggml_tensor* x,
                                                                   int num,
-                                                                  int64_t dim) {
+                                                                  int64_t dim,
+                                                                  bool cont = true) {
     GGML_ASSERT(dim >= 0 && dim < 4);
     GGML_ASSERT(x->ne[dim] % num == 0);
 
-    int perm[4] = {0, 1, 2, 3};
-    for (int i = dim; i < 3; ++i)
-        perm[i] = perm[i + 1];
-    perm[3] = dim;
-
-    int inv_perm[4];
-    for (int i = 0; i < 4; ++i)
-        inv_perm[perm[i]] = i;
-
-    if (dim != 3) {
-        x = ggml_ext_torch_permute(ctx, x, perm[0], perm[1], perm[2], perm[3]);
-        x = ggml_cont(ctx, x);
-    }
-
     std::vector<struct ggml_tensor*> chunks;
-    int64_t chunk_size = x->ne[3] / num;
+    int64_t chunk_size  = x->ne[dim] / num;
+    int64_t stride      = chunk_size * x->nb[dim];
+    int64_t chunk_ne[4] = {x->ne[0], x->ne[1], x->ne[2], x->ne[3]};
+    chunk_ne[dim]       = chunk_size;
     for (int i = 0; i < num; i++) {
         auto chunk = ggml_view_4d(
             ctx, x,
-            x->ne[0], x->ne[1], x->ne[2], chunk_size,
-            x->nb[1], x->nb[2], x->nb[3], x->nb[3] * i * chunk_size);
-
-        if (dim != 3) {
-            chunk = ggml_ext_torch_permute(ctx, chunk, inv_perm[0], inv_perm[1], inv_perm[2], inv_perm[3]);
+            chunk_ne[0], chunk_ne[1], chunk_ne[2], chunk_ne[3],
+            x->nb[1], x->nb[2], x->nb[3], stride * i);
+        if (cont) {
             chunk = ggml_cont(ctx, chunk);
         }
         chunks.push_back(chunk);
@@ -766,17 +756,23 @@ __STATIC_INLINE__ std::vector<struct ggml_tensor*> ggml_ext_chunk(struct ggml_co
     return chunks;
 }
 
-__STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor* x) {
+__STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor* x, bool gate_first = true) {  // gate_first: true -> chunk 0 is the SiLU gate, false -> chunk 1 is
     // x: [ne3, ne2, ne1, ne0]
     // return: [ne3, ne2, ne1, ne0/2]
 
-    auto x_vec = ggml_ext_chunk(ctx, x, 2, 0);
-    auto x1    = x_vec[0];  // [ne3, ne2, ne1, ne0/2]
-    auto x2    = x_vec[1];  // [ne3, ne2, ne1, ne0/2]
+    auto x_vec = ggml_ext_chunk(ctx, x, 2, 0, false);  // split dim 0 into two chunks; cont=false skips the per-chunk ggml_cont
+    ggml_tensor* gate;  // the chunk that goes through SiLU; x is reassigned to the other chunk
+    if (gate_first) {
+        gate = x_vec[0];
+        x    = x_vec[1];
+    } else {
+        x    = x_vec[0];
+        gate = x_vec[1];
+    }
+    gate = ggml_cont(ctx, gate);  // NOTE(review): presumably so the in-place SiLU below does not write into x's buffer — confirm
+    gate = ggml_silu_inplace(ctx, gate);  // only the gate chunk is activated
 
-    x1 = ggml_silu_inplace(ctx, x1);
-
-    x = ggml_mul(ctx, x1, x2);  // [ne3, ne2, ne1, ne0/2]
+    x = ggml_mul(ctx, x, gate);  // [ne3, ne2, ne1, ne0/2]
 
     return x;
 }
@@ -1274,6 +1270,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context
         }
 
         if (mask_in != nullptr) {
+            // the need for padding got removed in ggml 4767bda
+            // ensure we can still use the old version for now
+#ifdef GGML_KQ_MASK_PAD
             int mask_pad = 0;
             if (mask_in->ne[1] % GGML_KQ_MASK_PAD != 0) {
                 mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1];
@@ -1281,6 +1280,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context
             if (mask_pad > 0) {
                 mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0);
             }
+#endif
             mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16);
         }
 
@@ -1392,10 +1392,14 @@ __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backe
 }
 
 __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) {
-    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32);
+    GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32 || tensor->type == GGML_TYPE_BF16);
     float value;
     if (tensor->type == GGML_TYPE_F32) {
         ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
+    } else if (tensor->type == GGML_TYPE_BF16) {
+        ggml_bf16_t bf16_value;
+        ggml_backend_tensor_get(tensor, &bf16_value, 0, sizeof(bf16_value));
+        value = ggml_bf16_to_fp32(bf16_value);
     } else if (tensor->type == GGML_TYPE_F16) {
         ggml_fp16_t f16_value;
         ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
diff --git a/otherarch/sdcpp/gguf_reader.hpp b/otherarch/sdcpp/gguf_reader.hpp
index 53482662e..edf5899a7 100644
--- a/otherarch/sdcpp/gguf_reader.hpp
+++ b/otherarch/sdcpp/gguf_reader.hpp
@@ -171,7 +171,7 @@ private:
 
 public:
     bool load(const std::string& file_path) {
-        std::ifstream fin(file_path, std::ios::binary);
+        std::ifstream fin(sd_get_u8path(file_path), std::ios::binary);
         if (!fin) {
             LOG_ERROR("failed to open '%s'", file_path.c_str());
             return false;
diff --git a/otherarch/sdcpp/latent-preview.h b/otherarch/sdcpp/latent-preview.h
index 97409a7d8..2c54c3b5e 100644
--- a/otherarch/sdcpp/latent-preview.h
+++ b/otherarch/sdcpp/latent-preview.h
@@ -91,6 +91,41 @@ const float flux_latent_rgb_proj[16][3] = {
     {-0.111849f, -0.055589f, -0.032361f}};
 float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
 
+const float flux2_latent_rgb_proj[32][3] = {  // 32 rows of 3 weights; presumably one {r, g, b} row per latent channel, as latent_rgb_proj is indexed in preview_latent_video — confirm
+    {0.000736f, -0.008385f, -0.019710f},
+    {-0.001352f, -0.016392f, 0.020693f},
+    {-0.006376f, 0.002428f, 0.036736f},
+    {0.039384f, 0.074167f, 0.119789f},
+    {0.007464f, -0.005705f, -0.004734f},
+    {-0.004086f, 0.005287f, -0.000409f},
+    {-0.032835f, 0.050802f, -0.028120f},
+    {-0.003158f, -0.000835f, 0.000406f},
+    {-0.112840f, -0.084337f, -0.023083f},
+    {0.001462f, -0.006656f, 0.000549f},
+    {-0.009980f, -0.007480f, 0.009702f},
+    {0.032540f, 0.000214f, -0.061388f},
+    {0.011023f, 0.000694f, 0.007143f},
+    {-0.001468f, -0.006723f, -0.001678f},
+    {-0.005921f, -0.010320f, -0.003907f},
+    {-0.028434f, 0.027584f, 0.018457f},
+    {0.014349f, 0.011523f, 0.000441f},
+    {0.009874f, 0.003081f, 0.001507f},
+    {0.002218f, 0.005712f, 0.001563f},
+    {0.053010f, -0.019844f, 0.008683f},
+    {-0.002507f, 0.005384f, 0.000938f},
+    {-0.002177f, -0.011366f, 0.003559f},
+    {-0.000261f, 0.015121f, -0.003240f},
+    {-0.003944f, -0.002083f, 0.005043f},
+    {-0.009138f, 0.011336f, 0.003781f},
+    {0.011429f, 0.003985f, -0.003855f},
+    {0.010518f, -0.005586f, 0.010131f},
+    {0.007883f, 0.002912f, -0.001473f},
+    {-0.003318f, -0.003160f, 0.003684f},
+    {-0.034560f, -0.008740f, 0.012996f},
+    {0.000166f, 0.001079f, -0.012153f},
+    {0.017772f, 0.000937f, -0.011953f}};
+float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f};  // presumably the {r, g, b} bias paired with flux2_latent_rgb_proj — confirm against the caller
+
 // This one was taken straight from
 // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
 // (MiT Licence)
@@ -128,16 +163,42 @@ const float sd_latent_rgb_proj[4][3] = {
     {-0.178022f, -0.200862f, -0.678514f}};
 float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
 
-void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
+void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) {
     size_t buffer_head = 0;
+
+    uint32_t latent_width  = latents->ne[0];
+    uint32_t latent_height = latents->ne[1];
+    uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];
+    uint32_t frames        = 1;
+    if (ggml_n_dims(latents) == 4) {
+        frames = latents->ne[2];
+    }
+
+    uint32_t rgb_width  = latent_width * patch_size;
+    uint32_t rgb_height = latent_height * patch_size;
+
+    uint32_t unpatched_dim = dim / (patch_size * patch_size);
+
     for (int k = 0; k < frames; k++) {
-        for (int j = 0; j < height; j++) {
-            for (int i = 0; i < width; i++) {
-                size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
+        for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) {
+            for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) {
+                int latent_x = rgb_x / patch_size;
+                int latent_y = rgb_y / patch_size;
+
+                int channel_offset = 0;
+                if (patch_size > 1) {
+                    channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size));
+                }
+
+                size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]);
+
+                // flat output pixel index (frame, then row, then column); not sequential here because rgb_x is the outer loop
+                size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x;
+
                 float r = 0, g = 0, b = 0;
                 if (latent_rgb_proj != nullptr) {
-                    for (int d = 0; d < dim; d++) {
-                        float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
+                    for (int d = 0; d < unpatched_dim; d++) {
+                        float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]);
                         r += value * latent_rgb_proj[d][0];
                         g += value * latent_rgb_proj[d][1];
                         b += value * latent_rgb_proj[d][2];
@@ -164,9 +225,9 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
                 g = g >= 0 ? g <= 1 ? g : 1 : 0;
                 b = b >= 0 ? b <= 1 ? b : 1 : 0;
 
-                buffer[buffer_head++] = (uint8_t)(r * 255);
-                buffer[buffer_head++] = (uint8_t)(g * 255);
-                buffer[buffer_head++] = (uint8_t)(b * 255);
+                buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255);
+                buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255);
+                buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255);
             }
         }
     }
diff --git a/otherarch/sdcpp/llm.hpp b/otherarch/sdcpp/llm.hpp
index aa1e46a2e..f8c03add0 100644
--- a/otherarch/sdcpp/llm.hpp
+++ b/otherarch/sdcpp/llm.hpp
@@ -356,6 +356,10 @@ namespace LLM {
                 "<|fim_pad|>",
                 "<|repo_name|>",
                 "<|file_sep|>",
+                "<tool_response>",
+                "</tool_response>",
+                "<think>",
+                "</think>",
             };
 
             if (merges_utf8_str.size() > 0) {
@@ -859,11 +863,11 @@ namespace LLM {
             }
 
             if (arch == LLMArch::MISTRAL_SMALL_3_2) {
-                q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 131072, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
-                k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 131072, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
+                q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 8192, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
+                k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 8192, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
             } else if (arch == LLMArch::QWEN3) {
-                q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 151936, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
-                k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 151936, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
+                q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
+                k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
             } else {
                 int sections[4] = {16, 24, 24, 0};
                 q               = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
@@ -1073,29 +1077,22 @@ namespace LLM {
             : GGMLRunner(backend, offload_params_to_cpu), enable_vision(enable_vision_) {
             params.arch = arch;
             if (arch == LLMArch::MISTRAL_SMALL_3_2) {
-                params.num_layers        = 40;
-                params.hidden_size       = 5120;
-                params.intermediate_size = 32768;
-                params.head_dim          = 128;
-                params.num_heads         = 32;
-                params.num_kv_heads      = 8;
-                params.qkv_bias          = false;
-                params.vocab_size        = 131072;
-                params.rms_norm_eps      = 1e-5f;
+                params.head_dim     = 128;
+                params.num_heads    = 32;
+                params.num_kv_heads = 8;
+                params.qkv_bias     = false;
+                params.rms_norm_eps = 1e-5f;
             } else if (arch == LLMArch::QWEN3) {
-                params.num_layers        = 36;
-                params.hidden_size       = 2560;
-                params.intermediate_size = 9728;
-                params.head_dim          = 128;
-                params.num_heads         = 32;
-                params.num_kv_heads      = 8;
-                params.qkv_bias          = false;
-                params.qk_norm           = true;
-                params.vocab_size        = 151936;
-                params.rms_norm_eps      = 1e-6f;
+                params.head_dim     = 128;
+                params.num_heads    = 32;
+                params.num_kv_heads = 8;
+                params.qkv_bias     = false;
+                params.qk_norm      = true;
+                params.rms_norm_eps = 1e-6f;
             }
             bool have_vision_weight = false;
             bool llama_cpp_style    = false;
+            params.num_layers       = 0;
             for (auto pair : tensor_storage_map) {
                 std::string tensor_name = pair.first;
                 if (tensor_name.find(prefix) == std::string::npos)
@@ -1105,10 +1102,36 @@ namespace LLM {
                     have_vision_weight = true;
                     if (contains(tensor_name, "attn.q_proj")) {
                         llama_cpp_style = true;
-                        break;
+                    }
+                    continue;
+                }
+                pos = tensor_name.find("layers.");
+                if (pos != std::string::npos) {
+                    tensor_name = tensor_name.substr(pos);  // remove prefix
+                    auto items  = split_string(tensor_name, '.');
+                    if (items.size() > 1) {
+                        int block_index = atoi(items[1].c_str());
+                        if (block_index + 1 > params.num_layers) {
+                            params.num_layers = block_index + 1;
+                        }
                     }
                 }
+                if (contains(tensor_name, "embed_tokens.weight")) {
+                    params.hidden_size = pair.second.ne[0];
+                    params.vocab_size  = pair.second.ne[1];
+                }
+                if (contains(tensor_name, "layers.0.mlp.gate_proj.weight")) {
+                    params.intermediate_size = pair.second.ne[1];
+                }
             }
+            if (arch == LLMArch::QWEN3 && params.num_layers == 28) {  // Qwen3 2B
+                params.num_heads = 16;
+            }
+            LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64,
+                      params.num_layers,
+                      params.vocab_size,
+                      params.hidden_size,
+                      params.intermediate_size);
             if (enable_vision && !have_vision_weight) {
                 LOG_WARN("no vision weights detected, vision disabled");
                 enable_vision = false;
diff --git a/otherarch/sdcpp/main.cpp b/otherarch/sdcpp/main.cpp
index c58da7940..e37434b71 100644
--- a/otherarch/sdcpp/main.cpp
+++ b/otherarch/sdcpp/main.cpp
@@ -15,38 +15,10 @@
 // #include "preprocessing.hpp"
 #include "stable-diffusion.h"
 
-#define STB_IMAGE_IMPLEMENTATION
-//#define STB_IMAGE_STATIC
-#include "stb_image.h"
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-//#define STB_IMAGE_WRITE_STATIC
-#include "stb_image_write.h"
-
-#define STB_IMAGE_RESIZE_IMPLEMENTATION
-//#define STB_IMAGE_RESIZE_STATIC
-#include "stb_image_resize.h"
+#include "common/common.hpp"
 
 #include "avi_writer.h"
 
-#if defined(_WIN32)
-#define NOMINMAX
-#include <windows.h>
-#endif  // _WIN32
-
-#define SAFE_STR(s) ((s) ? (s) : "")
-#define BOOL_STR(b) ((b) ? "true" : "false")
-
-namespace fs = std::filesystem;
-
-const char* modes_str[] = {
-    "img_gen",
-    "vid_gen",
-    "convert",
-    "upscale",
-};
-#define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale"
-
 const char* previews_str[] = {
     "none",
     "proj",
@@ -54,276 +26,12 @@ const char* previews_str[] = {
     "vae",
 };
 
-enum SDMode {
-    IMG_GEN,
-    VID_GEN,
-    CONVERT,
-    UPSCALE,
-    MODE_COUNT
-};
-
-#if defined(_WIN32)
-static std::string utf16_to_utf8(const std::wstring& wstr) {
-    if (wstr.empty())
-        return {};
-    int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(),
-                                          nullptr, 0, nullptr, nullptr);
-    if (size_needed <= 0)
-        throw std::runtime_error("UTF-16 to UTF-8 conversion failed");
-
-    std::string utf8(size_needed, 0);
-    WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(),
-                        (char*)utf8.data(), size_needed, nullptr, nullptr);
-    return utf8;
-}
-
-static std::string argv_to_utf8(int index, const char** argv) {
-    int argc;
-    wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc);
-    if (!argv_w)
-        throw std::runtime_error("Failed to parse command line");
-
-    std::string result;
-    if (index < argc) {
-        result = utf16_to_utf8(argv_w[index]);
-    }
-    LocalFree(argv_w);
-    return result;
-}
-
-#else  // Linux / macOS
-static std::string argv_to_utf8(int index, const char** argv) {
-    return std::string(argv[index]);
-}
-
-#endif
-
-struct StringOption {
-    std::string short_name;
-    std::string long_name;
-    std::string desc;
-    std::string* target;
-};
-
-struct IntOption {
-    std::string short_name;
-    std::string long_name;
-    std::string desc;
-    int* target;
-};
-
-struct FloatOption {
-    std::string short_name;
-    std::string long_name;
-    std::string desc;
-    float* target;
-};
-
-struct BoolOption {
-    std::string short_name;
-    std::string long_name;
-    std::string desc;
-    bool keep_true;
-    bool* target;
-};
-
-struct ManualOption {
-    std::string short_name;
-    std::string long_name;
-    std::string desc;
-    std::function<int(int argc, const char** argv, int index)> cb;
-};
-
-struct ArgOptions {
-    std::vector<StringOption> string_options;
-    std::vector<IntOption> int_options;
-    std::vector<FloatOption> float_options;
-    std::vector<BoolOption> bool_options;
-    std::vector<ManualOption> manual_options;
-
-    static std::string wrap_text(const std::string& text, size_t width, size_t indent) {
-        std::ostringstream oss;
-        size_t line_len = 0;
-        size_t pos      = 0;
-
-        while (pos < text.size()) {
-            // Preserve manual newlines
-            if (text[pos] == '\n') {
-                oss << '\n'
-                    << std::string(indent, ' ');
-                line_len = indent;
-                ++pos;
-                continue;
-            }
-
-            // Add the character
-            oss << text[pos];
-            ++line_len;
-            ++pos;
-
-            // If the current line exceeds width, try to break at the last space
-            if (line_len >= width) {
-                std::string current = oss.str();
-                size_t back         = current.size();
-
-                // Find the last space (for a clean break)
-                while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n')
-                    --back;
-
-                // If found a space to break on
-                if (back > 0 && current[back - 1] != '\n') {
-                    std::string before = current.substr(0, back - 1);
-                    std::string after  = current.substr(back);
-                    oss.str("");
-                    oss.clear();
-                    oss << before << "\n"
-                        << std::string(indent, ' ') << after;
-                } else {
-                    // If no space found, just break at width
-                    oss << "\n"
-                        << std::string(indent, ' ');
-                }
-                line_len = indent;
-            }
-        }
-
-        return oss.str();
-    }
-
-    void print() const {
-        constexpr size_t max_line_width = 120;
-
-        struct Entry {
-            std::string names;
-            std::string desc;
-        };
-        std::vector<Entry> entries;
-
-        auto add_entry = [&](const std::string& s, const std::string& l,
-                             const std::string& desc, const std::string& hint = "") {
-            std::ostringstream ss;
-            if (!s.empty())
-                ss << s;
-            if (!s.empty() && !l.empty())
-                ss << ", ";
-            if (!l.empty())
-                ss << l;
-            if (!hint.empty())
-                ss << " " << hint;
-            entries.push_back({ss.str(), desc});
-        };
-
-        for (auto& o : string_options)
-            add_entry(o.short_name, o.long_name, o.desc, "<string>");
-        for (auto& o : int_options)
-            add_entry(o.short_name, o.long_name, o.desc, "<int>");
-        for (auto& o : float_options)
-            add_entry(o.short_name, o.long_name, o.desc, "<float>");
-        for (auto& o : bool_options)
-            add_entry(o.short_name, o.long_name, o.desc, "");
-        for (auto& o : manual_options)
-            add_entry(o.short_name, o.long_name, o.desc);
-
-        size_t max_name_width = 0;
-        for (auto& e : entries)
-            max_name_width = std::max(max_name_width, e.names.size());
-
-        for (auto& e : entries) {
-            size_t indent            = 2 + max_name_width + 4;
-            size_t desc_width        = (max_line_width > indent ? max_line_width - indent : 40);
-            std::string wrapped_desc = wrap_text(e.desc, max_line_width, indent);
-            std::cout << "  " << std::left << std::setw(static_cast<int>(max_name_width) + 4)
-                      << e.names << wrapped_desc << "\n";
-        }
-    }
-};
-
-bool parse_options(int argc, const char** argv, const std::vector<ArgOptions>& options_list) {
-    bool invalid_arg = false;
-    std::string arg;
-
-    auto match_and_apply = [&](auto& opts, auto&& apply_fn) -> bool {
-        for (auto& option : opts) {
-            if ((option.short_name.size() > 0 && arg == option.short_name) ||
-                (option.long_name.size() > 0 && arg == option.long_name)) {
-                apply_fn(option);
-                return true;
-            }
-        }
-        return false;
-    };
-
-    for (int i = 1; i < argc; i++) {
-        arg            = argv[i];
-        bool found_arg = false;
-
-        for (auto& options : options_list) {
-            if (match_and_apply(options.string_options, [&](auto& option) {
-                    if (++i >= argc) {
-                        invalid_arg = true;
-                        return;
-                    }
-                    *option.target = argv_to_utf8(i, argv);
-                    found_arg      = true;
-                }))
-                break;
-
-            if (match_and_apply(options.int_options, [&](auto& option) {
-                    if (++i >= argc) {
-                        invalid_arg = true;
-                        return;
-                    }
-                    *option.target = std::stoi(argv[i]);
-                    found_arg      = true;
-                }))
-                break;
-
-            if (match_and_apply(options.float_options, [&](auto& option) {
-                    if (++i >= argc) {
-                        invalid_arg = true;
-                        return;
-                    }
-                    *option.target = std::stof(argv[i]);
-                    found_arg      = true;
-                }))
-                break;
-
-            if (match_and_apply(options.bool_options, [&](auto& option) {
-                    *option.target = option.keep_true ? true : false;
-                    found_arg      = true;
-                }))
-                break;
-
-            if (match_and_apply(options.manual_options, [&](auto& option) {
-                    int ret = option.cb(argc, argv, i);
-                    if (ret < 0) {
-                        invalid_arg = true;
-                        return;
-                    }
-                    i += ret;
-                    found_arg = true;
-                }))
-                break;
-        }
-
-        if (invalid_arg) {
-            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-            return false;
-        }
-        if (!found_arg) {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
-
 struct SDCliParams {
     SDMode mode             = IMG_GEN;
     std::string output_path = "output.png";
 
     bool verbose          = false;
+    bool version          = false;
     bool canny_preprocess = false;
 
     preview_t preview_method = PREVIEW_NONE;
@@ -366,6 +74,10 @@ struct SDCliParams {
              "--verbose",
              "print extra info",
              true, &verbose},
+            {"",
+             "--version",
+             "print stable-diffusion.cpp version",
+             true, &version},
             {"",
              "--color",
              "colors the logging tags according to level",
@@ -480,1066 +192,8 @@ struct SDCliParams {
     }
 };
 
-struct SDContextParams {
-    int n_threads = -1;
-    std::string model_path;
-    std::string clip_l_path;
-    std::string clip_g_path;
-    std::string clip_vision_path;
-    std::string t5xxl_path;
-    std::string llm_path;
-    std::string llm_vision_path;
-    std::string diffusion_model_path;
-    std::string high_noise_diffusion_model_path;
-    std::string vae_path;
-    std::string taesd_path;
-    std::string esrgan_path;
-    std::string control_net_path;
-    std::string embedding_dir;
-    std::string photo_maker_path;
-    sd_type_t wtype = SD_TYPE_COUNT;
-    std::string tensor_type_rules;
-    std::string lora_model_dir;
-
-    rng_type_t rng_type         = CUDA_RNG;
-    rng_type_t sampler_rng_type = RNG_TYPE_COUNT;
-    bool offload_params_to_cpu  = false;
-    bool control_net_cpu        = false;
-    bool clip_on_cpu            = false;
-    bool vae_on_cpu             = false;
-    bool diffusion_flash_attn   = false;
-    bool diffusion_conv_direct  = false;
-    bool vae_conv_direct        = false;
-
-    bool chroma_use_dit_mask = true;
-    bool chroma_use_t5_mask  = false;
-    int chroma_t5_mask_pad   = 1;
-
-    prediction_t prediction           = PREDICTION_COUNT;
-    lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
-
-    sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
-    bool force_sdxl_vae_conv_scale       = false;
-
-    float flow_shift = INFINITY;
-
-    ArgOptions get_options() {
-        ArgOptions options;
-        options.string_options = {
-            {"-m",
-             "--model",
-             "path to full model",
-             &model_path},
-            {"",
-             "--clip_l",
-             "path to the clip-l text encoder", &clip_l_path},
-            {"", "--clip_g",
-             "path to the clip-g text encoder",
-             &clip_g_path},
-            {"",
-             "--clip_vision",
-             "path to the clip-vision encoder",
-             &clip_vision_path},
-            {"",
-             "--t5xxl",
-             "path to the t5xxl text encoder",
-             &t5xxl_path},
-            {"",
-             "--llm",
-             "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)",
-             &llm_path},
-            {"",
-             "--llm_vision",
-             "path to the llm vit",
-             &llm_vision_path},
-            {"",
-             "--qwen2vl",
-             "alias of --llm. Deprecated.",
-             &llm_path},
-            {"",
-             "--qwen2vl_vision",
-             "alias of --llm_vision. Deprecated.",
-             &llm_vision_path},
-            {"",
-             "--diffusion-model",
-             "path to the standalone diffusion model",
-             &diffusion_model_path},
-            {"",
-             "--high-noise-diffusion-model",
-             "path to the standalone high noise diffusion model",
-             &high_noise_diffusion_model_path},
-            {"",
-             "--vae",
-             "path to standalone vae model",
-             &vae_path},
-            {"",
-             "--taesd",
-             "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)",
-             &taesd_path},
-            {"",
-             "--control-net",
-             "path to control net model",
-             &control_net_path},
-            {"",
-             "--embd-dir",
-             "embeddings directory",
-             &embedding_dir},
-            {"",
-             "--lora-model-dir",
-             "lora model directory",
-             &lora_model_dir},
-
-            {"",
-             "--tensor-type-rules",
-             "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")",
-             &tensor_type_rules},
-            {"",
-             "--photo-maker",
-             "path to PHOTOMAKER model",
-             &photo_maker_path},
-            {"",
-             "--upscale-model",
-             "path to esrgan model.",
-             &esrgan_path},
-        };
-
-        options.int_options = {
-            {"-t",
-             "--threads",
-             "number of threads to use during computation (default: -1). "
-             "If threads <= 0, then threads will be set to the number of CPU physical cores",
-             &n_threads},
-            {"",
-             "--chroma-t5-mask-pad",
-             "t5 mask pad size of chroma",
-             &chroma_t5_mask_pad},
-        };
-
-        options.float_options = {
-            {"",
-             "--vae-tile-overlap",
-             "tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
-             &vae_tiling_params.target_overlap},
-            {"",
-             "--flow-shift",
-             "shift value for Flow models like SD3.x or WAN (default: auto)",
-             &flow_shift},
-        };
-
-        options.bool_options = {
-            {"",
-             "--vae-tiling",
-             "process vae in tiles to reduce memory usage",
-             true, &vae_tiling_params.enabled},
-            {"",
-             "--force-sdxl-vae-conv-scale",
-             "force use of conv scale on sdxl vae",
-             true, &force_sdxl_vae_conv_scale},
-            {"",
-             "--offload-to-cpu",
-             "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed",
-             true, &offload_params_to_cpu},
-            {"",
-             "--control-net-cpu",
-             "keep controlnet in cpu (for low vram)",
-             true, &control_net_cpu},
-            {"",
-             "--clip-on-cpu",
-             "keep clip in cpu (for low vram)",
-             true, &clip_on_cpu},
-            {"",
-             "--vae-on-cpu",
-             "keep vae in cpu (for low vram)",
-             true, &vae_on_cpu},
-            {"",
-             "--diffusion-fa",
-             "use flash attention in the diffusion model",
-             true, &diffusion_flash_attn},
-            {"",
-             "--diffusion-conv-direct",
-             "use ggml_conv2d_direct in the diffusion model",
-             true, &diffusion_conv_direct},
-            {"",
-             "--vae-conv-direct",
-             "use ggml_conv2d_direct in the vae model",
-             true, &vae_conv_direct},
-            {"",
-             "--chroma-disable-dit-mask",
-             "disable dit mask for chroma",
-             false, &chroma_use_dit_mask},
-            {"",
-             "--chroma-enable-t5-mask",
-             "enable t5 mask for chroma",
-             true, &chroma_use_t5_mask},
-        };
-
-        auto on_type_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            const char* arg = argv[index];
-            wtype           = str_to_sd_type(arg);
-            if (wtype == SD_TYPE_COUNT) {
-                fprintf(stderr, "error: invalid weight format %s\n",
-                        arg);
-                return -1;
-            }
-            return 1;
-        };
-
-        auto on_rng_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            const char* arg = argv[index];
-            rng_type        = str_to_rng_type(arg);
-            if (rng_type == RNG_TYPE_COUNT) {
-                fprintf(stderr, "error: invalid rng type %s\n",
-                        arg);
-                return -1;
-            }
-            return 1;
-        };
-
-        auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            const char* arg  = argv[index];
-            sampler_rng_type = str_to_rng_type(arg);
-            if (sampler_rng_type == RNG_TYPE_COUNT) {
-                fprintf(stderr, "error: invalid sampler rng type %s\n",
-                        arg);
-                return -1;
-            }
-            return 1;
-        };
-
-        auto on_prediction_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            const char* arg = argv[index];
-            prediction      = str_to_prediction(arg);
-            if (prediction == PREDICTION_COUNT) {
-                fprintf(stderr, "error: invalid prediction type %s\n",
-                        arg);
-                return -1;
-            }
-            return 1;
-        };
-
-        auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            const char* arg = argv[index];
-            lora_apply_mode = str_to_lora_apply_mode(arg);
-            if (lora_apply_mode == LORA_APPLY_MODE_COUNT) {
-                fprintf(stderr, "error: invalid lora apply model %s\n",
-                        arg);
-                return -1;
-            }
-            return 1;
-        };
-
-        auto on_tile_size_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            std::string tile_size_str = argv[index];
-            size_t x_pos              = tile_size_str.find('x');
-            try {
-                if (x_pos != std::string::npos) {
-                    std::string tile_x_str        = tile_size_str.substr(0, x_pos);
-                    std::string tile_y_str        = tile_size_str.substr(x_pos + 1);
-                    vae_tiling_params.tile_size_x = std::stoi(tile_x_str);
-                    vae_tiling_params.tile_size_y = std::stoi(tile_y_str);
-                } else {
-                    vae_tiling_params.tile_size_x = vae_tiling_params.tile_size_y = std::stoi(tile_size_str);
-                }
-            } catch (const std::invalid_argument&) {
-                return -1;
-            } catch (const std::out_of_range&) {
-                return -1;
-            }
-            return 1;
-        };
-
-        auto on_relative_tile_size_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            std::string rel_size_str = argv[index];
-            size_t x_pos             = rel_size_str.find('x');
-            try {
-                if (x_pos != std::string::npos) {
-                    std::string rel_x_str        = rel_size_str.substr(0, x_pos);
-                    std::string rel_y_str        = rel_size_str.substr(x_pos + 1);
-                    vae_tiling_params.rel_size_x = std::stof(rel_x_str);
-                    vae_tiling_params.rel_size_y = std::stof(rel_y_str);
-                } else {
-                    vae_tiling_params.rel_size_x = vae_tiling_params.rel_size_y = std::stof(rel_size_str);
-                }
-            } catch (const std::invalid_argument&) {
-                return -1;
-            } catch (const std::out_of_range&) {
-                return -1;
-            }
-            return 1;
-        };
-
-        options.manual_options = {
-            {"",
-             "--type",
-             "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). "
-             "If not specified, the default is the type of the weight file",
-             on_type_arg},
-            {"",
-             "--rng",
-             "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)",
-             on_rng_arg},
-            {"",
-             "--sampler-rng",
-             "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng",
-             on_sampler_rng_arg},
-            {"",
-             "--prediction",
-             "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]",
-             on_prediction_arg},
-            {"",
-             "--lora-apply-mode",
-             "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. "
-             "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used."
-             "The immediately mode may have precision and compatibility issues with quantized parameters, "
-             "but it usually offers faster inference speed and, in some cases, lower memory usage. "
-             "The at_runtime mode, on the other hand, is exactly the opposite.",
-             on_lora_apply_mode_arg},
-            {"",
-             "--vae-tile-size",
-             "tile size for vae tiling, format [X]x[Y] (default: 32x32)",
-             on_tile_size_arg},
-            {"",
-             "--vae-relative-tile-size",
-             "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
-             on_relative_tile_size_arg},
-        };
-
-        return options;
-    }
-
-    bool process_and_check(SDMode mode) {
-        if (mode != UPSCALE && model_path.length() == 0 && diffusion_model_path.length() == 0) {
-            fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
-            return false;
-        }
-
-        if (mode == UPSCALE) {
-            if (esrgan_path.length() == 0) {
-                fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n");
-                return false;
-            }
-        }
-
-        if (n_threads <= 0) {
-            n_threads = sd_get_num_physical_cores();
-        }
-
-        return true;
-    }
-
-    std::string to_string() const {
-        std::ostringstream oss;
-        oss << "SDContextParams {\n"
-            << "  n_threads: " << n_threads << ",\n"
-            << "  model_path: \"" << model_path << "\",\n"
-            << "  clip_l_path: \"" << clip_l_path << "\",\n"
-            << "  clip_g_path: \"" << clip_g_path << "\",\n"
-            << "  clip_vision_path: \"" << clip_vision_path << "\",\n"
-            << "  t5xxl_path: \"" << t5xxl_path << "\",\n"
-            << "  llm_path: \"" << llm_path << "\",\n"
-            << "  llm_vision_path: \"" << llm_vision_path << "\",\n"
-            << "  diffusion_model_path: \"" << diffusion_model_path << "\",\n"
-            << "  high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n"
-            << "  vae_path: \"" << vae_path << "\",\n"
-            << "  taesd_path: \"" << taesd_path << "\",\n"
-            << "  esrgan_path: \"" << esrgan_path << "\",\n"
-            << "  control_net_path: \"" << control_net_path << "\",\n"
-            << "  embedding_dir: \"" << embedding_dir << "\",\n"
-            << "  wtype: " << sd_type_name(wtype) << ",\n"
-            << "  tensor_type_rules: \"" << tensor_type_rules << "\",\n"
-            << "  lora_model_dir: \"" << lora_model_dir << "\",\n"
-            << "  photo_maker_path: \"" << photo_maker_path << "\",\n"
-            << "  rng_type: " << sd_rng_type_name(rng_type) << ",\n"
-            << "  sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
-            << "  flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n"
-            << "  offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
-            << "  control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
-            << "  clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
-            << "  vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
-            << "  diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
-            << "  diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
-            << "  vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n"
-            << "  chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n"
-            << "  chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n"
-            << "  chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n"
-            << "  prediction: " << sd_prediction_name(prediction) << ",\n"
-            << "  lora_apply_mode: " << sd_lora_apply_mode_name(lora_apply_mode) << ",\n"
-            << "  vae_tiling_params: { "
-            << vae_tiling_params.enabled << ", "
-            << vae_tiling_params.tile_size_x << ", "
-            << vae_tiling_params.tile_size_y << ", "
-            << vae_tiling_params.target_overlap << ", "
-            << vae_tiling_params.rel_size_x << ", "
-            << vae_tiling_params.rel_size_y << " },\n"
-            << "  force_sdxl_vae_conv_scale: " << (force_sdxl_vae_conv_scale ? "true" : "false") << "\n"
-            << "}";
-        return oss.str();
-    }
-
-    sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) {
-        sd_ctx_params_t sd_ctx_params = {
-            model_path.c_str(),
-            clip_l_path.c_str(),
-            clip_g_path.c_str(),
-            clip_vision_path.c_str(),
-            t5xxl_path.c_str(),
-            llm_path.c_str(),
-            llm_vision_path.c_str(),
-            diffusion_model_path.c_str(),
-            high_noise_diffusion_model_path.c_str(),
-            vae_path.c_str(),
-            taesd_path.c_str(),
-            control_net_path.c_str(),
-            lora_model_dir.c_str(),
-            embedding_dir.c_str(),
-            photo_maker_path.c_str(),
-            tensor_type_rules.c_str(),
-            vae_decode_only,
-            free_params_immediately,
-            n_threads,
-            wtype,
-            rng_type,
-            sampler_rng_type,
-            prediction,
-            lora_apply_mode,
-            offload_params_to_cpu,
-            clip_on_cpu,
-            control_net_cpu,
-            vae_on_cpu,
-            diffusion_flash_attn,
-            taesd_preview,
-            diffusion_conv_direct,
-            vae_conv_direct,
-            force_sdxl_vae_conv_scale,
-            chroma_use_dit_mask,
-            chroma_use_t5_mask,
-            chroma_t5_mask_pad,
-            flow_shift,
-        };
-        return sd_ctx_params;
-    }
-};
-
-template <typename T>
-static std::string vec_to_string(const std::vector<T>& v) {
-    std::ostringstream oss;
-    oss << "[";
-    for (size_t i = 0; i < v.size(); i++) {
-        oss << v[i];
-        if (i + 1 < v.size())
-            oss << ", ";
-    }
-    oss << "]";
-    return oss.str();
-}
-
-static std::string vec_str_to_string(const std::vector<std::string>& v) {
-    std::ostringstream oss;
-    oss << "[";
-    for (size_t i = 0; i < v.size(); i++) {
-        oss << "\"" << v[i] << "\"";
-        if (i + 1 < v.size())
-            oss << ", ";
-    }
-    oss << "]";
-    return oss.str();
-}
-
-struct SDGenerationParams {
-    std::string prompt;
-    std::string negative_prompt;
-    int clip_skip   = -1;  // <= 0 represents unspecified
-    int width       = 512;
-    int height      = 512;
-    int batch_count = 1;
-    std::string init_image_path;
-    std::string end_image_path;
-    std::string mask_image_path;
-    std::string control_image_path;
-    std::vector<std::string> ref_image_paths;
-    std::string control_video_path;
-    bool auto_resize_ref_image = true;
-    bool increase_ref_index    = false;
-
-    std::vector<int> skip_layers = {7, 8, 9};
-    sd_sample_params_t sample_params;
-
-    std::vector<int> high_noise_skip_layers = {7, 8, 9};
-    sd_sample_params_t high_noise_sample_params;
-
-    std::string easycache_option;
-    sd_easycache_params_t easycache_params;
-
-    float moe_boundary  = 0.875f;
-    int video_frames    = 1;
-    int fps             = 16;
-    float vace_strength = 1.f;
-
-    float strength         = 0.75f;
-    float control_strength = 0.9f;
-
-    int64_t seed = 42;
-
-    // Photo Maker
-    std::string pm_id_images_dir;
-    std::string pm_id_embed_path;
-    float pm_style_strength = 20.f;
-
-    int upscale_repeats = 1;
-
-    SDGenerationParams() {
-        sd_sample_params_init(&sample_params);
-        sd_sample_params_init(&high_noise_sample_params);
-    }
-
-    ArgOptions get_options() {
-        ArgOptions options;
-        options.string_options = {
-            {"-p",
-             "--prompt",
-             "the prompt to render",
-             &prompt},
-            {"-n",
-             "--negative-prompt",
-             "the negative prompt (default: \"\")",
-             &negative_prompt},
-            {"-i",
-             "--init-img",
-             "path to the init image",
-             &init_image_path},
-            {"",
-             "--end-img",
-             "path to the end image, required by flf2v",
-             &end_image_path},
-            {"",
-             "--mask",
-             "path to the mask image",
-             &mask_image_path},
-            {"",
-             "--control-image",
-             "path to control image, control net",
-             &control_image_path},
-            {"",
-             "--control-video",
-             "path to control video frames, It must be a directory path. The video frames inside should be stored as images in "
-             "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images "
-             "such as 00.png, 01.png, ... etc.",
-             &control_video_path},
-            {"",
-             "--pm-id-images-dir",
-             "path to PHOTOMAKER input id images dir",
-             &pm_id_images_dir},
-            {"",
-             "--pm-id-embed-path",
-             "path to PHOTOMAKER v2 id embed",
-             &pm_id_embed_path},
-        };
-
-        options.int_options = {
-            {"-H",
-             "--height",
-             "image height, in pixel space (default: 512)",
-             &height},
-            {"-W",
-             "--width",
-             "image width, in pixel space (default: 512)",
-             &width},
-            {"",
-             "--steps",
-             "number of sample steps (default: 20)",
-             &sample_params.sample_steps},
-            {"",
-             "--high-noise-steps",
-             "(high noise) number of sample steps (default: -1 = auto)",
-             &high_noise_sample_params.sample_steps},
-            {"",
-             "--clip-skip",
-             "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). "
-             "<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x",
-             &clip_skip},
-            {"-b",
-             "--batch-count",
-             "batch count",
-             &batch_count},
-            {"",
-             "--video-frames",
-             "video frames (default: 1)",
-             &video_frames},
-            {"",
-             "--fps",
-             "fps (default: 24)",
-             &fps},
-            {"",
-             "--timestep-shift",
-             "shift timestep for NitroFusion models (default: 0). "
-             "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant",
-             &sample_params.shifted_timestep},
-            {"",
-             "--upscale-repeats",
-             "Run the ESRGAN upscaler this many times (default: 1)",
-             &upscale_repeats},
-        };
-
-        options.float_options = {
-            {"",
-             "--cfg-scale",
-             "unconditional guidance scale: (default: 7.0)",
-             &sample_params.guidance.txt_cfg},
-            {"",
-             "--img-cfg-scale",
-             "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)",
-             &sample_params.guidance.img_cfg},
-            {"",
-             "--guidance",
-             "distilled guidance scale for models with guidance input (default: 3.5)",
-             &sample_params.guidance.distilled_guidance},
-            {"",
-             "--slg-scale",
-             "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium",
-             &sample_params.guidance.slg.scale},
-            {"",
-             "--skip-layer-start",
-             "SLG enabling point (default: 0.01)",
-             &sample_params.guidance.slg.layer_start},
-            {"",
-             "--skip-layer-end",
-             "SLG disabling point (default: 0.2)",
-             &sample_params.guidance.slg.layer_end},
-            {"",
-             "--eta",
-             "eta in DDIM, only for DDIM and TCD (default: 0)",
-             &sample_params.eta},
-            {"",
-             "--high-noise-cfg-scale",
-             "(high noise) unconditional guidance scale: (default: 7.0)",
-             &high_noise_sample_params.guidance.txt_cfg},
-            {"",
-             "--high-noise-img-cfg-scale",
-             "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)",
-             &high_noise_sample_params.guidance.img_cfg},
-            {"",
-             "--high-noise-guidance",
-             "(high noise) distilled guidance scale for models with guidance input (default: 3.5)",
-             &high_noise_sample_params.guidance.distilled_guidance},
-            {"",
-             "--high-noise-slg-scale",
-             "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)",
-             &high_noise_sample_params.guidance.slg.scale},
-            {"",
-             "--high-noise-skip-layer-start",
-             "(high noise) SLG enabling point (default: 0.01)",
-             &high_noise_sample_params.guidance.slg.layer_start},
-            {"",
-             "--high-noise-skip-layer-end",
-             "(high noise) SLG disabling point (default: 0.2)",
-             &high_noise_sample_params.guidance.slg.layer_end},
-            {"",
-             "--high-noise-eta",
-             "(high noise) eta in DDIM, only for DDIM and TCD (default: 0)",
-             &high_noise_sample_params.eta},
-            {"",
-             "--strength",
-             "strength for noising/unnoising (default: 0.75)",
-             &strength},
-            {"",
-             "--pm-style-strength",
-             "",
-             &pm_style_strength},
-            {"",
-             "--control-strength",
-             "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image",
-             &control_strength},
-            {"",
-             "--moe-boundary",
-             "timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1",
-             &moe_boundary},
-            {"",
-             "--vace-strength",
-             "wan vace strength",
-             &vace_strength},
-        };
-
-        options.bool_options = {
-            {"",
-             "--increase-ref-index",
-             "automatically increase the indices of references images based on the order they are listed (starting with 1).",
-             true,
-             &increase_ref_index},
-            {"",
-             "--disable-auto-resize-ref-image",
-             "disable auto resize of ref images",
-             false,
-             &auto_resize_ref_image},
-        };
-
-        auto on_seed_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            seed = std::stoll(argv[index]);
-            return 1;
-        };
-
-        auto on_sample_method_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            const char* arg             = argv[index];
-            sample_params.sample_method = str_to_sample_method(arg);
-            if (sample_params.sample_method == SAMPLE_METHOD_COUNT) {
-                fprintf(stderr, "error: invalid sample method %s\n",
-                        arg);
-                return -1;
-            }
-            return 1;
-        };
-
-        auto on_high_noise_sample_method_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            const char* arg                        = argv[index];
-            high_noise_sample_params.sample_method = str_to_sample_method(arg);
-            if (high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) {
-                fprintf(stderr, "error: invalid high noise sample method %s\n",
-                        arg);
-                return -1;
-            }
-            return 1;
-        };
-
-        auto on_scheduler_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            const char* arg         = argv[index];
-            sample_params.scheduler = str_to_scheduler(arg);
-            if (sample_params.scheduler == SCHEDULER_COUNT) {
-                fprintf(stderr, "error: invalid scheduler %s\n",
-                        arg);
-                return -1;
-            }
-            return 1;
-        };
-
-        auto on_skip_layers_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            std::string layers_str = argv[index];
-            if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') {
-                return -1;
-            }
-
-            layers_str = layers_str.substr(1, layers_str.size() - 2);
-
-            std::regex regex("[, ]+");
-            std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1);
-            std::sregex_token_iterator end;
-            std::vector<std::string> tokens(iter, end);
-            std::vector<int> layers;
-            for (const auto& token : tokens) {
-                try {
-                    layers.push_back(std::stoi(token));
-                } catch (const std::invalid_argument&) {
-                    return -1;
-                }
-            }
-            skip_layers = layers;
-            return 1;
-        };
-
-        auto on_high_noise_skip_layers_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            std::string layers_str = argv[index];
-            if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') {
-                return -1;
-            }
-
-            layers_str = layers_str.substr(1, layers_str.size() - 2);
-
-            std::regex regex("[, ]+");
-            std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1);
-            std::sregex_token_iterator end;
-            std::vector<std::string> tokens(iter, end);
-            std::vector<int> layers;
-            for (const auto& token : tokens) {
-                try {
-                    layers.push_back(std::stoi(token));
-                } catch (const std::invalid_argument&) {
-                    return -1;
-                }
-            }
-            high_noise_skip_layers = layers;
-            return 1;
-        };
-
-        auto on_ref_image_arg = [&](int argc, const char** argv, int index) {
-            if (++index >= argc) {
-                return -1;
-            }
-            ref_image_paths.push_back(argv[index]);
-            return 1;
-        };
-
-        auto on_easycache_arg = [&](int argc, const char** argv, int index) {
-            const std::string default_values = "0.2,0.15,0.95";
-            auto looks_like_value            = [](const std::string& token) {
-                if (token.empty()) {
-                    return false;
-                }
-                if (token[0] != '-') {
-                    return true;
-                }
-                if (token.size() == 1) {
-                    return false;
-                }
-                unsigned char next = static_cast<unsigned char>(token[1]);
-                return std::isdigit(next) || token[1] == '.';
-            };
-
-            std::string option_value;
-            int consumed = 0;
-            if (index + 1 < argc) {
-                std::string next_arg = argv[index + 1];
-                if (looks_like_value(next_arg)) {
-                    option_value = argv_to_utf8(index + 1, argv);
-                    consumed     = 1;
-                }
-            }
-            if (option_value.empty()) {
-                option_value = default_values;
-            }
-            easycache_option = option_value;
-            return consumed;
-        };
-
-        options.manual_options = {
-            {"-s",
-             "--seed",
-             "RNG seed (default: 42, use random seed for < 0)",
-             on_seed_arg},
-            {"",
-             "--sampling-method",
-             "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] "
-             "(default: euler for Flux/SD3/Wan, euler_a otherwise)",
-             on_sample_method_arg},
-            {"",
-             "--high-noise-sampling-method",
-             "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]"
-             " default: euler for Flux/SD3/Wan, euler_a otherwise",
-             on_high_noise_sample_method_arg},
-            {"",
-             "--scheduler",
-             "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete",
-             on_scheduler_arg},
-            {"",
-             "--skip-layers",
-             "layers to skip for SLG steps (default: [7,8,9])",
-             on_skip_layers_arg},
-            {"",
-             "--high-noise-skip-layers",
-             "(high noise) layers to skip for SLG steps (default: [7,8,9])",
-             on_high_noise_skip_layers_arg},
-            {"-r",
-             "--ref-image",
-             "reference image for Flux Kontext models (can be used multiple times)",
-             on_ref_image_arg},
-            {"",
-             "--easycache",
-             "enable EasyCache for DiT models with optional \"threshold,start_percent,end_percent\" (default: 0.2,0.15,0.95)",
-             on_easycache_arg},
-
-        };
-
-        return options;
-    }
-
-    bool process_and_check(SDMode mode) {
-        if (width <= 0) {
-            fprintf(stderr, "error: the width must be greater than 0\n");
-            return false;
-        }
-
-        if (height <= 0) {
-            fprintf(stderr, "error: the height must be greater than 0\n");
-            return false;
-        }
-
-        if (sample_params.sample_steps <= 0) {
-            fprintf(stderr, "error: the sample_steps must be greater than 0\n");
-            return false;
-        }
-
-        if (high_noise_sample_params.sample_steps <= 0) {
-            high_noise_sample_params.sample_steps = -1;
-        }
-
-        if (strength < 0.f || strength > 1.f) {
-            fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
-            return false;
-        }
-
-        if (!easycache_option.empty()) {
-            float values[3] = {0.0f, 0.0f, 0.0f};
-            std::stringstream ss(easycache_option);
-            std::string token;
-            int idx = 0;
-            while (std::getline(ss, token, ',')) {
-                auto trim = [](std::string& s) {
-                    const char* whitespace = " \t\r\n";
-                    auto start             = s.find_first_not_of(whitespace);
-                    if (start == std::string::npos) {
-                        s.clear();
-                        return;
-                    }
-                    auto end = s.find_last_not_of(whitespace);
-                    s        = s.substr(start, end - start + 1);
-                };
-                trim(token);
-                if (token.empty()) {
-                    fprintf(stderr, "error: invalid easycache option '%s'\n", easycache_option.c_str());
-                    return false;
-                }
-                if (idx >= 3) {
-                    fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n");
-                    return false;
-                }
-                try {
-                    values[idx] = std::stof(token);
-                } catch (const std::exception&) {
-                    fprintf(stderr, "error: invalid easycache value '%s'\n", token.c_str());
-                    return false;
-                }
-                idx++;
-            }
-            if (idx != 3) {
-                fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n");
-                return false;
-            }
-            if (values[0] < 0.0f) {
-                fprintf(stderr, "error: easycache threshold must be non-negative\n");
-                return false;
-            }
-            if (values[1] < 0.0f || values[1] >= 1.0f || values[2] <= 0.0f || values[2] > 1.0f || values[1] >= values[2]) {
-                fprintf(stderr, "error: easycache start/end percents must satisfy 0.0 <= start < end <= 1.0\n");
-                return false;
-            }
-            easycache_params.enabled         = true;
-            easycache_params.reuse_threshold = values[0];
-            easycache_params.start_percent   = values[1];
-            easycache_params.end_percent     = values[2];
-        } else {
-            easycache_params.enabled = false;
-        }
-
-        sample_params.guidance.slg.layers                 = skip_layers.data();
-        sample_params.guidance.slg.layer_count            = skip_layers.size();
-        high_noise_sample_params.guidance.slg.layers      = high_noise_skip_layers.data();
-        high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
-
-        if (mode == VID_GEN && video_frames <= 0) {
-            return false;
-        }
-
-        if (mode == VID_GEN && fps <= 0) {
-            return false;
-        }
-
-        if (sample_params.shifted_timestep < 0 || sample_params.shifted_timestep > 1000) {
-            return false;
-        }
-
-        if (upscale_repeats < 1) {
-            return false;
-        }
-
-        if (mode == UPSCALE) {
-            if (init_image_path.length() == 0) {
-                fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n");
-                return false;
-            }
-        }
-
-        if (seed < 0) {
-            srand((int)time(nullptr));
-            seed = rand();
-        }
-
-        return true;
-    }
-
-    std::string to_string() const {
-        char* sample_params_str            = sd_sample_params_to_str(&sample_params);
-        char* high_noise_sample_params_str = sd_sample_params_to_str(&high_noise_sample_params);
-        std::ostringstream oss;
-        oss << "SDGenerationParams {\n"
-            << "  prompt: \"" << prompt << "\",\n"
-            << "  negative_prompt: \"" << negative_prompt << "\",\n"
-            << "  clip_skip: " << clip_skip << ",\n"
-            << "  width: " << width << ",\n"
-            << "  height: " << height << ",\n"
-            << "  batch_count: " << batch_count << ",\n"
-            << "  init_image_path: \"" << init_image_path << "\",\n"
-            << "  end_image_path: \"" << end_image_path << "\",\n"
-            << "  mask_image_path: \"" << mask_image_path << "\",\n"
-            << "  control_image_path: \"" << control_image_path << "\",\n"
-            << "  ref_image_paths: " << vec_str_to_string(ref_image_paths) << ",\n"
-            << "  control_video_path: \"" << control_video_path << "\",\n"
-            << "  auto_resize_ref_image: " << (auto_resize_ref_image ? "true" : "false") << ",\n"
-            << "  increase_ref_index: " << (increase_ref_index ? "true" : "false") << ",\n"
-            << "  pm_id_images_dir: \"" << pm_id_images_dir << "\",\n"
-            << "  pm_id_embed_path: \"" << pm_id_embed_path << "\",\n"
-            << "  pm_style_strength: " << pm_style_strength << ",\n"
-            << "  skip_layers: " << vec_to_string(skip_layers) << ",\n"
-            << "  sample_params: " << sample_params_str << ",\n"
-            << "  high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n"
-            << "  high_noise_sample_params: " << high_noise_sample_params_str << ",\n"
-            << "  easycache_option: \"" << easycache_option << "\",\n"
-            << "  easycache: "
-            << (easycache_params.enabled ? "enabled" : "disabled")
-            << " (threshold=" << easycache_params.reuse_threshold
-            << ", start=" << easycache_params.start_percent
-            << ", end=" << easycache_params.end_percent << "),\n"
-            << "  moe_boundary: " << moe_boundary << ",\n"
-            << "  video_frames: " << video_frames << ",\n"
-            << "  fps: " << fps << ",\n"
-            << "  vace_strength: " << vace_strength << ",\n"
-            << "  strength: " << strength << ",\n"
-            << "  control_strength: " << control_strength << ",\n"
-            << "  seed: " << seed << ",\n"
-            << "  upscale_repeats: " << upscale_repeats << ",\n"
-            << "}";
-        free(sample_params_str);
-        free(high_noise_sample_params_str);
-        return oss.str();
-    }
-};
-
 void print_usage(int argc, const char* argv[], const std::vector<ArgOptions>& options_list) {
+    std::cout << version_string() << "\n";
     std::cout << "Usage: " << argv[0] << " [options]\n\n";
     std::cout << "CLI Options:\n";
     options_list[0].print();
@@ -1557,7 +211,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP
         exit(cli_params.normal_exit ? 0 : 1);
     }
 
-    if (!cli_params.process_and_check() || !ctx_params.process_and_check(cli_params.mode) || !gen_params.process_and_check(cli_params.mode)) {
+    if (!cli_params.process_and_check() ||
+        !ctx_params.process_and_check(cli_params.mode) ||
+        !gen_params.process_and_check(cli_params.mode, ctx_params.lora_model_dir)) {
         print_usage(argc, argv, options_vec);
         exit(1);
     }
@@ -1576,7 +232,7 @@ static std::string sd_basename(const std::string& path) {
 }
 
 std::string get_image_params(const SDCliParams& cli_params, const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed) {
-    std::string parameter_string = gen_params.prompt + "\n";
+    std::string parameter_string = gen_params.prompt_with_lora + "\n";
     if (gen_params.negative_prompt.size() != 0) {
         parameter_string += "Negative prompt: " + gen_params.negative_prompt + "\n";
     }
@@ -1602,7 +258,15 @@ std::string get_image_params(const SDCliParams& cli_params, const SDContextParam
         parameter_string += "Sampler RNG: " + std::string(sd_rng_type_name(ctx_params.sampler_rng_type)) + ", ";
     }
     parameter_string += "Sampler: " + std::string(sd_sample_method_name(gen_params.sample_params.sample_method));
-    if (gen_params.sample_params.scheduler != SCHEDULER_COUNT) {
+    if (!gen_params.custom_sigmas.empty()) {
+        parameter_string += ", Custom Sigmas: [";
+        for (size_t i = 0; i < gen_params.custom_sigmas.size(); ++i) {
+            std::ostringstream oss;
+            oss << std::fixed << std::setprecision(4) << gen_params.custom_sigmas[i];
+            parameter_string += oss.str() + (i == gen_params.custom_sigmas.size() - 1 ? "" : ", ");
+        }
+        parameter_string += "]";
+    } else if (gen_params.sample_params.scheduler != SCHEDULER_COUNT) {  // Only show schedule if not using custom sigmas
         parameter_string += " " + std::string(sd_scheduler_name(gen_params.sample_params.scheduler));
     }
     parameter_string += ", ";
@@ -1667,94 +331,6 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
     fflush(out_stream);
 }
 
-uint8_t* load_image(const char* image_path, int& width, int& height, int expected_width = 0, int expected_height = 0, int expected_channel = 3) {
-    int c                 = 0;
-    uint8_t* image_buffer = (uint8_t*)stbi_load(image_path, &width, &height, &c, expected_channel);
-    if (image_buffer == nullptr) {
-        fprintf(stderr, "load image from '%s' failed\n", image_path);
-        return nullptr;
-    }
-    if (c < expected_channel) {
-        fprintf(stderr,
-                "the number of channels for the input image must be >= %d,"
-                "but got %d channels, image_path = %s\n",
-                expected_channel,
-                c,
-                image_path);
-        free(image_buffer);
-        return nullptr;
-    }
-    if (width <= 0) {
-        fprintf(stderr, "error: the width of image must be greater than 0, image_path = %s\n", image_path);
-        free(image_buffer);
-        return nullptr;
-    }
-    if (height <= 0) {
-        fprintf(stderr, "error: the height of image must be greater than 0, image_path = %s\n", image_path);
-        free(image_buffer);
-        return nullptr;
-    }
-
-    // Resize input image ...
-    if ((expected_width > 0 && expected_height > 0) && (height != expected_height || width != expected_width)) {
-        float dst_aspect = (float)expected_width / (float)expected_height;
-        float src_aspect = (float)width / (float)height;
-
-        int crop_x = 0, crop_y = 0;
-        int crop_w = width, crop_h = height;
-
-        if (src_aspect > dst_aspect) {
-            crop_w = (int)(height * dst_aspect);
-            crop_x = (width - crop_w) / 2;
-        } else if (src_aspect < dst_aspect) {
-            crop_h = (int)(width / dst_aspect);
-            crop_y = (height - crop_h) / 2;
-        }
-
-        if (crop_x != 0 || crop_y != 0) {
-            printf("crop input image from %dx%d to %dx%d, image_path = %s\n", width, height, crop_w, crop_h, image_path);
-            uint8_t* cropped_image_buffer = (uint8_t*)malloc(crop_w * crop_h * expected_channel);
-            if (cropped_image_buffer == nullptr) {
-                fprintf(stderr, "error: allocate memory for crop\n");
-                free(image_buffer);
-                return nullptr;
-            }
-            for (int row = 0; row < crop_h; row++) {
-                uint8_t* src = image_buffer + ((crop_y + row) * width + crop_x) * expected_channel;
-                uint8_t* dst = cropped_image_buffer + (row * crop_w) * expected_channel;
-                memcpy(dst, src, crop_w * expected_channel);
-            }
-
-            width  = crop_w;
-            height = crop_h;
-            free(image_buffer);
-            image_buffer = cropped_image_buffer;
-        }
-
-        printf("resize input image from %dx%d to %dx%d\n", width, height, expected_width, expected_height);
-        int resized_height = expected_height;
-        int resized_width  = expected_width;
-
-        uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * expected_channel);
-        if (resized_image_buffer == nullptr) {
-            fprintf(stderr, "error: allocate memory for resize input image\n");
-            free(image_buffer);
-            return nullptr;
-        }
-        stbir_resize(image_buffer, width, height, 0,
-                     resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8,
-                     expected_channel, STBIR_ALPHA_CHANNEL_NONE, 0,
-                     STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
-                     STBIR_FILTER_BOX, STBIR_FILTER_BOX,
-                     STBIR_COLORSPACE_SRGB, nullptr);
-        width  = resized_width;
-        height = resized_height;
-        free(image_buffer);
-        image_buffer = resized_image_buffer;
-    }
-    return image_buffer;
-}
-
 bool load_images_from_dir(const std::string dir,
                           std::vector<sd_image_t>& images,
                           int expected_width  = 0,
@@ -1789,7 +365,7 @@ bool load_images_from_dir(const std::string dir,
             }
             int width             = 0;
             int height            = 0;
-            uint8_t* image_buffer = load_image(path.c_str(), width, height, expected_width, expected_height);
+            uint8_t* image_buffer = load_image_from_file(path.c_str(), width, height, expected_width, expected_height);
             if (image_buffer == nullptr) {
                 fprintf(stderr, "load image from '%s' failed\n", path.c_str());
                 return false;
@@ -1822,11 +398,19 @@ void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy,
 }
 
 int main(int argc, const char* argv[]) {
+    if (argc > 1 && std::string(argv[1]) == "--version") {
+        std::cout << version_string() << "\n";
+        return EXIT_SUCCESS;
+    }
+
     SDCliParams cli_params;
     SDContextParams ctx_params;
     SDGenerationParams gen_params;
 
     parse_args(argc, argv, cli_params, ctx_params, gen_params);
+    if (cli_params.verbose || cli_params.version) {
+        std::cout << version_string() << "\n";
+    }
     if (gen_params.video_frames > 4) {
         size_t last_dot_pos   = cli_params.preview_path.find_last_of(".");
         std::string base_path = cli_params.preview_path;
@@ -1917,7 +501,7 @@ int main(int argc, const char* argv[]) {
 
         int width       = 0;
         int height      = 0;
-        init_image.data = load_image(gen_params.init_image_path.c_str(), width, height, gen_params.width, gen_params.height);
+        init_image.data = load_image_from_file(gen_params.init_image_path.c_str(), width, height, gen_params.width, gen_params.height);
         if (init_image.data == nullptr) {
             fprintf(stderr, "load image from '%s' failed\n", gen_params.init_image_path.c_str());
             release_all_resources();
@@ -1930,7 +514,7 @@ int main(int argc, const char* argv[]) {
 
         int width      = 0;
         int height     = 0;
-        end_image.data = load_image(gen_params.end_image_path.c_str(), width, height, gen_params.width, gen_params.height);
+        end_image.data = load_image_from_file(gen_params.end_image_path.c_str(), width, height, gen_params.width, gen_params.height);
         if (end_image.data == nullptr) {
             fprintf(stderr, "load image from '%s' failed\n", gen_params.end_image_path.c_str());
             release_all_resources();
@@ -1942,7 +526,7 @@ int main(int argc, const char* argv[]) {
         int c           = 0;
         int width       = 0;
         int height      = 0;
-        mask_image.data = load_image(gen_params.mask_image_path.c_str(), width, height, gen_params.width, gen_params.height, 1);
+        mask_image.data = load_image_from_file(gen_params.mask_image_path.c_str(), width, height, gen_params.width, gen_params.height, 1);
         if (mask_image.data == nullptr) {
             fprintf(stderr, "load image from '%s' failed\n", gen_params.mask_image_path.c_str());
             release_all_resources();
@@ -1961,7 +545,7 @@ int main(int argc, const char* argv[]) {
     if (gen_params.control_image_path.size() > 0) {
         int width          = 0;
         int height         = 0;
-        control_image.data = load_image(gen_params.control_image_path.c_str(), width, height, gen_params.width, gen_params.height);
+        control_image.data = load_image_from_file(gen_params.control_image_path.c_str(), width, height, gen_params.width, gen_params.height);
         if (control_image.data == nullptr) {
             fprintf(stderr, "load image from '%s' failed\n", gen_params.control_image_path.c_str());
             release_all_resources();
@@ -1982,7 +566,7 @@ int main(int argc, const char* argv[]) {
         for (auto& path : gen_params.ref_image_paths) {
             int width             = 0;
             int height            = 0;
-            uint8_t* image_buffer = load_image(path.c_str(), width, height);
+            uint8_t* image_buffer = load_image_from_file(path.c_str(), width, height);
             if (image_buffer == nullptr) {
                 fprintf(stderr, "load image from '%s' failed\n", path.c_str());
                 release_all_resources();
@@ -2062,6 +646,8 @@ int main(int argc, const char* argv[]) {
 
         if (cli_params.mode == IMG_GEN) {
             sd_img_gen_params_t img_gen_params = {
+                gen_params.lora_vec.data(),
+                static_cast<uint32_t>(gen_params.lora_vec.size()),
                 gen_params.prompt.c_str(),
                 gen_params.negative_prompt.c_str(),
                 gen_params.clip_skip,
@@ -2093,6 +679,8 @@ int main(int argc, const char* argv[]) {
             num_results = gen_params.batch_count;
         } else if (cli_params.mode == VID_GEN) {
             sd_vid_gen_params_t vid_gen_params = {
+                gen_params.lora_vec.data(),
+                static_cast<uint32_t>(gen_params.lora_vec.size()),
                 gen_params.prompt.c_str(),
                 gen_params.negative_prompt.c_str(),
                 gen_params.clip_skip,
@@ -2129,7 +717,8 @@ int main(int argc, const char* argv[]) {
         upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(ctx_params.esrgan_path.c_str(),
                                                         ctx_params.offload_params_to_cpu,
                                                         ctx_params.diffusion_conv_direct,
-                                                        ctx_params.n_threads);
+                                                        ctx_params.n_threads,
+                                                        gen_params.upscale_tile_size);
 
         if (upscaler_ctx == nullptr) {
             printf("new_upscaler_ctx failed\n");
diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp
index 1a683e396..682a72c81 100644
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
@@ -11,7 +11,6 @@
 #include <thread>
 #include <unordered_map>
 #include <vector>
-#include <filesystem>
 
 #include "gguf_reader.hpp"
 #include "model.h"
@@ -317,12 +316,7 @@ bool is_zip_file(const std::string& file_path) {
 }
 
 bool is_gguf_file(const std::string& file_path) {
-    #ifdef _WIN32
-        std::filesystem::path fpath = std::filesystem::u8path(file_path);
-    #else
-        std::filesystem::path fpath = std::filesystem::path(file_path);
-    #endif
-    std::ifstream file(fpath, std::ios::binary);
+    std::ifstream file(sd_get_u8path(file_path), std::ios::binary);
     if (!file.is_open()) {
         return false;
     }
@@ -343,12 +337,7 @@ bool is_gguf_file(const std::string& file_path) {
 }
 
 bool is_safetensors_file(const std::string& file_path) {
-    #ifdef _WIN32
-        std::filesystem::path fpath = std::filesystem::u8path(file_path);
-    #else
-        std::filesystem::path fpath = std::filesystem::path(file_path);
-    #endif
-    std::ifstream file(fpath, std::ios::binary);
+    std::ifstream file(sd_get_u8path(file_path), std::ios::binary);
     if (!file.is_open()) {
         return false;
     }
@@ -531,12 +520,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
     LOG_DEBUG("init from '%s', prefix = '%s'", file_path.c_str(), prefix.c_str());
     file_paths_.push_back(file_path);
     size_t file_index = file_paths_.size() - 1;
-    #ifdef _WIN32
-        std::filesystem::path fpath = std::filesystem::u8path(file_path);
-    #else
-        std::filesystem::path fpath = std::filesystem::path(file_path);
-    #endif
-    std::ifstream file(fpath, std::ios::binary);
+    std::ifstream file(sd_get_u8path(file_path), std::ios::binary);
     if (!file.is_open()) {
         LOG_ERROR("failed to open '%s'", file_path.c_str());
         file_paths_.pop_back();
@@ -1101,6 +1085,9 @@ SDVersion ModelLoader::get_sd_version() {
             if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
                 return VERSION_FLUX2;
             }
+            if (tensor_storage.name.find("model.diffusion_model.double_blocks.0.img_mlp.gate_proj.weight") != std::string::npos) {
+                return VERSION_OVIS_IMAGE;
+            }
             if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
                 return VERSION_Z_IMAGE;
             }
@@ -1479,12 +1466,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                     }
                 } else {
                     // kcpp
-                    #ifdef _WIN32
-                        std::filesystem::path fpath = std::filesystem::u8path(file_path);
-                    #else
-                        std::filesystem::path fpath = std::filesystem::path(file_path);
-                    #endif
-                    file.open(fpath, std::ios::binary);
+                    file.open(sd_get_u8path(file_path), std::ios::binary);
                     if (!file.is_open()) {
                         LOG_ERROR("failed to open '%s'", file_path.c_str());
                         failed = true;
diff --git a/otherarch/sdcpp/model.h b/otherarch/sdcpp/model.h
index 4f82df8e3..598e4c6a5 100644
--- a/otherarch/sdcpp/model.h
+++ b/otherarch/sdcpp/model.h
@@ -45,6 +45,7 @@ enum SDVersion {
     VERSION_QWEN_IMAGE,
     VERSION_FLUX2,
     VERSION_Z_IMAGE,
+    VERSION_OVIS_IMAGE,
     VERSION_COUNT,
 };
 
@@ -90,6 +91,7 @@ static inline bool sd_version_is_flux(SDVersion version) {
         version == VERSION_FLUX_FILL ||
         version == VERSION_FLUX_CONTROLS ||
         version == VERSION_FLEX_2 ||
+        version == VERSION_OVIS_IMAGE ||
         version == VERSION_CHROMA_RADIANCE) {
         return true;
     }
diff --git a/otherarch/sdcpp/rope.hpp b/otherarch/sdcpp/rope.hpp
index 7a35926eb..4abc51469 100644
--- a/otherarch/sdcpp/rope.hpp
+++ b/otherarch/sdcpp/rope.hpp
@@ -72,11 +72,13 @@ namespace Rope {
     }
 
     // Generate IDs for image patches and text
-    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num) {
+    __STATIC_INLINE__ std::vector<std::vector<float>> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num, std::set<int> arange_dims) {
         auto txt_ids = std::vector<std::vector<float>>(bs * context_len, std::vector<float>(axes_dim_num, 0.0f));
-        if (axes_dim_num == 4) {
-            for (int i = 0; i < bs * context_len; i++) {
-                txt_ids[i][3] = (i % context_len);
+        for (int dim = 0; dim < axes_dim_num; dim++) {
+            if (arange_dims.find(dim) != arange_dims.end()) {
+                for (int i = 0; i < bs * context_len; i++) {
+                    txt_ids[i][dim] = (i % context_len);
+                }
             }
         }
         return txt_ids;
@@ -211,10 +213,11 @@ namespace Rope {
                                                                    int bs,
                                                                    int axes_dim_num,
                                                                    int context_len,
+                                                                   std::set<int> txt_arange_dims,
                                                                    const std::vector<ggml_tensor*>& ref_latents,
                                                                    bool increase_ref_index,
                                                                    float ref_index_scale) {
-        auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num);
+        auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims);
         auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num);
 
         auto ids = concat_ids(txt_ids, img_ids, bs);
@@ -231,6 +234,7 @@ namespace Rope {
                                                      int patch_size,
                                                      int bs,
                                                      int context_len,
+                                                     std::set<int> txt_arange_dims,
                                                      const std::vector<ggml_tensor*>& ref_latents,
                                                      bool increase_ref_index,
                                                      float ref_index_scale,
@@ -242,6 +246,7 @@ namespace Rope {
                                                            bs,
                                                            static_cast<int>(axes_dim.size()),
                                                            context_len,
+                                                           txt_arange_dims,
                                                            ref_latents,
                                                            increase_ref_index,
                                                            ref_index_scale);
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
index 460d0029a..0490bde5d 100644
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -85,6 +85,10 @@ struct SDParams {
     bool vae_conv_direct          = false;
 
     bool chroma_use_dit_mask     = true;
+
+    std::string lora_path;     // owns the string that lora_spec.path points into
+    sd_lora_t lora_spec = {};  // single LoRA descriptor passed to apply_loras()
+    uint32_t lora_count = 0;   // stays 0 unless a LoRA is applied at load time
 };
 
 //shared
@@ -320,6 +324,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     sd_params->clip_l_path = clip1_filename;
     sd_params->clip_g_path = clip2_filename;
     sd_params->stacked_id_embeddings_path = photomaker_filename;
+    sd_params->lora_path = lorafilename;
     //if t5 is set, and model is a gguf, load it as a diffusion model path
     bool endswithgguf = (sd_params->model_path.rfind(".gguf") == sd_params->model_path.size() - 5);
     if((sd_params->t5xxl_path!="" || sd_params->clip_l_path!="" || sd_params->clip_g_path!="") && endswithgguf)
@@ -414,10 +419,15 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     std::filesystem::path mpath(inputs.model_filename);
     sdmodelfilename = mpath.filename().string();
 
-    if(lorafilename!="" && inputs.lora_multiplier>0)
+    sd_params->lora_spec = {};
+    sd_params->lora_spec.path = sd_params->lora_path.c_str();
+    sd_params->lora_spec.multiplier = inputs.lora_multiplier;
+    sd_params->lora_count = 0;  // sdtype_generate reads this unconditionally, so it must be defined even without a LoRA
+    if(sd_params->lora_path!="" && sd_params->lora_spec.multiplier>0)
     {
         printf("\nApply LoRA...\n");
-        sd_ctx->sd->apply_lora_from_file(lorafilename,inputs.lora_multiplier);
+        sd_params->lora_count = 1;
+        sd_ctx->sd->apply_loras(&sd_params->lora_spec, sd_params->lora_count);
     }
 
     input_extraimage_buffers.reserve(max_extra_images);
@@ -977,6 +987,11 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
     params.vae_tiling_params.enabled = dotile;
     params.batch_count = 1;
 
+    // needs to be "reapplied" because sdcpp tracks previously applied LoRAs
+    // and weights, and applies/unapplies the differences at each gen
+    params.loras = &sd_params->lora_spec;
+    params.lora_count = sd_params->lora_count;
+
     params.ref_images = reference_imgs.data();
     params.ref_images_count = reference_imgs.size();
     params.pm_params.id_images = photomaker_imgs.data();
diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp
index 10ce0c8b7..c802eace2 100644
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -48,6 +48,7 @@ const char* model_version_to_str[] = {
     "Qwen Image",
     "Flux.2",
     "Z-Image",
+    "Ovis Image",
 };
 
 const char* sampling_methods_str[] = {
@@ -548,6 +549,13 @@ public:
                                                                         tensor_storage_map,
                                                                         sd_ctx_params->chroma_use_t5_mask,
                                                                         sd_ctx_params->chroma_t5_mask_pad);
+                } else if (version == VERSION_OVIS_IMAGE) {
+                    cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
+                                                                     offload_params_to_cpu,
+                                                                     tensor_storage_map,
+                                                                     version,
+                                                                     "",
+                                                                     false);
                 } else {
                     cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend,
                                                                           offload_params_to_cpu,
@@ -624,18 +632,22 @@ public:
                                                                 "model.diffusion_model",
                                                                 version);
             } else {  // SD1.x SD2.x SDXL
+                std::map<std::string, std::string> embbeding_map;  // embedding name -> embedding path
+                for (uint32_t i = 0; sd_ctx_params->embeddings != nullptr && i < sd_ctx_params->embedding_count; i++) {  // index type matches embedding_count
+                    embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path));
+                }
                 if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
                     cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
                                                                                            offload_params_to_cpu,
                                                                                            tensor_storage_map,
-                                                                                           SAFE_STR(sd_ctx_params->embedding_dir),
+                                                                                           embbeding_map,
                                                                                            version,
                                                                                            PM_VERSION_2);
                 } else {
                     cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
                                                                                            offload_params_to_cpu,
                                                                                            tensor_storage_map,
-                                                                                           SAFE_STR(sd_ctx_params->embedding_dir),
+                                                                                           embbeding_map,
                                                                                            version);
                 }
                 diffusion_model = std::make_shared<UNetModel>(backend,
@@ -818,6 +830,11 @@ public:
             ignore_tensors.insert("first_stage_model.quant");
             ignore_tensors.insert("text_encoders.llm.visual.");
         }
+        if (version == VERSION_OVIS_IMAGE) {
+            ignore_tensors.insert("text_encoders.llm.vision_model.");
+            ignore_tensors.insert("text_encoders.llm.visual_tokenizer.");
+            ignore_tensors.insert("text_encoders.llm.vte.");
+        }
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
         }
@@ -1044,71 +1061,21 @@ public:
         return result < -1;
     }
 
-    // kcpp
-    void apply_lora_from_file(const std::string& lora_path, float multiplier) {
-        std::unordered_map<std::string, float> lora_f2m;  // lora_name -> multiplier
-
-        lora_f2m[lora_path] = multiplier;
-
-        LOG_DEBUG("lora %s:%.2f", lora_path.c_str(), multiplier);
-
-        int64_t t0 = ggml_time_ms();
-        if (apply_lora_immediately) {
-            LOG_INFO("apply lora immediately");
-            apply_loras_immediately(lora_f2m);
-        } else {
-            LOG_INFO("apply at runtime");
-            apply_loras_at_runtime(lora_f2m);
-        }
-        int64_t t1 = ggml_time_ms();
-
-        LOG_INFO("lora '%s' applied, taking %.2fs",
-                 lora_path.c_str(),
-                 (t1 - t0) * 1.0f / 1000);
-    }
-
     std::shared_ptr<LoraModel> load_lora_model_from_file(const std::string& lora_id,
                                                          float multiplier,
                                                          ggml_backend_t backend,
                                                          LoraModel::filter_t lora_tensor_filter = nullptr) {
-        // kcpp: LoRA is passed as a path
-        #if 1
-        std::string file_path = lora_id;
-        #ifdef _WIN32
-            std::string lora_ident = std::filesystem::u8path(file_path).stem().u8string();
-        #else
-            std::string lora_ident = std::filesystem::path(file_path).stem().string();
-        #endif
-
-        if (!file_exists(file_path)) {
-            LOG_WARN("can not find lora file %s", file_path.c_str());
-            return nullptr;
-        }
-        auto lora = std::make_shared<LoraModel>(lora_ident, backend, file_path, "", version);
-        #else
-        std::string lora_name      = lora_id;
-        std::string high_noise_tag = "|high_noise|";
-        bool is_high_noise         = false;
-        if (starts_with(lora_name, high_noise_tag)) {
-            lora_name     = lora_name.substr(high_noise_tag.size());
+        std::string lora_path             = lora_id;
+        static std::string high_noise_tag = "|high_noise|";
+        bool is_high_noise                = false;
+        if (starts_with(lora_path, high_noise_tag)) {
+            lora_path     = lora_path.substr(high_noise_tag.size());
             is_high_noise = true;
-            LOG_DEBUG("high noise lora: %s", lora_name.c_str());
+            LOG_DEBUG("high noise lora: %s", lora_path.c_str());
         }
-        std::string st_file_path   = path_join(lora_model_dir, lora_name + ".safetensors");
-        std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
-        std::string file_path;
-        if (file_exists(st_file_path)) {
-            file_path = st_file_path;
-        } else if (file_exists(ckpt_file_path)) {
-            file_path = ckpt_file_path;
-        } else {
-            LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
-            return nullptr;
-        }
-        auto lora = std::make_shared<LoraModel>(lora_id, backend, file_path, is_high_noise ? "model.high_noise_" : "", version);
-        #endif
+        auto lora = std::make_shared<LoraModel>(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version);
         if (!lora->load_from_file(n_threads, lora_tensor_filter)) {
-            LOG_WARN("load lora tensors from %s failed", file_path.c_str());
+            LOG_WARN("load lora tensors from %s failed", lora_path.c_str());
             return nullptr;
         }
 
@@ -1293,19 +1260,16 @@ public:
         }
     }
 
-    std::string apply_loras_from_prompt(const std::string& prompt) {
-        auto result_pair                                = extract_and_remove_lora(prompt);
-        std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
-
-        for (auto& kv : lora_f2m) {
-            LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
+    void apply_loras(const sd_lora_t* loras, uint32_t lora_count) {
+        std::unordered_map<std::string, float> lora_f2m;  // lora id -> multiplier
+        for (uint32_t i = 0; loras != nullptr && i < lora_count; i++) {  // unsigned index matches lora_count; a null array is treated as empty
+            std::string lora_id = SAFE_STR(loras[i].path);
+            if (loras[i].is_high_noise) {
+                lora_id = "|high_noise|" + lora_id;  // tag is recognised and stripped by load_lora_model_from_file
+            }
+            lora_f2m[lora_id] = loras[i].multiplier;
+            LOG_DEBUG("lora %s:%.2f", lora_id.c_str(), loras[i].multiplier);
+        }
-        #if 1 // kcpp
-        //only use hardcoded lora
-        if (!lora_f2m.empty()) {
-            printf("\nWarning: not applying LoRAs requested by prompt!\n");
-        }
-        #else
         int64_t t0 = ggml_time_ms();
         if (apply_lora_immediately) {
             apply_loras_immediately(lora_f2m);
@@ -1315,10 +1279,7 @@ public:
         int64_t t1 = ggml_time_ms();
         if (!lora_f2m.empty()) {
             LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
-            LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str());
         }
-        #endif
-        return result_pair.second;
     }
 
     ggml_tensor* id_encoder(ggml_context* work_ctx,
@@ -1483,10 +1444,17 @@ public:
         uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];
 
         if (preview_mode == PREVIEW_PROJ) {
+            int64_t patch_sz                       = 1;
             const float(*latent_rgb_proj)[channel] = nullptr;
             float* latent_rgb_bias                 = nullptr;
 
-            if (dim == 48) {
+            if (dim == 128) {
+                if (sd_version_is_flux2(version)) {
+                    latent_rgb_proj = flux2_latent_rgb_proj;
+                    latent_rgb_bias = flux2_latent_rgb_bias;
+                    patch_sz        = 2;
+                }
+            } else if (dim == 48) {
                 if (sd_version_is_wan(version)) {
                     latent_rgb_proj = wan_22_latent_rgb_proj;
                     latent_rgb_bias = wan_22_latent_rgb_bias;
@@ -1539,12 +1507,15 @@ public:
                 frames = latents->ne[2];
             }
 
-            uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t));
+            uint32_t img_width  = width * patch_sz;
+            uint32_t img_height = height * patch_sz;
 
-            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
+            uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t));
+
+            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz);
             sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
             for (int i = 0; i < frames; i++) {
-                images[i] = {width, height, channel, data + i * width * height * channel};
+                images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel};
             }
             step_callback(step, frames, images, is_noisy, step_callback_data);
             free(data);
@@ -2055,6 +2026,18 @@ public:
         return vae_scale_factor;
     }
 
+    // Factor multiplied with the VAE scale factor to get the required image
+    // size multiple: 8 for UNet, 2 for Wan DiT models, 1 for other DiT models.
+    int get_diffusion_model_down_factor() {
+        if (!sd_version_is_dit(version)) {
+            return 8;  // unet
+        }
+        if (sd_version_is_wan(version)) {
+            return 2;
+        }
+        return 1;
+    }
+
     int get_latent_channel() {
         int latent_channel = 4;
         if (sd_version_is_dit(version)) {
@@ -2682,7 +2665,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "taesd_path: %s\n"
              "control_net_path: %s\n"
              "lora_model_dir: %s\n"
-             "embedding_dir: %s\n"
              "photo_maker_path: %s\n"
              "tensor_type_rules: %s\n"
              "vae_decode_only: %s\n"
@@ -2713,7 +2695,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              SAFE_STR(sd_ctx_params->taesd_path),
              SAFE_STR(sd_ctx_params->control_net_path),
              SAFE_STR(sd_ctx_params->lora_model_dir),
-             SAFE_STR(sd_ctx_params->embedding_dir),
              SAFE_STR(sd_ctx_params->photo_maker_path),
              SAFE_STR(sd_ctx_params->tensor_type_rules),
              BOOL_STR(sd_ctx_params->vae_decode_only),
@@ -2747,6 +2728,8 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) {
     sample_params->scheduler                   = SCHEDULER_COUNT;
     sample_params->sample_method               = SAMPLE_METHOD_COUNT;
     sample_params->sample_steps                = 20;
+    sample_params->custom_sigmas               = nullptr;
+    sample_params->custom_sigmas_count         = 0;
 }
 
 char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
@@ -2964,8 +2947,6 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
     int sample_steps = sigmas.size() - 1;
 
     int64_t t0 = ggml_time_ms();
-    // Apply lora
-    prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
 
     // Photo Maker
     std::string prompt_text_only;
@@ -3294,22 +3275,19 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params;
     int width                     = sd_img_gen_params->width;
     int height                    = sd_img_gen_params->height;
-    int vae_scale_factor          = sd_ctx->sd->get_vae_scale_factor();
-    if (sd_version_is_dit(sd_ctx->sd->version)) {
-        if (width % 16 || height % 16) {
-            LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)",
-                      model_version_to_str[sd_ctx->sd->version],
-                      width,
-                      height);
-            return nullptr;
-        }
-    } else if (width % 64 || height % 64) {
-        LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)",
-                  model_version_to_str[sd_ctx->sd->version],
-                  width,
-                  height);
-        return nullptr;
+
+    int vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
+    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
+    int spatial_multiple            = vae_scale_factor * diffusion_model_down_factor;
+
+    int width_offset  = align_up_offset(width, spatial_multiple);
+    int height_offset = align_up_offset(height, spatial_multiple);
+    if (width_offset > 0 || height_offset > 0) {
+        width += width_offset;
+        height += height_offset;
+        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple);
     }
+
     LOG_DEBUG("generate_image %dx%d", width, height);
     if (sd_ctx == nullptr || sd_img_gen_params == nullptr) {
         return nullptr;
@@ -3337,17 +3315,30 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
 
     size_t t0 = ggml_time_ms();
 
+    // Apply lora
+    sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);
+
     enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method;
     if (sample_method == SAMPLE_METHOD_COUNT) {
         sample_method = sd_get_default_sample_method(sd_ctx);
     }
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
 
-    int sample_steps          = sd_img_gen_params->sample_params.sample_steps;
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps,
-                                                                 sd_ctx->sd->get_image_seq_len(height, width),
-                                                                 sd_img_gen_params->sample_params.scheduler,
-                                                                 sd_ctx->sd->version);
+    int sample_steps = sd_img_gen_params->sample_params.sample_steps;
+    std::vector<float> sigmas;
+    if (sd_img_gen_params->sample_params.custom_sigmas != nullptr && sd_img_gen_params->sample_params.custom_sigmas_count > 0) {  // a null schedule falls back to the scheduler
+        sigmas = std::vector<float>(sd_img_gen_params->sample_params.custom_sigmas,
+                                    sd_img_gen_params->sample_params.custom_sigmas + sd_img_gen_params->sample_params.custom_sigmas_count);
+        if (sample_steps != static_cast<int>(sigmas.size()) - 1) {  // compare as int: avoids signed/unsigned mixing
+            sample_steps = static_cast<int>(sigmas.size()) - 1;
+            LOG_WARN("sample_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps);
+        }
+    } else {
+        sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps,
+                                                  sd_ctx->sd->get_image_seq_len(height, width),
+                                                  sd_img_gen_params->sample_params.scheduler,
+                                                  sd_ctx->sd->version);
+    }
 
     ggml_tensor* init_latent   = nullptr;
     ggml_tensor* concat_latent = nullptr;
@@ -3580,9 +3571,19 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     int frames       = sd_vid_gen_params->video_frames;
     frames           = (frames - 1) / 4 * 4 + 1;
     int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
-    LOG_INFO("generate_video %dx%dx%d", width, height, frames);
 
-    int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor();
+    int vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
+    int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor();
+    int spatial_multiple            = vae_scale_factor * diffusion_model_down_factor;
+
+    int width_offset  = align_up_offset(width, spatial_multiple);
+    int height_offset = align_up_offset(height, spatial_multiple);
+    if (width_offset > 0 || height_offset > 0) {
+        width += width_offset;
+        height += height_offset;
+        LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple);
+    }
+    LOG_INFO("generate_video %dx%dx%d", width, height, frames);
 
     enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
     if (sample_method == SAMPLE_METHOD_COUNT) {
@@ -3600,7 +3601,29 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     if (high_noise_sample_steps > 0) {
         total_steps += high_noise_sample_steps;
     }
-    std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, 0, sd_vid_gen_params->sample_params.scheduler, sd_ctx->sd->version);
+
+    std::vector<float> sigmas;
+    if (sd_vid_gen_params->sample_params.custom_sigmas != nullptr && sd_vid_gen_params->sample_params.custom_sigmas_count > 0) {  // a null schedule falls back to the scheduler
+        sigmas = std::vector<float>(sd_vid_gen_params->sample_params.custom_sigmas,
+                                    sd_vid_gen_params->sample_params.custom_sigmas + sd_vid_gen_params->sample_params.custom_sigmas_count);
+        if (total_steps != static_cast<int>(sigmas.size()) - 1) {  // compare as int: avoids signed/unsigned mixing
+            total_steps = static_cast<int>(sigmas.size()) - 1;
+            LOG_WARN("total_steps != custom_sigmas_count - 1, set total_steps to %d", total_steps);
+            if (sample_steps >= total_steps) {
+                sample_steps = total_steps;
+                LOG_WARN("total_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps);
+            }
+            if (high_noise_sample_steps > 0) {
+                high_noise_sample_steps = total_steps - sample_steps;
+                LOG_WARN("total_steps != custom_sigmas_count - 1, set high_noise_sample_steps to %d", high_noise_sample_steps);
+            }
+        }
+    } else {
+        sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps,
+                                                  0,
+                                                  sd_vid_gen_params->sample_params.scheduler,
+                                                  sd_ctx->sd->version);
+    }
 
     if (high_noise_sample_steps < 0) {
         // timesteps ∝ sigmas for Flow models (like wan2.2 a14b)
@@ -3636,7 +3659,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     int64_t t0 = ggml_time_ms();
 
     // Apply lora
-    prompt = sd_ctx->sd->apply_loras_from_prompt(prompt);
+    sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count);
 
     ggml_tensor* init_latent        = nullptr;
     ggml_tensor* clip_vision_output = nullptr;
diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h
index e34cdec17..e4abc8dcd 100644
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@@ -150,6 +150,11 @@ typedef struct {
     float rel_size_y;
 } sd_tiling_params_t;
 
+typedef struct {
+    const char* name;  // key of the entry in the embedding map handed to the CLIP embedder
+    const char* path;  // value stored for that key (presumably the embedding file location)
+} sd_embedding_t;
+
 typedef struct {
     const char* model_path;
     const char* clip_l_path;
@@ -164,7 +169,8 @@ typedef struct {
     const char* taesd_path;
     const char* control_net_path;
     const char* lora_model_dir;
-    const char* embedding_dir;
+    const sd_embedding_t* embeddings;
+    uint32_t embedding_count;
     const char* photo_maker_path;
     const char* tensor_type_rules;
     bool vae_decode_only;
@@ -219,6 +225,8 @@ typedef struct {
     int sample_steps;
     float eta;
     int shifted_timestep;
+    float* custom_sigmas;
+    int custom_sigmas_count;
 } sd_sample_params_t;
 
 typedef struct {
@@ -236,6 +244,14 @@ typedef struct {
 } sd_easycache_params_t;
 
 typedef struct {
+    bool is_high_noise;  // when set, apply_loras() tags the id with "|high_noise|"
+    float multiplier;    // weight recorded for this LoRA in apply_loras()
+    const char* path;    // LoRA file path; a null pointer is mapped through SAFE_STR
+} sd_lora_t;
+
+typedef struct {
+    const sd_lora_t* loras;
+    uint32_t lora_count;
     const char* prompt;
     const char* negative_prompt;
     int clip_skip;
@@ -259,6 +275,8 @@ typedef struct {
 } sd_img_gen_params_t;
 
 typedef struct {
+    const sd_lora_t* loras;
+    uint32_t lora_count;
     const char* prompt;
     const char* negative_prompt;
     int clip_skip;
@@ -331,7 +349,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                         bool offload_params_to_cpu,
                                         bool direct,
-                                        int n_threads);
+                                        int n_threads,
+                                        int tile_size);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
 
 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
@@ -353,6 +372,9 @@ SD_API bool preprocess_canny(sd_image_t image,
                              float strong,
                              bool inverse);
 
+SD_API const char* sd_commit(void);  // Returns the build's SDCPP_BUILD_COMMIT as a string literal ("unknown" when the macro is not defined); do not free.
+SD_API const char* sd_version(void);  // Returns the build's SDCPP_BUILD_VERSION as a string literal ("unknown" when the macro is not defined); do not free.
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/otherarch/sdcpp/upscaler.cpp b/otherarch/sdcpp/upscaler.cpp
index 62c0d29ad..29ac981e6 100644
--- a/otherarch/sdcpp/upscaler.cpp
+++ b/otherarch/sdcpp/upscaler.cpp
@@ -9,12 +9,15 @@ struct UpscalerGGML {
     std::shared_ptr<ESRGAN> esrgan_upscaler;
     std::string esrgan_path;
     int n_threads;
-    bool direct = false;
+    bool direct   = false;
+    int tile_size = 128;
 
     UpscalerGGML(int n_threads,
-                 bool direct = false)
+                 bool direct   = false,
+                 int tile_size = 128)  // Stored in the tile_size member, which is later passed to the ESRGAN constructor.
         : n_threads(n_threads),
-          direct(direct) {
+          direct(direct),
+          tile_size(tile_size) {
     }
 
     bool load_from_file(const std::string& esrgan_path,
@@ -51,7 +54,7 @@ struct UpscalerGGML {
             backend = ggml_backend_cpu_init();
         }
         LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.get_tensor_storage_map());
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map());
         if (direct) {
             esrgan_upscaler->set_conv2d_direct_enabled(true);
         }
@@ -113,14 +116,15 @@ struct upscaler_ctx_t {
 upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                  bool offload_params_to_cpu,
                                  bool direct,
-                                 int n_threads) {
+                                 int n_threads,
+                                 int tile_size) {
     upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
     if (upscaler_ctx == nullptr) {
         return nullptr;
     }
     std::string esrgan_path(esrgan_path_c_str);
 
-    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct);
+    upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size);
     if (upscaler_ctx->upscaler == nullptr) {
         return nullptr;
     }
diff --git a/otherarch/sdcpp/util.cpp b/otherarch/sdcpp/util.cpp
index 9d58a7ec2..fd0c60624 100644
--- a/otherarch/sdcpp/util.cpp
+++ b/otherarch/sdcpp/util.cpp
@@ -3,6 +3,7 @@
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>
+#include <filesystem>
 #include <fstream>
 #include <locale>
 #include <regex>
@@ -98,18 +99,9 @@ bool is_directory(const std::string& path) {
     return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
 }
 
-std::string get_full_path(const std::string& dir, const std::string& filename) {
-    std::string full_path = dir + "\\" + filename;
-
-    WIN32_FIND_DATA find_file_data;
-    HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
-
-    if (hFind != INVALID_HANDLE_VALUE) {
-        FindClose(hFind);
-        return full_path;
-    } else {
-        return "";
-    }
+std::string sd_get_u8path(const std::string& file_path)  // Win32 branch: reads file_path as UTF-8 and returns the resulting path as a std::string.
+{
+    return std::filesystem::u8path(file_path).string();  // NOTE(review): u8path is deprecated since C++20, and string() yields the native narrow encoding, which presumably cannot represent every UTF-8 path - confirm
 }
 
 #else  // Unix
@@ -126,24 +118,9 @@ bool is_directory(const std::string& path) {
     return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
 }
 
-// TODO: add windows version
-std::string get_full_path(const std::string& dir, const std::string& filename) {
-    DIR* dp = opendir(dir.c_str());
-
-    if (dp != nullptr) {
-        struct dirent* entry;
-
-        while ((entry = readdir(dp)) != nullptr) {
-            if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
-                closedir(dp);
-                return dir + "/" + entry->d_name;
-            }
-        }
-
-        closedir(dp);
-    }
-
-    return "";
+std::string sd_get_u8path(const std::string& file_path)  // Unix branch: builds the path without u8path, unlike the Windows variant earlier in this file.
+{
+    return std::filesystem::path(file_path).string();  // NOTE(review): presumably returns the text of file_path unchanged, since the native path encoding here is already char - confirm
 }
 
 #endif
diff --git a/otherarch/sdcpp/util.h b/otherarch/sdcpp/util.h
index ac2283c62..b92b76071 100644
--- a/otherarch/sdcpp/util.h
+++ b/otherarch/sdcpp/util.h
@@ -22,13 +22,14 @@ int round_up_to(int value, int base);
 
 bool file_exists(const std::string& filename);
 bool is_directory(const std::string& path);
-std::string get_full_path(const std::string& dir, const std::string& filename);
 
 std::u32string utf8_to_utf32(const std::string& utf8_str);
 std::string utf32_to_utf8(const std::u32string& utf32_str);
 std::u32string unicode_value_to_utf32(int unicode_value);
 // std::string sd_basename(const std::string& path);
 
+std::string sd_get_u8path(const std::string& file_path);  // Passes file_path through std::filesystem (u8path on Windows, plain path elsewhere) and returns path::string(); see util.cpp.
+
 typedef struct {
     uint32_t width;
     uint32_t height;
diff --git a/otherarch/sdcpp/version.cpp b/otherarch/sdcpp/version.cpp
new file mode 100644
index 000000000..97dc8426b
--- /dev/null
+++ b/otherarch/sdcpp/version.cpp
@@ -0,0 +1,20 @@
+#include "stable-diffusion.h"
+
+#ifndef SDCPP_BUILD_COMMIT
+#define SDCPP_BUILD_COMMIT unknown  // Fallback when the build does not define it; stringified below, so sd_commit() returns "unknown".
+#endif
+
+#ifndef SDCPP_BUILD_VERSION
+#define SDCPP_BUILD_VERSION unknown  // Fallback when the build does not define it; stringified below, so sd_version() returns "unknown".
+#endif
+
+#define STRINGIZE2(x) #x  // Turns its argument into a string literal exactly as written.
+#define STRINGIZE(x) STRINGIZE2(x)  // Extra level so that x is macro-expanded first and only then stringified.
+
+const char* sd_commit(void) {  // Returns the expansion of SDCPP_BUILD_COMMIT as a string literal.
+    return STRINGIZE(SDCPP_BUILD_COMMIT);
+}
+
+const char* sd_version(void) {  // Returns the expansion of SDCPP_BUILD_VERSION as a string literal.
+    return STRINGIZE(SDCPP_BUILD_VERSION);
+}