From 78bbe8995650551a4412a911a8f359575f01c8a3 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Tue, 16 Dec 2025 05:16:48 -0300 Subject: [PATCH] sd: sync to master-417-43a70e8 (#1889) * sd: sync to master-417-43a70e8 * fix sdmain build * switch to upstream apply_loras() * refactor u8 path conversions and add it to the gguf reader --- Makefile | 2 +- otherarch/sdcpp/clip.hpp | 103 +- otherarch/sdcpp/common.hpp | 4 +- otherarch/sdcpp/common/common.hpp | 1804 ++++++++++++++++++++++++++ otherarch/sdcpp/conditioner.hpp | 128 +- otherarch/sdcpp/esrgan.hpp | 3 +- otherarch/sdcpp/flux.hpp | 134 +- otherarch/sdcpp/ggml_extend.hpp | 76 +- otherarch/sdcpp/gguf_reader.hpp | 2 +- otherarch/sdcpp/latent-preview.h | 79 +- otherarch/sdcpp/llm.hpp | 71 +- otherarch/sdcpp/main.cpp | 1491 +-------------------- otherarch/sdcpp/model.cpp | 32 +- otherarch/sdcpp/model.h | 2 + otherarch/sdcpp/rope.hpp | 15 +- otherarch/sdcpp/sdtype_adapter.cpp | 19 +- otherarch/sdcpp/stable-diffusion.cpp | 237 ++-- otherarch/sdcpp/stable-diffusion.h | 26 +- otherarch/sdcpp/upscaler.cpp | 16 +- otherarch/sdcpp/util.cpp | 37 +- otherarch/sdcpp/util.h | 3 +- otherarch/sdcpp/version.cpp | 20 + 22 files changed, 2462 insertions(+), 1842 deletions(-) create mode 100644 otherarch/sdcpp/common/common.hpp create mode 100644 otherarch/sdcpp/version.cpp diff --git a/Makefile b/Makefile index f355367aa..ce78dd4b6 100644 --- a/Makefile +++ b/Makefile @@ -770,7 +770,7 @@ main: tools/completion/completion.cpp common/arg.cpp common/download.cpp build-i $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) mainvk: tools/completion/completion.cpp common/arg.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib $(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS) -sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) +sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/version.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/otherarch/sdcpp/clip.hpp b/otherarch/sdcpp/clip.hpp index 1f983271f..24c94f1bb 100644 --- a/otherarch/sdcpp/clip.hpp +++ b/otherarch/sdcpp/clip.hpp @@ -3,34 +3,10 @@ #include "ggml_extend.hpp" #include "model.h" +#include "tokenize_util.h" /*================================================== CLIPTokenizer ===================================================*/ -__STATIC_INLINE__ std::pair, std::string> extract_and_remove_lora(std::string text) { - std::regex re("]+)>"); - std::smatch matches; - std::unordered_map filename2multiplier; - - while (std::regex_search(text, matches, re)) { - std::string filename = matches[1].str(); - float multiplier = std::stof(matches[2].str()); - - text = std::regex_replace(text, re, "", std::regex_constants::format_first_only); - - if (multiplier == 0.f) { - continue; - } - - if (filename2multiplier.find(filename) == filename2multiplier.end()) { - filename2multiplier[filename] = multiplier; - } else { - filename2multiplier[filename] += multiplier; - } - } - - return std::make_pair(filename2multiplier, text); -} - __STATIC_INLINE__ std::vector> bytes_to_unicode() { std::vector> byte_unicode_pairs; std::set byte_set; @@ -72,6 +48,8 @@ private: int encoder_len; int bpe_len; + std::vector special_tokens; + public: const std::string UNK_TOKEN = "<|endoftext|>"; const std::string BOS_TOKEN = "<|startoftext|>"; @@ -117,6 +95,15 @@ private: return pairs; } + bool is_special_token(const std::string& token) { + for (auto& special_token : special_tokens) { + if (special_token == token) { + return true; + } + } + return false; + } + public: CLIPTokenizer(int pad_token_id = 49407, const std::string& merges_utf8_str = "") : PAD_TOKEN_ID(pad_token_id) { @@ -125,6 +112,8 @@ public: } else { load_from_merges(ModelLoader::load_merges()); } + add_special_token("<|startoftext|>"); + add_special_token("<|endoftext|>"); } void load_from_merges(const std::string& merges_utf8_str) { @@ -201,6 +190,10 @@ public: } } + void add_special_token(const std::string& token) { + special_tokens.push_back(token); + } + std::u32string bpe(const std::u32string& token) { std::vector word; @@ -379,25 +372,54 @@ public: return trim(text); } + std::vector token_split(const std::string& text) { + std::regex pat(R"('s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)", + std::regex::icase); + std::sregex_iterator iter(text.begin(), text.end(), pat); + std::sregex_iterator end; + + std::vector result; + for (; iter != end; ++iter) { + result.emplace_back(iter->str()); + } + + return result; + } + std::vector encode(std::string text, on_new_token_cb_t on_new_token_cb) { std::string original_text = text; std::vector bpe_tokens; text = whitespace_clean(text); std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); }); - std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)", - std::regex::icase); - - std::smatch matches; std::string str = text; std::vector token_strs; - while (std::regex_search(str, matches, pat)) { - bool skip = on_new_token_cb(str, bpe_tokens); - if (skip) { + + auto splited_texts = split_with_special_tokens(text, special_tokens); + + for (auto& splited_text : splited_texts) { + LOG_DEBUG("token %s", splited_text.c_str()); + if (is_special_token(splited_text)) { + LOG_DEBUG("special %s", splited_text.c_str()); + bool skip = on_new_token_cb(splited_text, bpe_tokens); + if (skip) { + token_strs.push_back(splited_text); + continue; + } continue; } - for (auto& token : matches) { - std::string token_str = token.str(); + + auto tokens = token_split(splited_text); + for (auto& token : tokens) { + if (on_new_token_cb != nullptr) { + bool skip = on_new_token_cb(token, bpe_tokens); + if (skip) { + token_strs.push_back(token); + continue; + } + } + + std::string token_str = token; std::u32string utf32_token; for (int i = 0; i < token_str.length(); i++) { unsigned char b = token_str[i]; @@ -417,14 +439,13 @@ public: bpe_tokens.push_back(encoder[bpe_str]); token_strs.push_back(utf32_to_utf8(bpe_str)); } - str = matches.suffix(); } - std::stringstream ss; - ss << "["; - for (auto token : token_strs) { - ss << "\"" << token << "\", "; - } - ss << "]"; + // std::stringstream ss; + // ss << "["; + // for (auto token : token_strs) { + // ss << "\"" << token << "\", "; + // } + // ss << "]"; // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str()); // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str()); return bpe_tokens; diff --git a/otherarch/sdcpp/common.hpp b/otherarch/sdcpp/common.hpp index 33d499fb1..74b218ab7 100644 --- a/otherarch/sdcpp/common.hpp +++ b/otherarch/sdcpp/common.hpp @@ -194,10 +194,12 @@ public: auto proj = std::dynamic_pointer_cast(blocks["proj"]); x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2] - auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0); + auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false); x = x_vec[0]; // [ne3, ne2, ne1, dim_out] auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out] + gate = ggml_cont(ctx->ggml_ctx, gate); + gate = ggml_gelu_inplace(ctx->ggml_ctx, gate); x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out] diff --git a/otherarch/sdcpp/common/common.hpp b/otherarch/sdcpp/common/common.hpp new file mode 100644 index 000000000..e9be436be --- /dev/null +++ b/otherarch/sdcpp/common/common.hpp @@ -0,0 +1,1804 @@ + +#include +#include +#include +#include +#include +#include +#include +#include + +//kcpp +#include +using json = nlohmann::json; +namespace fs = std::filesystem; + +#if defined(_WIN32) +#define NOMINMAX +#include +#endif // _WIN32 + +#include "stable-diffusion.h" + +#define STB_IMAGE_IMPLEMENTATION +//#define STB_IMAGE_STATIC +#include "stb_image.h" + +#define STB_IMAGE_WRITE_IMPLEMENTATION +//#define STB_IMAGE_WRITE_STATIC +#include "stb_image_write.h" + +#define STB_IMAGE_RESIZE_IMPLEMENTATION +//#define STB_IMAGE_RESIZE_STATIC +#include "stb_image_resize.h" + +#define SAFE_STR(s) ((s) ? (s) : "") +#define BOOL_STR(b) ((b) ? "true" : "false") + +const char* modes_str[] = { + "img_gen", + "vid_gen", + "convert", + "upscale", +}; +#define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale" + +enum SDMode { + IMG_GEN, + VID_GEN, + CONVERT, + UPSCALE, + MODE_COUNT +}; + +#if defined(_WIN32) +static std::string utf16_to_utf8(const std::wstring& wstr) { + if (wstr.empty()) + return {}; + int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(), + nullptr, 0, nullptr, nullptr); + if (size_needed <= 0) + throw std::runtime_error("UTF-16 to UTF-8 conversion failed"); + + std::string utf8(size_needed, 0); + WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(), + (char*)utf8.data(), size_needed, nullptr, nullptr); + return utf8; +} + +static std::string argv_to_utf8(int index, const char** argv) { + int argc; + wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc); + if (!argv_w) + throw std::runtime_error("Failed to parse command line"); + + std::string result; + if (index < argc) { + result = utf16_to_utf8(argv_w[index]); + } + LocalFree(argv_w); + return result; +} + +#else // Linux / macOS +static std::string argv_to_utf8(int index, const char** argv) { + return std::string(argv[index]); +} + +#endif + +struct StringOption { + std::string short_name; + std::string long_name; + std::string desc; + std::string* target; +}; + +struct IntOption { + std::string short_name; + std::string long_name; + std::string desc; + int* target; +}; + +struct FloatOption { + std::string short_name; + std::string long_name; + std::string desc; + float* target; +}; + +struct BoolOption { + std::string short_name; + std::string long_name; + std::string desc; + bool keep_true; + bool* target; +}; + +struct ManualOption { + std::string short_name; + std::string long_name; + std::string desc; + std::function cb; +}; + +struct ArgOptions { + std::vector string_options; + std::vector int_options; + std::vector float_options; + std::vector bool_options; + std::vector manual_options; + + static std::string wrap_text(const std::string& text, size_t width, size_t indent) { + std::ostringstream oss; + size_t line_len = 0; + size_t pos = 0; + + while (pos < text.size()) { + // Preserve manual newlines + if (text[pos] == '\n') { + oss << '\n' + << std::string(indent, ' '); + line_len = indent; + ++pos; + continue; + } + + // Add the character + oss << text[pos]; + ++line_len; + ++pos; + + // If the current line exceeds width, try to break at the last space + if (line_len >= width) { + std::string current = oss.str(); + size_t back = current.size(); + + // Find the last space (for a clean break) + while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n') + --back; + + // If found a space to break on + if (back > 0 && current[back - 1] != '\n') { + std::string before = current.substr(0, back - 1); + std::string after = current.substr(back); + oss.str(""); + oss.clear(); + oss << before << "\n" + << std::string(indent, ' ') << after; + } else { + // If no space found, just break at width + oss << "\n" + << std::string(indent, ' '); + } + line_len = indent; + } + } + + return oss.str(); + } + + void print() const { + constexpr size_t max_line_width = 120; + + struct Entry { + std::string names; + std::string desc; + }; + std::vector entries; + + auto add_entry = [&](const std::string& s, const std::string& l, + const std::string& desc, const std::string& hint = "") { + std::ostringstream ss; + if (!s.empty()) + ss << s; + if (!s.empty() && !l.empty()) + ss << ", "; + if (!l.empty()) + ss << l; + if (!hint.empty()) + ss << " " << hint; + entries.push_back({ss.str(), desc}); + }; + + for (auto& o : string_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : int_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : float_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : bool_options) + add_entry(o.short_name, o.long_name, o.desc, ""); + for (auto& o : manual_options) + add_entry(o.short_name, o.long_name, o.desc); + + size_t max_name_width = 0; + for (auto& e : entries) + max_name_width = std::max(max_name_width, e.names.size()); + + for (auto& e : entries) { + size_t indent = 2 + max_name_width + 4; + size_t desc_width = (max_line_width > indent ? max_line_width - indent : 40); + std::string wrapped_desc = wrap_text(e.desc, max_line_width, indent); + std::cout << " " << std::left << std::setw(static_cast(max_name_width) + 4) + << e.names << wrapped_desc << "\n"; + } + } +}; + +static bool parse_options(int argc, const char** argv, const std::vector& options_list) { + bool invalid_arg = false; + std::string arg; + + auto match_and_apply = [&](auto& opts, auto&& apply_fn) -> bool { + for (auto& option : opts) { + if ((option.short_name.size() > 0 && arg == option.short_name) || + (option.long_name.size() > 0 && arg == option.long_name)) { + apply_fn(option); + return true; + } + } + return false; + }; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + bool found_arg = false; + + for (auto& options : options_list) { + if (match_and_apply(options.string_options, [&](auto& option) { + if (++i >= argc) { + invalid_arg = true; + return; + } + *option.target = argv_to_utf8(i, argv); + found_arg = true; + })) + break; + + if (match_and_apply(options.int_options, [&](auto& option) { + if (++i >= argc) { + invalid_arg = true; + return; + } + *option.target = std::stoi(argv[i]); + found_arg = true; + })) + break; + + if (match_and_apply(options.float_options, [&](auto& option) { + if (++i >= argc) { + invalid_arg = true; + return; + } + *option.target = std::stof(argv[i]); + found_arg = true; + })) + break; + + if (match_and_apply(options.bool_options, [&](auto& option) { + *option.target = option.keep_true ? true : false; + found_arg = true; + })) + break; + + if (match_and_apply(options.manual_options, [&](auto& option) { + int ret = option.cb(argc, argv, i); + if (ret < 0) { + invalid_arg = true; + return; + } + i += ret; + found_arg = true; + })) + break; + } + + if (invalid_arg) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + return false; + } + if (!found_arg) { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + return false; + } + } + + return true; +} + +struct SDContextParams { + int n_threads = -1; + std::string model_path; + std::string clip_l_path; + std::string clip_g_path; + std::string clip_vision_path; + std::string t5xxl_path; + std::string llm_path; + std::string llm_vision_path; + std::string diffusion_model_path; + std::string high_noise_diffusion_model_path; + std::string vae_path; + std::string taesd_path; + std::string esrgan_path; + std::string control_net_path; + std::string embedding_dir; + std::string photo_maker_path; + sd_type_t wtype = SD_TYPE_COUNT; + std::string tensor_type_rules; + std::string lora_model_dir; + + std::map embedding_map; + std::vector embedding_vec; + + rng_type_t rng_type = CUDA_RNG; + rng_type_t sampler_rng_type = RNG_TYPE_COUNT; + bool offload_params_to_cpu = false; + bool control_net_cpu = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; + bool diffusion_flash_attn = false; + bool diffusion_conv_direct = false; + bool vae_conv_direct = false; + + bool chroma_use_dit_mask = true; + bool chroma_use_t5_mask = false; + int chroma_t5_mask_pad = 1; + + prediction_t prediction = PREDICTION_COUNT; + lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; + + sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f}; + bool force_sdxl_vae_conv_scale = false; + + float flow_shift = INFINITY; + + ArgOptions get_options() { + ArgOptions options; + options.string_options = { + {"-m", + "--model", + "path to full model", + &model_path}, + {"", + "--clip_l", + "path to the clip-l text encoder", &clip_l_path}, + {"", "--clip_g", + "path to the clip-g text encoder", + &clip_g_path}, + {"", + "--clip_vision", + "path to the clip-vision encoder", + &clip_vision_path}, + {"", + "--t5xxl", + "path to the t5xxl text encoder", + &t5xxl_path}, + {"", + "--llm", + "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)", + &llm_path}, + {"", + "--llm_vision", + "path to the llm vit", + &llm_vision_path}, + {"", + "--qwen2vl", + "alias of --llm. Deprecated.", + &llm_path}, + {"", + "--qwen2vl_vision", + "alias of --llm_vision. Deprecated.", + &llm_vision_path}, + {"", + "--diffusion-model", + "path to the standalone diffusion model", + &diffusion_model_path}, + {"", + "--high-noise-diffusion-model", + "path to the standalone high noise diffusion model", + &high_noise_diffusion_model_path}, + {"", + "--vae", + "path to standalone vae model", + &vae_path}, + {"", + "--taesd", + "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)", + &taesd_path}, + {"", + "--control-net", + "path to control net model", + &control_net_path}, + {"", + "--embd-dir", + "embeddings directory", + &embedding_dir}, + {"", + "--lora-model-dir", + "lora model directory", + &lora_model_dir}, + + {"", + "--tensor-type-rules", + "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", + &tensor_type_rules}, + {"", + "--photo-maker", + "path to PHOTOMAKER model", + &photo_maker_path}, + {"", + "--upscale-model", + "path to esrgan model.", + &esrgan_path}, + }; + + options.int_options = { + {"-t", + "--threads", + "number of threads to use during computation (default: -1). " + "If threads <= 0, then threads will be set to the number of CPU physical cores", + &n_threads}, + {"", + "--chroma-t5-mask-pad", + "t5 mask pad size of chroma", + &chroma_t5_mask_pad}, + }; + + options.float_options = { + {"", + "--vae-tile-overlap", + "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", + &vae_tiling_params.target_overlap}, + {"", + "--flow-shift", + "shift value for Flow models like SD3.x or WAN (default: auto)", + &flow_shift}, + }; + + options.bool_options = { + {"", + "--vae-tiling", + "process vae in tiles to reduce memory usage", + true, &vae_tiling_params.enabled}, + {"", + "--force-sdxl-vae-conv-scale", + "force use of conv scale on sdxl vae", + true, &force_sdxl_vae_conv_scale}, + {"", + "--offload-to-cpu", + "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", + true, &offload_params_to_cpu}, + {"", + "--control-net-cpu", + "keep controlnet in cpu (for low vram)", + true, &control_net_cpu}, + {"", + "--clip-on-cpu", + "keep clip in cpu (for low vram)", + true, &clip_on_cpu}, + {"", + "--vae-on-cpu", + "keep vae in cpu (for low vram)", + true, &vae_on_cpu}, + {"", + "--diffusion-fa", + "use flash attention in the diffusion model", + true, &diffusion_flash_attn}, + {"", + "--diffusion-conv-direct", + "use ggml_conv2d_direct in the diffusion model", + true, &diffusion_conv_direct}, + {"", + "--vae-conv-direct", + "use ggml_conv2d_direct in the vae model", + true, &vae_conv_direct}, + {"", + "--chroma-disable-dit-mask", + "disable dit mask for chroma", + false, &chroma_use_dit_mask}, + {"", + "--chroma-enable-t5-mask", + "enable t5 mask for chroma", + true, &chroma_use_t5_mask}, + }; + + auto on_type_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + wtype = str_to_sd_type(arg); + if (wtype == SD_TYPE_COUNT) { + fprintf(stderr, "error: invalid weight format %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_rng_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + rng_type = str_to_rng_type(arg); + if (rng_type == RNG_TYPE_COUNT) { + fprintf(stderr, "error: invalid rng type %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + sampler_rng_type = str_to_rng_type(arg); + if (sampler_rng_type == RNG_TYPE_COUNT) { + fprintf(stderr, "error: invalid sampler rng type %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_prediction_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + prediction = str_to_prediction(arg); + if (prediction == PREDICTION_COUNT) { + fprintf(stderr, "error: invalid prediction type %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + lora_apply_mode = str_to_lora_apply_mode(arg); + if (lora_apply_mode == LORA_APPLY_MODE_COUNT) { + fprintf(stderr, "error: invalid lora apply model %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_tile_size_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string tile_size_str = argv[index]; + size_t x_pos = tile_size_str.find('x'); + try { + if (x_pos != std::string::npos) { + std::string tile_x_str = tile_size_str.substr(0, x_pos); + std::string tile_y_str = tile_size_str.substr(x_pos + 1); + vae_tiling_params.tile_size_x = std::stoi(tile_x_str); + vae_tiling_params.tile_size_y = std::stoi(tile_y_str); + } else { + vae_tiling_params.tile_size_x = vae_tiling_params.tile_size_y = std::stoi(tile_size_str); + } + } catch (const std::invalid_argument&) { + return -1; + } catch (const std::out_of_range&) { + return -1; + } + return 1; + }; + + auto on_relative_tile_size_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string rel_size_str = argv[index]; + size_t x_pos = rel_size_str.find('x'); + try { + if (x_pos != std::string::npos) { + std::string rel_x_str = rel_size_str.substr(0, x_pos); + std::string rel_y_str = rel_size_str.substr(x_pos + 1); + vae_tiling_params.rel_size_x = std::stof(rel_x_str); + vae_tiling_params.rel_size_y = std::stof(rel_y_str); + } else { + vae_tiling_params.rel_size_x = vae_tiling_params.rel_size_y = std::stof(rel_size_str); + } + } catch (const std::invalid_argument&) { + return -1; + } catch (const std::out_of_range&) { + return -1; + } + return 1; + }; + + options.manual_options = { + {"", + "--type", + "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). " + "If not specified, the default is the type of the weight file", + on_type_arg}, + {"", + "--rng", + "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)", + on_rng_arg}, + {"", + "--sampler-rng", + "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng", + on_sampler_rng_arg}, + {"", + "--prediction", + "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]", + on_prediction_arg}, + {"", + "--lora-apply-mode", + "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. " + "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used." + "The immediately mode may have precision and compatibility issues with quantized parameters, " + "but it usually offers faster inference speed and, in some cases, lower memory usage. " + "The at_runtime mode, on the other hand, is exactly the opposite.", + on_lora_apply_mode_arg}, + {"", + "--vae-tile-size", + "tile size for vae tiling, format [X]x[Y] (default: 32x32)", + on_tile_size_arg}, + {"", + "--vae-relative-tile-size", + "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)", + on_relative_tile_size_arg}, + }; + + return options; + } + + void build_embedding_map() { + static const std::vector valid_ext = {".pt", ".safetensors", ".gguf"}; + + if (!fs::exists(embedding_dir) || !fs::is_directory(embedding_dir)) { + return; + } + + for (auto& p : fs::directory_iterator(embedding_dir)) { + if (!p.is_regular_file()) + continue; + + auto path = p.path(); + std::string ext = path.extension().string(); + + bool valid = false; + for (auto& e : valid_ext) { + if (ext == e) { + valid = true; + break; + } + } + if (!valid) + continue; + + std::string key = path.stem().string(); + std::string value = path.string(); + + embedding_map[key] = value; + } + } + + bool process_and_check(SDMode mode) { + if (mode != UPSCALE && model_path.length() == 0 && diffusion_model_path.length() == 0) { + fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); + return false; + } + + if (mode == UPSCALE) { + if (esrgan_path.length() == 0) { + fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n"); + return false; + } + } + + if (n_threads <= 0) { + n_threads = sd_get_num_physical_cores(); + } + + build_embedding_map(); + + return true; + } + + std::string to_string() const { + std::ostringstream emb_ss; + emb_ss << "{\n"; + for (auto it = embedding_map.begin(); it != embedding_map.end(); ++it) { + emb_ss << " \"" << it->first << "\": \"" << it->second << "\""; + if (std::next(it) != embedding_map.end()) { + emb_ss << ","; + } + emb_ss << "\n"; + } + emb_ss << " }"; + + std::string embeddings_str = emb_ss.str(); + std::ostringstream oss; + oss << "SDContextParams {\n" + << " n_threads: " << n_threads << ",\n" + << " model_path: \"" << model_path << "\",\n" + << " clip_l_path: \"" << clip_l_path << "\",\n" + << " clip_g_path: \"" << clip_g_path << "\",\n" + << " clip_vision_path: \"" << clip_vision_path << "\",\n" + << " t5xxl_path: \"" << t5xxl_path << "\",\n" + << " llm_path: \"" << llm_path << "\",\n" + << " llm_vision_path: \"" << llm_vision_path << "\",\n" + << " diffusion_model_path: \"" << diffusion_model_path << "\",\n" + << " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n" + << " vae_path: \"" << vae_path << "\",\n" + << " taesd_path: \"" << taesd_path << "\",\n" + << " esrgan_path: \"" << esrgan_path << "\",\n" + << " control_net_path: \"" << control_net_path << "\",\n" + << " embedding_dir: \"" << embedding_dir << "\",\n" + << " embeddings: " << embeddings_str << "\n" + << " wtype: " << sd_type_name(wtype) << ",\n" + << " tensor_type_rules: \"" << tensor_type_rules << "\",\n" + << " lora_model_dir: \"" << lora_model_dir << "\",\n" + << " photo_maker_path: \"" << photo_maker_path << "\",\n" + << " rng_type: " << sd_rng_type_name(rng_type) << ",\n" + << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" + << " flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n" + << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" + << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" + << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" + << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" + << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" + << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" + << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" + << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n" + << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" + << " prediction: " << sd_prediction_name(prediction) << ",\n" + << " lora_apply_mode: " << sd_lora_apply_mode_name(lora_apply_mode) << ",\n" + << " vae_tiling_params: { " + << vae_tiling_params.enabled << ", " + << vae_tiling_params.tile_size_x << ", " + << vae_tiling_params.tile_size_y << ", " + << vae_tiling_params.target_overlap << ", " + << vae_tiling_params.rel_size_x << ", " + << vae_tiling_params.rel_size_y << " },\n" + << " force_sdxl_vae_conv_scale: " << (force_sdxl_vae_conv_scale ? "true" : "false") << "\n" + << "}"; + return oss.str(); + } + + sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) { + embedding_vec.clear(); + embedding_vec.reserve(embedding_map.size()); + for (const auto& kv : embedding_map) { + sd_embedding_t item; + item.name = kv.first.c_str(); + item.path = kv.second.c_str(); + embedding_vec.emplace_back(item); + } + + sd_ctx_params_t sd_ctx_params = { + model_path.c_str(), + clip_l_path.c_str(), + clip_g_path.c_str(), + clip_vision_path.c_str(), + t5xxl_path.c_str(), + llm_path.c_str(), + llm_vision_path.c_str(), + diffusion_model_path.c_str(), + high_noise_diffusion_model_path.c_str(), + vae_path.c_str(), + taesd_path.c_str(), + control_net_path.c_str(), + lora_model_dir.c_str(), + embedding_vec.data(), + static_cast(embedding_vec.size()), + photo_maker_path.c_str(), + tensor_type_rules.c_str(), + vae_decode_only, + free_params_immediately, + n_threads, + wtype, + rng_type, + sampler_rng_type, + prediction, + lora_apply_mode, + offload_params_to_cpu, + clip_on_cpu, + control_net_cpu, + vae_on_cpu, + diffusion_flash_attn, + taesd_preview, + diffusion_conv_direct, + vae_conv_direct, + force_sdxl_vae_conv_scale, + chroma_use_dit_mask, + chroma_use_t5_mask, + chroma_t5_mask_pad, + flow_shift, + }; + return sd_ctx_params; + } +}; + +template +static std::string vec_to_string(const std::vector& v) { + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < v.size(); i++) { + oss << v[i]; + if (i + 1 < v.size()) + oss << ", "; + } + oss << "]"; + return oss.str(); +} + +static std::string vec_str_to_string(const std::vector& v) { + std::ostringstream oss; + oss << "["; + for (size_t i = 0; i < v.size(); i++) { + oss << "\"" << v[i] << "\""; + if (i + 1 < v.size()) + oss << ", "; + } + oss << "]"; + return oss.str(); +} + +static bool is_absolute_path(const std::string& p) { +#ifdef _WIN32 + // Windows: C:/path or C:\path + return p.size() > 1 && std::isalpha(static_cast(p[0])) && p[1] == ':'; +#else + return !p.empty() && p[0] == '/'; +#endif +} + +struct SDGenerationParams { + std::string prompt; + std::string prompt_with_lora; // for metadata record only + std::string negative_prompt; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; + int batch_count = 1; + std::string init_image_path; + std::string end_image_path; + std::string mask_image_path; + std::string control_image_path; + std::vector ref_image_paths; + std::string control_video_path; + bool auto_resize_ref_image = true; + bool increase_ref_index = false; + + std::vector skip_layers = {7, 8, 9}; + sd_sample_params_t sample_params; + + std::vector high_noise_skip_layers = {7, 8, 9}; + sd_sample_params_t high_noise_sample_params; + + std::vector custom_sigmas; + + std::string easycache_option; + sd_easycache_params_t easycache_params; + + float moe_boundary = 0.875f; + int video_frames = 1; + int fps = 16; + float vace_strength = 1.f; + + float strength = 0.75f; + float control_strength = 0.9f; + + int64_t seed = 42; + + // Photo Maker + std::string pm_id_images_dir; + std::string pm_id_embed_path; + float pm_style_strength = 20.f; + + int upscale_repeats = 1; + int upscale_tile_size = 128; + + std::map lora_map; + std::map high_noise_lora_map; + std::vector lora_vec; + + SDGenerationParams() { + sd_sample_params_init(&sample_params); + sd_sample_params_init(&high_noise_sample_params); + } + + ArgOptions get_options() { + ArgOptions options; + options.string_options = { + {"-p", + "--prompt", + "the prompt to render", + &prompt}, + {"-n", + "--negative-prompt", + "the negative prompt (default: \"\")", + &negative_prompt}, + {"-i", + "--init-img", + "path to the init image", + &init_image_path}, + {"", + "--end-img", + "path to the end image, required by flf2v", + &end_image_path}, + {"", + "--mask", + "path to the mask image", + &mask_image_path}, + {"", + "--control-image", + "path to control image, control net", + &control_image_path}, + {"", + "--control-video", + "path to control video frames, It must be a directory path. The video frames inside should be stored as images in " + "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images " + "such as 00.png, 01.png, ... etc.", + &control_video_path}, + {"", + "--pm-id-images-dir", + "path to PHOTOMAKER input id images dir", + &pm_id_images_dir}, + {"", + "--pm-id-embed-path", + "path to PHOTOMAKER v2 id embed", + &pm_id_embed_path}, + }; + + options.int_options = { + {"-H", + "--height", + "image height, in pixel space (default: 512)", + &height}, + {"-W", + "--width", + "image width, in pixel space (default: 512)", + &width}, + {"", + "--steps", + "number of sample steps (default: 20)", + &sample_params.sample_steps}, + {"", + "--high-noise-steps", + "(high noise) number of sample steps (default: -1 = auto)", + &high_noise_sample_params.sample_steps}, + {"", + "--clip-skip", + "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). " + "<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x", + &clip_skip}, + {"-b", + "--batch-count", + "batch count", + &batch_count}, + {"", + "--video-frames", + "video frames (default: 1)", + &video_frames}, + {"", + "--fps", + "fps (default: 24)", + &fps}, + {"", + "--timestep-shift", + "shift timestep for NitroFusion models (default: 0). " + "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant", + &sample_params.shifted_timestep}, + {"", + "--upscale-repeats", + "Run the ESRGAN upscaler this many times (default: 1)", + &upscale_repeats}, + {"", + "--upscale-tile-size", + "tile size for ESRGAN upscaling (default: 128)", + &upscale_tile_size}, + }; + + options.float_options = { + {"", + "--cfg-scale", + "unconditional guidance scale: (default: 7.0)", + &sample_params.guidance.txt_cfg}, + {"", + "--img-cfg-scale", + "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", + &sample_params.guidance.img_cfg}, + {"", + "--guidance", + "distilled guidance scale for models with guidance input (default: 3.5)", + &sample_params.guidance.distilled_guidance}, + {"", + "--slg-scale", + "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium", + &sample_params.guidance.slg.scale}, + {"", + "--skip-layer-start", + "SLG enabling point (default: 0.01)", + &sample_params.guidance.slg.layer_start}, + {"", + "--skip-layer-end", + "SLG disabling point (default: 0.2)", + &sample_params.guidance.slg.layer_end}, + {"", + "--eta", + "eta in DDIM, only for DDIM and TCD (default: 0)", + &sample_params.eta}, + {"", + "--high-noise-cfg-scale", + "(high noise) unconditional guidance scale: (default: 7.0)", + &high_noise_sample_params.guidance.txt_cfg}, + {"", + "--high-noise-img-cfg-scale", + "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)", + &high_noise_sample_params.guidance.img_cfg}, + {"", + "--high-noise-guidance", + "(high noise) distilled guidance scale for models with guidance input (default: 3.5)", + &high_noise_sample_params.guidance.distilled_guidance}, + {"", + "--high-noise-slg-scale", + "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)", + &high_noise_sample_params.guidance.slg.scale}, + {"", + "--high-noise-skip-layer-start", + "(high noise) SLG enabling point (default: 0.01)", + &high_noise_sample_params.guidance.slg.layer_start}, + {"", + "--high-noise-skip-layer-end", + "(high noise) SLG disabling point (default: 0.2)", + &high_noise_sample_params.guidance.slg.layer_end}, + {"", + "--high-noise-eta", + "(high noise) eta in DDIM, only for DDIM and TCD (default: 0)", + &high_noise_sample_params.eta}, + {"", + "--strength", + "strength for noising/unnoising (default: 0.75)", + &strength}, + {"", + "--pm-style-strength", + "", + &pm_style_strength}, + {"", + "--control-strength", + "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image", + &control_strength}, + {"", + "--moe-boundary", + "timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1", + &moe_boundary}, + {"", + "--vace-strength", + "wan vace strength", + &vace_strength}, + }; + + options.bool_options = { + {"", + "--increase-ref-index", + "automatically increase the indices of references images based on the order they are listed (starting with 1).", + true, + &increase_ref_index}, + {"", + "--disable-auto-resize-ref-image", + "disable auto resize of ref images", + false, + &auto_resize_ref_image}, + }; + + auto on_seed_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + seed = std::stoll(argv[index]); + return 1; + }; + + auto on_sample_method_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + sample_params.sample_method = str_to_sample_method(arg); + if (sample_params.sample_method == SAMPLE_METHOD_COUNT) { + fprintf(stderr, "error: invalid sample method %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_high_noise_sample_method_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + high_noise_sample_params.sample_method = str_to_sample_method(arg); + if (high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) { + fprintf(stderr, "error: invalid high noise sample method %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_scheduler_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + sample_params.scheduler = str_to_scheduler(arg); + if (sample_params.scheduler == SCHEDULER_COUNT) { + fprintf(stderr, "error: invalid scheduler %s\n", + arg); + return -1; + } + return 1; + }; + + auto on_skip_layers_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string layers_str = argv[index]; + if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { + return -1; + } + + layers_str = layers_str.substr(1, layers_str.size() - 2); + + std::regex regex("[, ]+"); + std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); + std::sregex_token_iterator end; + std::vector tokens(iter, end); + std::vector layers; + for (const auto& token : tokens) { + try { + layers.push_back(std::stoi(token)); + } catch (const std::invalid_argument&) { + return -1; + } + } + skip_layers = layers; + return 1; + }; + + auto on_high_noise_skip_layers_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string layers_str = argv[index]; + if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { + return -1; + } + + layers_str = layers_str.substr(1, layers_str.size() - 2); + + std::regex regex("[, ]+"); + std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); + std::sregex_token_iterator end; + std::vector tokens(iter, end); + std::vector layers; + for (const auto& token : tokens) { + try { + layers.push_back(std::stoi(token)); + } catch (const std::invalid_argument&) { + return -1; + } + } + high_noise_skip_layers = layers; + return 1; + }; + + auto on_sigmas_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + std::string sigmas_str = argv[index]; + if (!sigmas_str.empty() && sigmas_str.front() == '[') { + sigmas_str.erase(0, 1); + } + if (!sigmas_str.empty() && sigmas_str.back() == ']') { + sigmas_str.pop_back(); + } + + std::stringstream ss(sigmas_str); + std::string item; + while (std::getline(ss, item, ',')) { + item.erase(0, item.find_first_not_of(" \t\n\r\f\v")); + item.erase(item.find_last_not_of(" \t\n\r\f\v") + 1); + if (!item.empty()) { + try { + custom_sigmas.push_back(std::stof(item)); + } catch (const std::invalid_argument& e) { + fprintf(stderr, "error: invalid float value '%s' in --sigmas\n", item.c_str()); + return -1; + } catch (const std::out_of_range& e) { + fprintf(stderr, "error: float value '%s' out of range in --sigmas\n", item.c_str()); + return -1; + } + } + } + + if (custom_sigmas.empty() && !sigmas_str.empty()) { + fprintf(stderr, "error: could not parse any sigma values from '%s'\n", argv[index]); + return -1; + } + return 1; + }; + + auto on_ref_image_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + ref_image_paths.push_back(argv[index]); + return 1; + }; + + auto on_easycache_arg = [&](int argc, const char** argv, int index) { + const std::string default_values = "0.2,0.15,0.95"; + auto looks_like_value = [](const std::string& token) { + if (token.empty()) { + return false; + } + if (token[0] != '-') { + return true; + } + if (token.size() == 1) { + return false; + } + unsigned char next = static_cast(token[1]); + return std::isdigit(next) || token[1] == '.'; + }; + + std::string option_value; + int consumed = 0; + if (index + 1 < argc) { + std::string next_arg = argv[index + 1]; + if (looks_like_value(next_arg)) { + option_value = argv_to_utf8(index + 1, argv); + consumed = 1; + } + } + if (option_value.empty()) { + option_value = default_values; + } + easycache_option = option_value; + return consumed; + }; + + options.manual_options = { + {"-s", + "--seed", + "RNG seed (default: 42, use random seed for < 0)", + on_seed_arg}, + {"", + "--sampling-method", + "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] " + "(default: euler for Flux/SD3/Wan, euler_a otherwise)", + on_sample_method_arg}, + {"", + "--high-noise-sampling-method", + "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]" + " default: euler for Flux/SD3/Wan, euler_a otherwise", + on_high_noise_sample_method_arg}, + {"", + "--scheduler", + "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete", + on_scheduler_arg}, + {"", + "--sigmas", + "custom sigma values for the sampler, comma-separated (e.g., \"14.61,7.8,3.5,0.0\").", + on_sigmas_arg}, + {"", + "--skip-layers", + "layers to skip for SLG steps (default: [7,8,9])", + on_skip_layers_arg}, + {"", + "--high-noise-skip-layers", + "(high noise) layers to skip for SLG steps (default: [7,8,9])", + on_high_noise_skip_layers_arg}, + {"-r", + "--ref-image", + "reference image for Flux Kontext models (can be used multiple times)", + on_ref_image_arg}, + {"", + "--easycache", + "enable EasyCache for DiT models with optional \"threshold,start_percent,end_percent\" (default: 0.2,0.15,0.95)", + on_easycache_arg}, + + }; + + return options; + } + + bool from_json_str(const std::string& json_str) { + json j; + try { + j = json::parse(json_str); + } catch (...) { + fprintf(stderr, "json parse failed %s\n", json_str.c_str()); + return false; + } + + auto load_if_exists = [&](const char* key, auto& out) { + if (j.contains(key)) { + using T = std::decay_t; + if constexpr (std::is_same_v) { + if (j[key].is_string()) + out = j[key]; + } else if constexpr (std::is_same_v || std::is_same_v) { + if (j[key].is_number_integer()) + out = j[key]; + } else if constexpr (std::is_same_v) { + if (j[key].is_number()) + out = j[key]; + } else if constexpr (std::is_same_v) { + if (j[key].is_boolean()) + out = j[key]; + } else if constexpr (std::is_same_v>) { + if (j[key].is_array()) + out = j[key].get>(); + } else if constexpr (std::is_same_v>) { + if (j[key].is_array()) + out = j[key].get>(); + } + } + }; + + load_if_exists("prompt", prompt); + load_if_exists("negative_prompt", negative_prompt); + load_if_exists("easycache_option", easycache_option); + + load_if_exists("clip_skip", clip_skip); + load_if_exists("width", width); + load_if_exists("height", height); + load_if_exists("batch_count", batch_count); + load_if_exists("video_frames", video_frames); + load_if_exists("fps", fps); + load_if_exists("upscale_repeats", upscale_repeats); + load_if_exists("seed", seed); + + load_if_exists("strength", strength); + load_if_exists("control_strength", control_strength); + load_if_exists("pm_style_strength", pm_style_strength); + load_if_exists("moe_boundary", moe_boundary); + load_if_exists("vace_strength", vace_strength); + + load_if_exists("auto_resize_ref_image", auto_resize_ref_image); + load_if_exists("increase_ref_index", increase_ref_index); + + load_if_exists("skip_layers", skip_layers); + load_if_exists("high_noise_skip_layers", high_noise_skip_layers); + + load_if_exists("cfg_scale", sample_params.guidance.txt_cfg); + load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg); + load_if_exists("guidance", sample_params.guidance.distilled_guidance); + + return true; + } + + void extract_and_remove_lora(const std::string& lora_model_dir) { + if (lora_model_dir.empty()) { + return; + } + static const std::regex re(R"(]+):([^>]+)>)"); + static const std::vector valid_ext = {".pt", ".safetensors", ".gguf"}; + std::smatch m; + + std::string tmp = prompt; + + while (std::regex_search(tmp, m, re)) { + std::string raw_path = m[1].str(); + const std::string raw_mul = m[2].str(); + + float mul = 0.f; + try { + mul = std::stof(raw_mul); + } catch (...) { + tmp = m.suffix().str(); + prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only); + continue; + } + + bool is_high_noise = false; + static const std::string prefix = "|high_noise|"; + if (raw_path.rfind(prefix, 0) == 0) { + raw_path.erase(0, prefix.size()); + is_high_noise = true; + } + + fs::path final_path; + if (is_absolute_path(raw_path)) { + final_path = raw_path; + } else { + final_path = fs::path(lora_model_dir) / raw_path; + } + if (!fs::exists(final_path)) { + bool found = false; + for (const auto& ext : valid_ext) { + fs::path try_path = final_path; + try_path += ext; + if (fs::exists(try_path)) { + final_path = try_path; + found = true; + break; + } + } + if (!found) { + printf("can not found lora %s\n", final_path.lexically_normal().string().c_str()); + tmp = m.suffix().str(); + prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only); + continue; + } + } + + const std::string key = final_path.lexically_normal().string(); + + if (is_high_noise) + high_noise_lora_map[key] += mul; + else + lora_map[key] += mul; + + prompt = std::regex_replace(prompt, re, "", std::regex_constants::format_first_only); + + tmp = m.suffix().str(); + } + + for (const auto& kv : lora_map) { + sd_lora_t item; + item.is_high_noise = false; + item.path = kv.first.c_str(); + item.multiplier = kv.second; + lora_vec.emplace_back(item); + } + + for (const auto& kv : high_noise_lora_map) { + sd_lora_t item; + item.is_high_noise = true; + item.path = kv.first.c_str(); + item.multiplier = kv.second; + lora_vec.emplace_back(item); + } + } + + bool process_and_check(SDMode mode, const std::string& lora_model_dir) { + prompt_with_lora = prompt; + if (width <= 0) { + fprintf(stderr, "error: the width must be greater than 0\n"); + return false; + } + + if (height <= 0) { + fprintf(stderr, "error: the height must be greater than 0\n"); + return false; + } + + if (sample_params.sample_steps <= 0) { + fprintf(stderr, "error: the sample_steps must be greater than 0\n"); + return false; + } + + if (high_noise_sample_params.sample_steps <= 0) { + high_noise_sample_params.sample_steps = -1; + } + + if (strength < 0.f || strength > 1.f) { + fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); + return false; + } + + if (!easycache_option.empty()) { + float values[3] = {0.0f, 0.0f, 0.0f}; + std::stringstream ss(easycache_option); + std::string token; + int idx = 0; + while (std::getline(ss, token, ',')) { + auto trim = [](std::string& s) { + const char* whitespace = " \t\r\n"; + auto start = s.find_first_not_of(whitespace); + if (start == std::string::npos) { + s.clear(); + return; + } + auto end = s.find_last_not_of(whitespace); + s = s.substr(start, end - start + 1); + }; + trim(token); + if (token.empty()) { + fprintf(stderr, "error: invalid easycache option '%s'\n", easycache_option.c_str()); + return false; + } + if (idx >= 3) { + fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n"); + return false; + } + try { + values[idx] = std::stof(token); + } catch (const std::exception&) { + fprintf(stderr, "error: invalid easycache value '%s'\n", token.c_str()); + return false; + } + idx++; + } + if (idx != 3) { + fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n"); + return false; + } + if (values[0] < 0.0f) { + fprintf(stderr, "error: easycache threshold must be non-negative\n"); + return false; + } + if (values[1] < 0.0f || values[1] >= 1.0f || values[2] <= 0.0f || values[2] > 1.0f || values[1] >= values[2]) { + fprintf(stderr, "error: easycache start/end percents must satisfy 0.0 <= start < end <= 1.0\n"); + return false; + } + easycache_params.enabled = true; + easycache_params.reuse_threshold = values[0]; + easycache_params.start_percent = values[1]; + easycache_params.end_percent = values[2]; + } else { + easycache_params.enabled = false; + } + + sample_params.guidance.slg.layers = skip_layers.data(); + sample_params.guidance.slg.layer_count = skip_layers.size(); + sample_params.custom_sigmas = custom_sigmas.data(); + sample_params.custom_sigmas_count = static_cast(custom_sigmas.size()); + high_noise_sample_params.guidance.slg.layers = high_noise_skip_layers.data(); + high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size(); + + if (mode == VID_GEN && video_frames <= 0) { + return false; + } + + if (mode == VID_GEN && fps <= 0) { + return false; + } + + if (sample_params.shifted_timestep < 0 || sample_params.shifted_timestep > 1000) { + return false; + } + + if (upscale_repeats < 1) { + return false; + } + + if (upscale_tile_size < 1) { + return false; + } + + if (mode == UPSCALE) { + if (init_image_path.length() == 0) { + fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n"); + return false; + } + } + + if (seed < 0) { + srand((int)time(nullptr)); + seed = rand(); + } + + extract_and_remove_lora(lora_model_dir); + + return true; + } + + std::string to_string() const { + char* sample_params_str = sd_sample_params_to_str(&sample_params); + char* high_noise_sample_params_str = sd_sample_params_to_str(&high_noise_sample_params); + + std::ostringstream lora_ss; + lora_ss << "{\n"; + for (auto it = lora_map.begin(); it != lora_map.end(); ++it) { + lora_ss << " \"" << it->first << "\": \"" << it->second << "\""; + if (std::next(it) != lora_map.end()) { + lora_ss << ","; + } + lora_ss << "\n"; + } + lora_ss << " }"; + std::string loras_str = lora_ss.str(); + + lora_ss = std::ostringstream(); + ; + lora_ss << "{\n"; + for (auto it = high_noise_lora_map.begin(); it != high_noise_lora_map.end(); ++it) { + lora_ss << " \"" << it->first << "\": \"" << it->second << "\""; + if (std::next(it) != high_noise_lora_map.end()) { + lora_ss << ","; + } + lora_ss << "\n"; + } + lora_ss << " }"; + std::string high_noise_loras_str = lora_ss.str(); + + std::ostringstream oss; + oss << "SDGenerationParams {\n" + << " loras: \"" << loras_str << "\",\n" + << " high_noise_loras: \"" << high_noise_loras_str << "\",\n" + << " prompt: \"" << prompt << "\",\n" + << " negative_prompt: \"" << negative_prompt << "\",\n" + << " clip_skip: " << clip_skip << ",\n" + << " width: " << width << ",\n" + << " height: " << height << ",\n" + << " batch_count: " << batch_count << ",\n" + << " init_image_path: \"" << init_image_path << "\",\n" + << " end_image_path: \"" << end_image_path << "\",\n" + << " mask_image_path: \"" << mask_image_path << "\",\n" + << " control_image_path: \"" << control_image_path << "\",\n" + << " ref_image_paths: " << vec_str_to_string(ref_image_paths) << ",\n" + << " control_video_path: \"" << control_video_path << "\",\n" + << " auto_resize_ref_image: " << (auto_resize_ref_image ? "true" : "false") << ",\n" + << " increase_ref_index: " << (increase_ref_index ? "true" : "false") << ",\n" + << " pm_id_images_dir: \"" << pm_id_images_dir << "\",\n" + << " pm_id_embed_path: \"" << pm_id_embed_path << "\",\n" + << " pm_style_strength: " << pm_style_strength << ",\n" + << " skip_layers: " << vec_to_string(skip_layers) << ",\n" + << " sample_params: " << sample_params_str << ",\n" + << " high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n" + << " high_noise_sample_params: " << high_noise_sample_params_str << ",\n" + << " custom_sigmas: " << vec_to_string(custom_sigmas) << ",\n" + << " easycache_option: \"" << easycache_option << "\",\n" + << " easycache: " + << (easycache_params.enabled ? "enabled" : "disabled") + << " (threshold=" << easycache_params.reuse_threshold + << ", start=" << easycache_params.start_percent + << ", end=" << easycache_params.end_percent << "),\n" + << " moe_boundary: " << moe_boundary << ",\n" + << " video_frames: " << video_frames << ",\n" + << " fps: " << fps << ",\n" + << " vace_strength: " << vace_strength << ",\n" + << " strength: " << strength << ",\n" + << " control_strength: " << control_strength << ",\n" + << " seed: " << seed << ",\n" + << " upscale_repeats: " << upscale_repeats << ",\n" + << " upscale_tile_size: " << upscale_tile_size << ",\n" + << "}"; + free(sample_params_str); + free(high_noise_sample_params_str); + return oss.str(); + } +}; + +static std::string version_string() { + return std::string("stable-diffusion.cpp version ") + sd_version() + ", commit " + sd_commit(); +} + +uint8_t* load_image_common(bool from_memory, + const char* image_path_or_bytes, + int len, + int& width, + int& height, + int expected_width = 0, + int expected_height = 0, + int expected_channel = 3) { + int c = 0; + const char* image_path; + uint8_t* image_buffer = nullptr; + if (from_memory) { + image_path = "memory"; + image_buffer = (uint8_t*)stbi_load_from_memory((const stbi_uc*)image_path_or_bytes, len, &width, &height, &c, expected_channel); + } else { + image_path = image_path_or_bytes; + image_buffer = (uint8_t*)stbi_load(image_path_or_bytes, &width, &height, &c, expected_channel); + } + if (image_buffer == nullptr) { + fprintf(stderr, "load image from '%s' failed\n", image_path); + return nullptr; + } + if (c < expected_channel) { + fprintf(stderr, + "the number of channels for the input image must be >= %d," + "but got %d channels, image_path = %s\n", + expected_channel, + c, + image_path); + free(image_buffer); + return nullptr; + } + if (width <= 0) { + fprintf(stderr, "error: the width of image must be greater than 0, image_path = %s\n", image_path); + free(image_buffer); + return nullptr; + } + if (height <= 0) { + fprintf(stderr, "error: the height of image must be greater than 0, image_path = %s\n", image_path); + free(image_buffer); + return nullptr; + } + + // Resize input image ... + if ((expected_width > 0 && expected_height > 0) && (height != expected_height || width != expected_width)) { + float dst_aspect = (float)expected_width / (float)expected_height; + float src_aspect = (float)width / (float)height; + + int crop_x = 0, crop_y = 0; + int crop_w = width, crop_h = height; + + if (src_aspect > dst_aspect) { + crop_w = (int)(height * dst_aspect); + crop_x = (width - crop_w) / 2; + } else if (src_aspect < dst_aspect) { + crop_h = (int)(width / dst_aspect); + crop_y = (height - crop_h) / 2; + } + + if (crop_x != 0 || crop_y != 0) { + printf("crop input image from %dx%d to %dx%d, image_path = %s\n", width, height, crop_w, crop_h, image_path); + uint8_t* cropped_image_buffer = (uint8_t*)malloc(crop_w * crop_h * expected_channel); + if (cropped_image_buffer == nullptr) { + fprintf(stderr, "error: allocate memory for crop\n"); + free(image_buffer); + return nullptr; + } + for (int row = 0; row < crop_h; row++) { + uint8_t* src = image_buffer + ((crop_y + row) * width + crop_x) * expected_channel; + uint8_t* dst = cropped_image_buffer + (row * crop_w) * expected_channel; + memcpy(dst, src, crop_w * expected_channel); + } + + width = crop_w; + height = crop_h; + free(image_buffer); + image_buffer = cropped_image_buffer; + } + + printf("resize input image from %dx%d to %dx%d\n", width, height, expected_width, expected_height); + int resized_height = expected_height; + int resized_width = expected_width; + + uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * expected_channel); + if (resized_image_buffer == nullptr) { + fprintf(stderr, "error: allocate memory for resize input image\n"); + free(image_buffer); + return nullptr; + } + stbir_resize(image_buffer, width, height, 0, + resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8, + expected_channel, STBIR_ALPHA_CHANNEL_NONE, 0, + STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, + STBIR_FILTER_BOX, STBIR_FILTER_BOX, + STBIR_COLORSPACE_SRGB, nullptr); + width = resized_width; + height = resized_height; + free(image_buffer); + image_buffer = resized_image_buffer; + } + return image_buffer; +} + +uint8_t* load_image_from_file(const char* image_path, + int& width, + int& height, + int expected_width = 0, + int expected_height = 0, + int expected_channel = 3) { + return load_image_common(false, image_path, 0, width, height, expected_width, expected_height, expected_channel); +} + +uint8_t* load_image_from_memory(const char* image_bytes, + int len, + int& width, + int& height, + int expected_width = 0, + int expected_height = 0, + int expected_channel = 3) { + return load_image_common(true, image_bytes, len, width, height, expected_width, expected_height, expected_channel); +} diff --git a/otherarch/sdcpp/conditioner.hpp b/otherarch/sdcpp/conditioner.hpp index 403120d9b..45db314b9 100644 --- a/otherarch/sdcpp/conditioner.hpp +++ b/otherarch/sdcpp/conditioner.hpp @@ -56,20 +56,26 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::shared_ptr text_model2; std::string trigger_word = "img"; // should be user settable - std::string embd_dir; + std::map embedding_map; int32_t num_custom_embeddings = 0; int32_t num_custom_embeddings_2 = 0; std::vector token_embed_custom; - std::vector readed_embeddings; + std::map> embedding_pos_map; FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, bool offload_params_to_cpu, const String2TensorStorage& tensor_storage_map, - const std::string& embd_dir, + const std::map& orig_embedding_map, SDVersion version = VERSION_SD1, PMVersion pv = PM_VERSION_1) - : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) { - bool force_clip_f32 = embd_dir.size() > 0; + : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407) { + for (const auto& kv : orig_embedding_map) { + std::string name = kv.first; + std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); }); + embedding_map[name] = kv.second; + tokenizer.add_special_token(name); + } + bool force_clip_f32 = !embedding_map.empty(); if (sd_version_is_sd1(version)) { text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); } else if (sd_version_is_sd2(version)) { @@ -117,14 +123,17 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } bool load_embedding(std::string embd_name, std::string embd_path, std::vector& bpe_tokens) { - // the order matters ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(embd_path)) { LOG_ERROR("embedding '%s' failed", embd_name.c_str()); return false; } - if (std::find(readed_embeddings.begin(), readed_embeddings.end(), embd_name) != readed_embeddings.end()) { + auto iter = embedding_pos_map.find(embd_name); + if (iter != embedding_pos_map.end()) { LOG_DEBUG("embedding already read in: %s", embd_name.c_str()); + for (int i = iter->second.first; i < iter->second.second; i++) { + bpe_tokens.push_back(text_model->model.vocab_size + i); + } return true; } struct ggml_init_params params; @@ -155,7 +164,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return true; }; model_loader.load_tensors(on_load, 1); - readed_embeddings.push_back(embd_name); + int pos_start = num_custom_embeddings; if (embd) { int64_t hidden_size = text_model->model.hidden_size; token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd)); @@ -182,6 +191,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2); } + int pos_end = num_custom_embeddings; + if (pos_end == pos_start) { + return false; + } + embedding_pos_map[embd_name] = std::pair{pos_start, pos_end}; return true; } @@ -196,25 +210,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector convert_token_to_id(std::string text) { auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { - size_t word_end = str.find(","); - std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end); - embd_name = trim(embd_name); - std::string embd_path = get_full_path(embd_dir, embd_name + ".pt"); - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".ckpt"); + auto iter = embedding_map.find(str); + if (iter == embedding_map.end()) { + return false; } - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".safetensors"); - } - if (embd_path.size() > 0) { - if (load_embedding(embd_name, embd_path, bpe_tokens)) { - if (word_end != std::string::npos) { - str = str.substr(word_end); - } else { - str = ""; - } - return true; - } + std::string embedding_path = iter->second; + if (load_embedding(str, embedding_path, bpe_tokens)) { + return true; } return false; }; @@ -245,25 +247,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { - size_t word_end = str.find(","); - std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end); - embd_name = trim(embd_name); - std::string embd_path = get_full_path(embd_dir, embd_name + ".pt"); - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".ckpt"); + auto iter = embedding_map.find(str); + if (iter == embedding_map.end()) { + return false; } - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".safetensors"); - } - if (embd_path.size() > 0) { - if (load_embedding(embd_name, embd_path, bpe_tokens)) { - if (word_end != std::string::npos) { - str = str.substr(word_end); - } else { - str = ""; - } - return true; - } + std::string embedding_path = iter->second; + if (load_embedding(str, embedding_path, bpe_tokens)) { + return true; } return false; }; @@ -376,25 +366,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { } auto on_new_token_cb = [&](std::string& str, std::vector& bpe_tokens) -> bool { - size_t word_end = str.find(","); - std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end); - embd_name = trim(embd_name); - std::string embd_path = get_full_path(embd_dir, embd_name + ".pt"); - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".ckpt"); + auto iter = embedding_map.find(str); + if (iter == embedding_map.end()) { + return false; } - if (embd_path.size() == 0) { - embd_path = get_full_path(embd_dir, embd_name + ".safetensors"); - } - if (embd_path.size() > 0) { - if (load_embedding(embd_name, embd_path, bpe_tokens)) { - if (word_end != std::string::npos) { - str = str.substr(word_end); - } else { - str = ""; - } - return true; - } + std::string embedding_path = iter->second; + if (load_embedding(str, embedding_path, bpe_tokens)) { + return true; } return false; }; @@ -1638,7 +1616,7 @@ struct LLMEmbedder : public Conditioner { LLM::LLMArch arch = LLM::LLMArch::QWEN2_5_VL; if (sd_version_is_flux2(version)) { arch = LLM::LLMArch::MISTRAL_SMALL_3_2; - } else if (sd_version_is_z_image(version)) { + } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE) { arch = LLM::LLMArch::QWEN3; } if (arch == LLM::LLMArch::MISTRAL_SMALL_3_2) { @@ -1728,6 +1706,7 @@ struct LLMEmbedder : public Conditioner { std::vector> image_embeds; std::pair prompt_attn_range; int prompt_template_encode_start_idx = 34; + int max_length = 0; std::set out_layers; if (llm->enable_vision && conditioner_params.ref_images.size() > 0) { LOG_INFO("QwenImageEditPlusPipeline"); @@ -1825,6 +1804,17 @@ struct LLMEmbedder : public Conditioner { prompt_attn_range.second = prompt.size(); prompt += "[/INST]"; + } else if (version == VERSION_OVIS_IMAGE) { + prompt_template_encode_start_idx = 28; + max_length = prompt_template_encode_start_idx + 256; + + prompt = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:"; + + prompt_attn_range.first = static_cast(prompt.size()); + prompt += " " + conditioner_params.text; + prompt_attn_range.second = static_cast(prompt.size()); + + prompt += "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; } else { prompt_template_encode_start_idx = 34; @@ -1837,7 +1827,7 @@ struct LLMEmbedder : public Conditioner { prompt += "<|im_end|>\n<|im_start|>assistant\n"; } - auto tokens_and_weights = tokenize(prompt, prompt_attn_range, 0, false); + auto tokens_and_weights = tokenize(prompt, prompt_attn_range, max_length, max_length > 0); auto& tokens = std::get<0>(tokens_and_weights); auto& weights = std::get<1>(tokens_and_weights); @@ -1870,9 +1860,13 @@ struct LLMEmbedder : public Conditioner { GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx); - int64_t zero_pad_len = 0; + int64_t min_length = 0; if (sd_version_is_flux2(version)) { - int64_t min_length = 512; + min_length = 512; + } + + int64_t zero_pad_len = 0; + if (min_length > 0) { if (hidden_states->ne[1] - prompt_template_encode_start_idx < min_length) { zero_pad_len = min_length - hidden_states->ne[1] + prompt_template_encode_start_idx; } @@ -1892,6 +1886,8 @@ struct LLMEmbedder : public Conditioner { ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3); }); + // print_ggml_tensor(new_hidden_states); + int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); return {new_hidden_states, nullptr, nullptr}; diff --git a/otherarch/sdcpp/esrgan.hpp b/otherarch/sdcpp/esrgan.hpp index 4cac95686..961e84f89 100644 --- a/otherarch/sdcpp/esrgan.hpp +++ b/otherarch/sdcpp/esrgan.hpp @@ -156,9 +156,10 @@ struct ESRGAN : public GGMLRunner { ESRGAN(ggml_backend_t backend, bool offload_params_to_cpu, + int tile_size = 128, const String2TensorStorage& tensor_storage_map = {}) : GGMLRunner(backend, offload_params_to_cpu) { - // rrdb_net will be created in load_from_file + this->tile_size = tile_size; } std::string get_desc() override { diff --git a/otherarch/sdcpp/flux.hpp b/otherarch/sdcpp/flux.hpp index f0c65e3d7..1df2874ae 100644 --- a/otherarch/sdcpp/flux.hpp +++ b/otherarch/sdcpp/flux.hpp @@ -134,6 +134,54 @@ namespace Flux { } }; + struct MLP : public UnaryBlock { + bool use_mlp_silu_act; + + public: + MLP(int64_t hidden_size, int64_t intermediate_size, bool use_mlp_silu_act = false, bool bias = false) + : use_mlp_silu_act(use_mlp_silu_act) { + int64_t mlp_mult_factor = use_mlp_silu_act ? 2 : 1; + blocks["0"] = std::make_shared(hidden_size, intermediate_size * mlp_mult_factor, bias); + blocks["2"] = std::make_shared(intermediate_size, hidden_size, bias); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto mlp_0 = std::dynamic_pointer_cast(blocks["0"]); + auto mlp_2 = std::dynamic_pointer_cast(blocks["2"]); + + x = mlp_0->forward(ctx, x); + if (use_mlp_silu_act) { + x = ggml_ext_silu_act(ctx->ggml_ctx, x); + } else { + x = ggml_gelu_inplace(ctx->ggml_ctx, x); + } + x = mlp_2->forward(ctx, x); + return x; + } + }; + + struct YakMLP : public UnaryBlock { + public: + YakMLP(int64_t hidden_size, int64_t intermediate_size, bool bias = true) { + blocks["gate_proj"] = std::make_shared(hidden_size, intermediate_size, bias); + blocks["up_proj"] = std::make_shared(hidden_size, intermediate_size, bias); + blocks["down_proj"] = std::make_shared(intermediate_size, hidden_size, bias); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto gate_proj = std::dynamic_pointer_cast(blocks["gate_proj"]); + auto up_proj = std::dynamic_pointer_cast(blocks["up_proj"]); + auto down_proj = std::dynamic_pointer_cast(blocks["down_proj"]); + + auto gate = gate_proj->forward(ctx, x); + gate = ggml_silu_inplace(ctx->ggml_ctx, gate); + x = up_proj->forward(ctx, x); + x = ggml_mul(ctx->ggml_ctx, x, gate); + x = down_proj->forward(ctx, x); + return x; + } + }; + struct ModulationOut { ggml_tensor* shift = nullptr; ggml_tensor* scale = nullptr; @@ -199,7 +247,6 @@ namespace Flux { struct DoubleStreamBlock : public GGMLBlock { bool prune_mod; int idx = 0; - bool use_mlp_silu_act; public: DoubleStreamBlock(int64_t hidden_size, @@ -210,10 +257,10 @@ namespace Flux { bool prune_mod = false, bool share_modulation = false, bool mlp_proj_bias = true, + bool use_yak_mlp = false, bool use_mlp_silu_act = false) - : idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) { - int64_t mlp_hidden_dim = hidden_size * mlp_ratio; - int64_t mlp_mult_factor = use_mlp_silu_act ? 2 : 1; + : idx(idx), prune_mod(prune_mod) { + int64_t mlp_hidden_dim = hidden_size * mlp_ratio; if (!prune_mod && !share_modulation) { blocks["img_mod"] = std::shared_ptr(new Modulation(hidden_size, true)); @@ -222,9 +269,11 @@ namespace Flux { blocks["img_attn"] = std::shared_ptr(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias)); blocks["img_norm2"] = std::shared_ptr(new LayerNorm(hidden_size, 1e-6f, false)); - blocks["img_mlp.0"] = std::shared_ptr(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias)); - // img_mlp.1 is nn.GELU(approximate="tanh") - blocks["img_mlp.2"] = std::shared_ptr(new Linear(mlp_hidden_dim, hidden_size, mlp_proj_bias)); + if (use_yak_mlp) { + blocks["img_mlp"] = std::shared_ptr(new YakMLP(hidden_size, mlp_hidden_dim, mlp_proj_bias)); + } else { + blocks["img_mlp"] = std::shared_ptr(new MLP(hidden_size, mlp_hidden_dim, use_mlp_silu_act, mlp_proj_bias)); + } if (!prune_mod && !share_modulation) { blocks["txt_mod"] = std::shared_ptr(new Modulation(hidden_size, true)); @@ -233,9 +282,11 @@ namespace Flux { blocks["txt_attn"] = std::shared_ptr(new SelfAttention(hidden_size, num_heads, qkv_bias, mlp_proj_bias)); blocks["txt_norm2"] = std::shared_ptr(new LayerNorm(hidden_size, 1e-6f, false)); - blocks["txt_mlp.0"] = std::shared_ptr(new Linear(hidden_size, mlp_hidden_dim * mlp_mult_factor, mlp_proj_bias)); - // img_mlp.1 is nn.GELU(approximate="tanh") - blocks["txt_mlp.2"] = std::shared_ptr(new Linear(mlp_hidden_dim, hidden_size, mlp_proj_bias)); + if (use_yak_mlp) { + blocks["txt_mlp"] = std::shared_ptr(new YakMLP(hidden_size, mlp_hidden_dim, mlp_proj_bias)); + } else { + blocks["txt_mlp"] = std::shared_ptr(new MLP(hidden_size, mlp_hidden_dim, use_mlp_silu_act, mlp_proj_bias)); + } } std::vector get_distil_img_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) { @@ -272,15 +323,13 @@ namespace Flux { auto img_attn = std::dynamic_pointer_cast(blocks["img_attn"]); auto img_norm2 = std::dynamic_pointer_cast(blocks["img_norm2"]); - auto img_mlp_0 = std::dynamic_pointer_cast(blocks["img_mlp.0"]); - auto img_mlp_2 = std::dynamic_pointer_cast(blocks["img_mlp.2"]); + auto img_mlp = std::dynamic_pointer_cast(blocks["img_mlp"]); auto txt_norm1 = std::dynamic_pointer_cast(blocks["txt_norm1"]); auto txt_attn = std::dynamic_pointer_cast(blocks["txt_attn"]); auto txt_norm2 = std::dynamic_pointer_cast(blocks["txt_norm2"]); - auto txt_mlp_0 = std::dynamic_pointer_cast(blocks["txt_mlp.0"]); - auto txt_mlp_2 = std::dynamic_pointer_cast(blocks["txt_mlp.2"]); + auto txt_mlp = std::dynamic_pointer_cast(blocks["txt_mlp"]); if (img_mods.empty()) { if (prune_mod) { @@ -348,27 +397,15 @@ namespace Flux { // calculate the img bloks img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn->post_attention(ctx, img_attn_out), img_mod1.gate)); - auto img_mlp_out = img_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale)); - if (use_mlp_silu_act) { - img_mlp_out = ggml_ext_silu_act(ctx->ggml_ctx, img_mlp_out); - } else { - img_mlp_out = ggml_gelu_inplace(ctx->ggml_ctx, img_mlp_out); - } - img_mlp_out = img_mlp_2->forward(ctx, img_mlp_out); + auto img_mlp_out = img_mlp->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale)); img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_mod2.gate)); // calculate the txt bloks txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn->post_attention(ctx, txt_attn_out), txt_mod1.gate)); - auto txt_mlp_out = txt_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale)); - if (use_mlp_silu_act) { - txt_mlp_out = ggml_ext_silu_act(ctx->ggml_ctx, txt_mlp_out); - } else { - txt_mlp_out = ggml_gelu_inplace(ctx->ggml_ctx, txt_mlp_out); - } - txt_mlp_out = txt_mlp_2->forward(ctx, txt_mlp_out); - txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate)); + auto txt_mlp_out = txt_mlp->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale)); + txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate)); return {img, txt}; } @@ -381,6 +418,7 @@ namespace Flux { int64_t mlp_hidden_dim; bool prune_mod; int idx = 0; + bool use_yak_mlp; bool use_mlp_silu_act; int64_t mlp_mult_factor; @@ -393,8 +431,9 @@ namespace Flux { bool prune_mod = false, bool share_modulation = false, bool mlp_proj_bias = true, + bool use_yak_mlp = false, bool use_mlp_silu_act = false) - : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_mlp_silu_act(use_mlp_silu_act) { + : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod), use_yak_mlp(use_yak_mlp), use_mlp_silu_act(use_mlp_silu_act) { int64_t head_dim = hidden_size / num_heads; float scale = qk_scale; if (scale <= 0.f) { @@ -402,7 +441,7 @@ namespace Flux { } mlp_hidden_dim = hidden_size * mlp_ratio; mlp_mult_factor = 1; - if (use_mlp_silu_act) { + if (use_yak_mlp || use_mlp_silu_act) { mlp_mult_factor = 2; } @@ -481,7 +520,9 @@ namespace Flux { k = norm->key_norm(ctx, k); auto attn = Rope::attention(ctx, q, k, v, pe, mask); // [N, n_token, hidden_size] - if (use_mlp_silu_act) { + if (use_yak_mlp) { + mlp = ggml_ext_silu_act(ctx->ggml_ctx, mlp, false); + } else if (use_mlp_silu_act) { mlp = ggml_ext_silu_act(ctx->ggml_ctx, mlp); } else { mlp = ggml_gelu_inplace(ctx->ggml_ctx, mlp); @@ -726,6 +767,8 @@ namespace Flux { int64_t in_dim = 64; bool disable_bias = false; bool share_modulation = false; + bool semantic_txt_norm = false; + bool use_yak_mlp = false; bool use_mlp_silu_act = false; float ref_index_scale = 1.f; ChromaRadianceParams chroma_radiance_params; @@ -759,6 +802,9 @@ namespace Flux { blocks["guidance_in"] = std::make_shared(256, params.hidden_size, !params.disable_bias); } } + if (params.semantic_txt_norm) { + blocks["txt_norm"] = std::make_shared(params.context_in_dim); + } blocks["txt_in"] = std::make_shared(params.context_in_dim, params.hidden_size, !params.disable_bias); for (int i = 0; i < params.depth; i++) { @@ -770,6 +816,7 @@ namespace Flux { params.is_chroma, params.share_modulation, !params.disable_bias, + params.use_yak_mlp, params.use_mlp_silu_act); } @@ -782,6 +829,7 @@ namespace Flux { params.is_chroma, params.share_modulation, !params.disable_bias, + params.use_yak_mlp, params.use_mlp_silu_act); } @@ -948,6 +996,12 @@ namespace Flux { ss_mods = single_stream_modulation->forward(ctx, vec); } + if (params.semantic_txt_norm) { + auto semantic_txt_norm = std::dynamic_pointer_cast(blocks["txt_norm"]); + + txt = semantic_txt_norm->forward(ctx, txt); + } + txt = txt_in->forward(ctx, txt); for (int i = 0; i < params.depth; i++) { @@ -1206,6 +1260,11 @@ namespace Flux { } else if (version == VERSION_CHROMA_RADIANCE) { flux_params.in_channels = 3; flux_params.patch_size = 16; + } else if (version == VERSION_OVIS_IMAGE) { + flux_params.semantic_txt_norm = true; + flux_params.use_yak_mlp = true; + flux_params.context_in_dim = 2048; + flux_params.vec_in_dim = 0; } else if (sd_version_is_flux2(version)) { flux_params.context_in_dim = 15360; flux_params.in_channels = 128; @@ -1364,13 +1423,22 @@ namespace Flux { ref_latents[i] = to_backend(ref_latents[i]); } + std::set txt_arange_dims; + if (sd_version_is_flux2(version)) { + txt_arange_dims = {3}; + increase_ref_index = true; + } else if (version == VERSION_OVIS_IMAGE) { + txt_arange_dims = {1, 2}; + } + pe_vec = Rope::gen_flux_pe(x->ne[1], x->ne[0], flux_params.patch_size, x->ne[3], context->ne[1], + txt_arange_dims, ref_latents, - sd_version_is_flux2(version) ? true : increase_ref_index, + increase_ref_index, flux_params.ref_index_scale, flux_params.theta, flux_params.axes_dim); diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp index 3669b17ba..f76aaef42 100644 --- a/otherarch/sdcpp/ggml_extend.hpp +++ b/otherarch/sdcpp/ggml_extend.hpp @@ -19,7 +19,6 @@ #include #include #include -#include #include "ggml-alloc.h" #include "ggml-backend.h" @@ -61,6 +60,14 @@ #define SD_UNUSED(x) (void)(x) #endif +__STATIC_INLINE__ int align_up_offset(int n, int multiple) { + return (multiple - n % multiple) % multiple; +} + +__STATIC_INLINE__ int align_up(int n, int multiple) { + return n + align_up_offset(n, multiple); +} + __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) { switch (level) { case GGML_LOG_LEVEL_DEBUG: @@ -289,12 +296,7 @@ __STATIC_INLINE__ void ggml_ext_tensor_diff( } __STATIC_INLINE__ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_path) { - #ifdef _WIN32 - std::filesystem::path fpath = std::filesystem::u8path(file_path); - #else - std::filesystem::path fpath = std::filesystem::path(file_path); - #endif - std::ifstream file(fpath, std::ios::binary); + std::ifstream file(sd_get_u8path(file_path), std::ios::binary); if (!file.is_open()) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return nullptr; @@ -730,34 +732,22 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_slice(struct ggml_context* ctx, __STATIC_INLINE__ std::vector ggml_ext_chunk(struct ggml_context* ctx, struct ggml_tensor* x, int num, - int64_t dim) { + int64_t dim, + bool cont = true) { GGML_ASSERT(dim >= 0 && dim < 4); GGML_ASSERT(x->ne[dim] % num == 0); - int perm[4] = {0, 1, 2, 3}; - for (int i = dim; i < 3; ++i) - perm[i] = perm[i + 1]; - perm[3] = dim; - - int inv_perm[4]; - for (int i = 0; i < 4; ++i) - inv_perm[perm[i]] = i; - - if (dim != 3) { - x = ggml_ext_torch_permute(ctx, x, perm[0], perm[1], perm[2], perm[3]); - x = ggml_cont(ctx, x); - } - std::vector chunks; - int64_t chunk_size = x->ne[3] / num; + int64_t chunk_size = x->ne[dim] / num; + int64_t stride = chunk_size * x->nb[dim]; + int64_t chunk_ne[4] = {x->ne[0], x->ne[1], x->ne[2], x->ne[3]}; + chunk_ne[dim] = chunk_size; for (int i = 0; i < num; i++) { auto chunk = ggml_view_4d( ctx, x, - x->ne[0], x->ne[1], x->ne[2], chunk_size, - x->nb[1], x->nb[2], x->nb[3], x->nb[3] * i * chunk_size); - - if (dim != 3) { - chunk = ggml_ext_torch_permute(ctx, chunk, inv_perm[0], inv_perm[1], inv_perm[2], inv_perm[3]); + chunk_ne[0], chunk_ne[1], chunk_ne[2], chunk_ne[3], + x->nb[1], x->nb[2], x->nb[3], stride * i); + if (cont) { chunk = ggml_cont(ctx, chunk); } chunks.push_back(chunk); @@ -766,17 +756,23 @@ __STATIC_INLINE__ std::vector ggml_ext_chunk(struct ggml_co return chunks; } -__STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor* x) { +__STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor* x, bool gate_first = true) { // x: [ne3, ne2, ne1, ne0] // return: [ne3, ne2, ne1, ne0/2] - auto x_vec = ggml_ext_chunk(ctx, x, 2, 0); - auto x1 = x_vec[0]; // [ne3, ne2, ne1, ne0/2] - auto x2 = x_vec[1]; // [ne3, ne2, ne1, ne0/2] + auto x_vec = ggml_ext_chunk(ctx, x, 2, 0, false); + ggml_tensor* gate; + if (gate_first) { + gate = x_vec[0]; + x = x_vec[1]; + } else { + x = x_vec[0]; + gate = x_vec[1]; + } + gate = ggml_cont(ctx, gate); + gate = ggml_silu_inplace(ctx, gate); - x1 = ggml_silu_inplace(ctx, x1); - - x = ggml_mul(ctx, x1, x2); // [ne3, ne2, ne1, ne0/2] + x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, ne0/2] return x; } @@ -1274,6 +1270,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context } if (mask_in != nullptr) { + // the need for padding got removed in ggml 4767bda + // ensure we can still use the old version for now +#ifdef GGML_KQ_MASK_PAD int mask_pad = 0; if (mask_in->ne[1] % GGML_KQ_MASK_PAD != 0) { mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1]; @@ -1281,6 +1280,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context if (mask_pad > 0) { mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0); } +#endif mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16); } @@ -1392,10 +1392,14 @@ __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backe } __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) { - GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32); + GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_I32 || tensor->type == GGML_TYPE_BF16); float value; if (tensor->type == GGML_TYPE_F32) { ggml_backend_tensor_get(tensor, &value, 0, sizeof(value)); + } else if (tensor->type == GGML_TYPE_BF16) { + ggml_bf16_t bf16_value; + ggml_backend_tensor_get(tensor, &bf16_value, 0, sizeof(bf16_value)); + value = ggml_bf16_to_fp32(bf16_value); } else if (tensor->type == GGML_TYPE_F16) { ggml_fp16_t f16_value; ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value)); diff --git a/otherarch/sdcpp/gguf_reader.hpp b/otherarch/sdcpp/gguf_reader.hpp index 53482662e..edf5899a7 100644 --- a/otherarch/sdcpp/gguf_reader.hpp +++ b/otherarch/sdcpp/gguf_reader.hpp @@ -171,7 +171,7 @@ private: public: bool load(const std::string& file_path) { - std::ifstream fin(file_path, std::ios::binary); + std::ifstream fin(sd_get_u8path(file_path), std::ios::binary); if (!fin) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; diff --git a/otherarch/sdcpp/latent-preview.h b/otherarch/sdcpp/latent-preview.h index 97409a7d8..2c54c3b5e 100644 --- a/otherarch/sdcpp/latent-preview.h +++ b/otherarch/sdcpp/latent-preview.h @@ -91,6 +91,41 @@ const float flux_latent_rgb_proj[16][3] = { {-0.111849f, -0.055589f, -0.032361f}}; float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f}; +const float flux2_latent_rgb_proj[32][3] = { + {0.000736f, -0.008385f, -0.019710f}, + {-0.001352f, -0.016392f, 0.020693f}, + {-0.006376f, 0.002428f, 0.036736f}, + {0.039384f, 0.074167f, 0.119789f}, + {0.007464f, -0.005705f, -0.004734f}, + {-0.004086f, 0.005287f, -0.000409f}, + {-0.032835f, 0.050802f, -0.028120f}, + {-0.003158f, -0.000835f, 0.000406f}, + {-0.112840f, -0.084337f, -0.023083f}, + {0.001462f, -0.006656f, 0.000549f}, + {-0.009980f, -0.007480f, 0.009702f}, + {0.032540f, 0.000214f, -0.061388f}, + {0.011023f, 0.000694f, 0.007143f}, + {-0.001468f, -0.006723f, -0.001678f}, + {-0.005921f, -0.010320f, -0.003907f}, + {-0.028434f, 0.027584f, 0.018457f}, + {0.014349f, 0.011523f, 0.000441f}, + {0.009874f, 0.003081f, 0.001507f}, + {0.002218f, 0.005712f, 0.001563f}, + {0.053010f, -0.019844f, 0.008683f}, + {-0.002507f, 0.005384f, 0.000938f}, + {-0.002177f, -0.011366f, 0.003559f}, + {-0.000261f, 0.015121f, -0.003240f}, + {-0.003944f, -0.002083f, 0.005043f}, + {-0.009138f, 0.011336f, 0.003781f}, + {0.011429f, 0.003985f, -0.003855f}, + {0.010518f, -0.005586f, 0.010131f}, + {0.007883f, 0.002912f, -0.001473f}, + {-0.003318f, -0.003160f, 0.003684f}, + {-0.034560f, -0.008740f, 0.012996f}, + {0.000166f, 0.001079f, -0.012153f}, + {0.017772f, 0.000937f, -0.011953f}}; +float flux2_latent_rgb_bias[3] = {-0.028738f, -0.098463f, -0.107619f}; + // This one was taken straight from // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303 // (MiT Licence) @@ -128,16 +163,42 @@ const float sd_latent_rgb_proj[4][3] = { {-0.178022f, -0.200862f, -0.678514f}}; float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f}; -void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) { +void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int patch_size) { size_t buffer_head = 0; + + uint32_t latent_width = latents->ne[0]; + uint32_t latent_height = latents->ne[1]; + uint32_t dim = latents->ne[ggml_n_dims(latents) - 1]; + uint32_t frames = 1; + if (ggml_n_dims(latents) == 4) { + frames = latents->ne[2]; + } + + uint32_t rgb_width = latent_width * patch_size; + uint32_t rgb_height = latent_height * patch_size; + + uint32_t unpatched_dim = dim / (patch_size * patch_size); + for (int k = 0; k < frames; k++) { - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]); + for (int rgb_x = 0; rgb_x < rgb_width; rgb_x++) { + for (int rgb_y = 0; rgb_y < rgb_height; rgb_y++) { + int latent_x = rgb_x / patch_size; + int latent_y = rgb_y / patch_size; + + int channel_offset = 0; + if (patch_size > 1) { + channel_offset = ((rgb_y % patch_size) * patch_size + (rgb_x % patch_size)); + } + + size_t latent_id = (latent_x * latents->nb[0] + latent_y * latents->nb[1] + k * latents->nb[2]); + + // should be incremented by 1 for each pixel + size_t pixel_id = k * rgb_width * rgb_height + rgb_y * rgb_width + rgb_x; + float r = 0, g = 0, b = 0; if (latent_rgb_proj != nullptr) { - for (int d = 0; d < dim; d++) { - float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]); + for (int d = 0; d < unpatched_dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + (d * patch_size * patch_size + channel_offset) * latents->nb[ggml_n_dims(latents) - 1]); r += value * latent_rgb_proj[d][0]; g += value * latent_rgb_proj[d][1]; b += value * latent_rgb_proj[d][2]; @@ -164,9 +225,9 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl g = g >= 0 ? g <= 1 ? g : 1 : 0; b = b >= 0 ? b <= 1 ? b : 1 : 0; - buffer[buffer_head++] = (uint8_t)(r * 255); - buffer[buffer_head++] = (uint8_t)(g * 255); - buffer[buffer_head++] = (uint8_t)(b * 255); + buffer[pixel_id * 3 + 0] = (uint8_t)(r * 255); + buffer[pixel_id * 3 + 1] = (uint8_t)(g * 255); + buffer[pixel_id * 3 + 2] = (uint8_t)(b * 255); } } } diff --git a/otherarch/sdcpp/llm.hpp b/otherarch/sdcpp/llm.hpp index aa1e46a2e..f8c03add0 100644 --- a/otherarch/sdcpp/llm.hpp +++ b/otherarch/sdcpp/llm.hpp @@ -356,6 +356,10 @@ namespace LLM { "<|fim_pad|>", "<|repo_name|>", "<|file_sep|>", + "", + "", + "", + "", }; if (merges_utf8_str.size() > 0) { @@ -859,11 +863,11 @@ namespace LLM { } if (arch == LLMArch::MISTRAL_SMALL_3_2) { - q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 131072, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); - k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 131072, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 8192, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NORMAL, 8192, 1000000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); } else if (arch == LLMArch::QWEN3) { - q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 151936, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); - k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 151936, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); + k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, 128, GGML_ROPE_TYPE_NEOX, 40960, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); } else { int sections[4] = {16, 24, 24, 0}; q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); @@ -1073,29 +1077,22 @@ namespace LLM { : GGMLRunner(backend, offload_params_to_cpu), enable_vision(enable_vision_) { params.arch = arch; if (arch == LLMArch::MISTRAL_SMALL_3_2) { - params.num_layers = 40; - params.hidden_size = 5120; - params.intermediate_size = 32768; - params.head_dim = 128; - params.num_heads = 32; - params.num_kv_heads = 8; - params.qkv_bias = false; - params.vocab_size = 131072; - params.rms_norm_eps = 1e-5f; + params.head_dim = 128; + params.num_heads = 32; + params.num_kv_heads = 8; + params.qkv_bias = false; + params.rms_norm_eps = 1e-5f; } else if (arch == LLMArch::QWEN3) { - params.num_layers = 36; - params.hidden_size = 2560; - params.intermediate_size = 9728; - params.head_dim = 128; - params.num_heads = 32; - params.num_kv_heads = 8; - params.qkv_bias = false; - params.qk_norm = true; - params.vocab_size = 151936; - params.rms_norm_eps = 1e-6f; + params.head_dim = 128; + params.num_heads = 32; + params.num_kv_heads = 8; + params.qkv_bias = false; + params.qk_norm = true; + params.rms_norm_eps = 1e-6f; } bool have_vision_weight = false; bool llama_cpp_style = false; + params.num_layers = 0; for (auto pair : tensor_storage_map) { std::string tensor_name = pair.first; if (tensor_name.find(prefix) == std::string::npos) @@ -1105,10 +1102,36 @@ namespace LLM { have_vision_weight = true; if (contains(tensor_name, "attn.q_proj")) { llama_cpp_style = true; - break; + } + continue; + } + pos = tensor_name.find("layers."); + if (pos != std::string::npos) { + tensor_name = tensor_name.substr(pos); // remove prefix + auto items = split_string(tensor_name, '.'); + if (items.size() > 1) { + int block_index = atoi(items[1].c_str()); + if (block_index + 1 > params.num_layers) { + params.num_layers = block_index + 1; + } } } + if (contains(tensor_name, "embed_tokens.weight")) { + params.hidden_size = pair.second.ne[0]; + params.vocab_size = pair.second.ne[1]; + } + if (contains(tensor_name, "layers.0.mlp.gate_proj.weight")) { + params.intermediate_size = pair.second.ne[1]; + } } + if (arch == LLMArch::QWEN3 && params.num_layers == 28) { // Qwen3 2B + params.num_heads = 16; + } + LOG_DEBUG("llm: num_layers = %" PRId64 ", vocab_size = %" PRId64 ", hidden_size = %" PRId64 ", intermediate_size = %" PRId64, + params.num_layers, + params.vocab_size, + params.hidden_size, + params.intermediate_size); if (enable_vision && !have_vision_weight) { LOG_WARN("no vision weights detected, vision disabled"); enable_vision = false; diff --git a/otherarch/sdcpp/main.cpp b/otherarch/sdcpp/main.cpp index c58da7940..e37434b71 100644 --- a/otherarch/sdcpp/main.cpp +++ b/otherarch/sdcpp/main.cpp @@ -15,38 +15,10 @@ // #include "preprocessing.hpp" #include "stable-diffusion.h" -#define STB_IMAGE_IMPLEMENTATION -//#define STB_IMAGE_STATIC -#include "stb_image.h" - -#define STB_IMAGE_WRITE_IMPLEMENTATION -//#define STB_IMAGE_WRITE_STATIC -#include "stb_image_write.h" - -#define STB_IMAGE_RESIZE_IMPLEMENTATION -//#define STB_IMAGE_RESIZE_STATIC -#include "stb_image_resize.h" +#include "common/common.hpp" #include "avi_writer.h" -#if defined(_WIN32) -#define NOMINMAX -#include -#endif // _WIN32 - -#define SAFE_STR(s) ((s) ? (s) : "") -#define BOOL_STR(b) ((b) ? "true" : "false") - -namespace fs = std::filesystem; - -const char* modes_str[] = { - "img_gen", - "vid_gen", - "convert", - "upscale", -}; -#define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale" - const char* previews_str[] = { "none", "proj", @@ -54,276 +26,12 @@ const char* previews_str[] = { "vae", }; -enum SDMode { - IMG_GEN, - VID_GEN, - CONVERT, - UPSCALE, - MODE_COUNT -}; - -#if defined(_WIN32) -static std::string utf16_to_utf8(const std::wstring& wstr) { - if (wstr.empty()) - return {}; - int size_needed = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(), - nullptr, 0, nullptr, nullptr); - if (size_needed <= 0) - throw std::runtime_error("UTF-16 to UTF-8 conversion failed"); - - std::string utf8(size_needed, 0); - WideCharToMultiByte(CP_UTF8, 0, wstr.data(), (int)wstr.size(), - (char*)utf8.data(), size_needed, nullptr, nullptr); - return utf8; -} - -static std::string argv_to_utf8(int index, const char** argv) { - int argc; - wchar_t** argv_w = CommandLineToArgvW(GetCommandLineW(), &argc); - if (!argv_w) - throw std::runtime_error("Failed to parse command line"); - - std::string result; - if (index < argc) { - result = utf16_to_utf8(argv_w[index]); - } - LocalFree(argv_w); - return result; -} - -#else // Linux / macOS -static std::string argv_to_utf8(int index, const char** argv) { - return std::string(argv[index]); -} - -#endif - -struct StringOption { - std::string short_name; - std::string long_name; - std::string desc; - std::string* target; -}; - -struct IntOption { - std::string short_name; - std::string long_name; - std::string desc; - int* target; -}; - -struct FloatOption { - std::string short_name; - std::string long_name; - std::string desc; - float* target; -}; - -struct BoolOption { - std::string short_name; - std::string long_name; - std::string desc; - bool keep_true; - bool* target; -}; - -struct ManualOption { - std::string short_name; - std::string long_name; - std::string desc; - std::function cb; -}; - -struct ArgOptions { - std::vector string_options; - std::vector int_options; - std::vector float_options; - std::vector bool_options; - std::vector manual_options; - - static std::string wrap_text(const std::string& text, size_t width, size_t indent) { - std::ostringstream oss; - size_t line_len = 0; - size_t pos = 0; - - while (pos < text.size()) { - // Preserve manual newlines - if (text[pos] == '\n') { - oss << '\n' - << std::string(indent, ' '); - line_len = indent; - ++pos; - continue; - } - - // Add the character - oss << text[pos]; - ++line_len; - ++pos; - - // If the current line exceeds width, try to break at the last space - if (line_len >= width) { - std::string current = oss.str(); - size_t back = current.size(); - - // Find the last space (for a clean break) - while (back > 0 && current[back - 1] != ' ' && current[back - 1] != '\n') - --back; - - // If found a space to break on - if (back > 0 && current[back - 1] != '\n') { - std::string before = current.substr(0, back - 1); - std::string after = current.substr(back); - oss.str(""); - oss.clear(); - oss << before << "\n" - << std::string(indent, ' ') << after; - } else { - // If no space found, just break at width - oss << "\n" - << std::string(indent, ' '); - } - line_len = indent; - } - } - - return oss.str(); - } - - void print() const { - constexpr size_t max_line_width = 120; - - struct Entry { - std::string names; - std::string desc; - }; - std::vector entries; - - auto add_entry = [&](const std::string& s, const std::string& l, - const std::string& desc, const std::string& hint = "") { - std::ostringstream ss; - if (!s.empty()) - ss << s; - if (!s.empty() && !l.empty()) - ss << ", "; - if (!l.empty()) - ss << l; - if (!hint.empty()) - ss << " " << hint; - entries.push_back({ss.str(), desc}); - }; - - for (auto& o : string_options) - add_entry(o.short_name, o.long_name, o.desc, ""); - for (auto& o : int_options) - add_entry(o.short_name, o.long_name, o.desc, ""); - for (auto& o : float_options) - add_entry(o.short_name, o.long_name, o.desc, ""); - for (auto& o : bool_options) - add_entry(o.short_name, o.long_name, o.desc, ""); - for (auto& o : manual_options) - add_entry(o.short_name, o.long_name, o.desc); - - size_t max_name_width = 0; - for (auto& e : entries) - max_name_width = std::max(max_name_width, e.names.size()); - - for (auto& e : entries) { - size_t indent = 2 + max_name_width + 4; - size_t desc_width = (max_line_width > indent ? max_line_width - indent : 40); - std::string wrapped_desc = wrap_text(e.desc, max_line_width, indent); - std::cout << " " << std::left << std::setw(static_cast(max_name_width) + 4) - << e.names << wrapped_desc << "\n"; - } - } -}; - -bool parse_options(int argc, const char** argv, const std::vector& options_list) { - bool invalid_arg = false; - std::string arg; - - auto match_and_apply = [&](auto& opts, auto&& apply_fn) -> bool { - for (auto& option : opts) { - if ((option.short_name.size() > 0 && arg == option.short_name) || - (option.long_name.size() > 0 && arg == option.long_name)) { - apply_fn(option); - return true; - } - } - return false; - }; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - bool found_arg = false; - - for (auto& options : options_list) { - if (match_and_apply(options.string_options, [&](auto& option) { - if (++i >= argc) { - invalid_arg = true; - return; - } - *option.target = argv_to_utf8(i, argv); - found_arg = true; - })) - break; - - if (match_and_apply(options.int_options, [&](auto& option) { - if (++i >= argc) { - invalid_arg = true; - return; - } - *option.target = std::stoi(argv[i]); - found_arg = true; - })) - break; - - if (match_and_apply(options.float_options, [&](auto& option) { - if (++i >= argc) { - invalid_arg = true; - return; - } - *option.target = std::stof(argv[i]); - found_arg = true; - })) - break; - - if (match_and_apply(options.bool_options, [&](auto& option) { - *option.target = option.keep_true ? true : false; - found_arg = true; - })) - break; - - if (match_and_apply(options.manual_options, [&](auto& option) { - int ret = option.cb(argc, argv, i); - if (ret < 0) { - invalid_arg = true; - return; - } - i += ret; - found_arg = true; - })) - break; - } - - if (invalid_arg) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - return false; - } - if (!found_arg) { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - return false; - } - } - - return true; -} - struct SDCliParams { SDMode mode = IMG_GEN; std::string output_path = "output.png"; bool verbose = false; + bool version = false; bool canny_preprocess = false; preview_t preview_method = PREVIEW_NONE; @@ -366,6 +74,10 @@ struct SDCliParams { "--verbose", "print extra info", true, &verbose}, + {"", + "--version", + "print stable-diffusion.cpp version", + true, &version}, {"", "--color", "colors the logging tags according to level", @@ -480,1066 +192,8 @@ struct SDCliParams { } }; -struct SDContextParams { - int n_threads = -1; - std::string model_path; - std::string clip_l_path; - std::string clip_g_path; - std::string clip_vision_path; - std::string t5xxl_path; - std::string llm_path; - std::string llm_vision_path; - std::string diffusion_model_path; - std::string high_noise_diffusion_model_path; - std::string vae_path; - std::string taesd_path; - std::string esrgan_path; - std::string control_net_path; - std::string embedding_dir; - std::string photo_maker_path; - sd_type_t wtype = SD_TYPE_COUNT; - std::string tensor_type_rules; - std::string lora_model_dir; - - rng_type_t rng_type = CUDA_RNG; - rng_type_t sampler_rng_type = RNG_TYPE_COUNT; - bool offload_params_to_cpu = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; - bool diffusion_flash_attn = false; - bool diffusion_conv_direct = false; - bool vae_conv_direct = false; - - bool chroma_use_dit_mask = true; - bool chroma_use_t5_mask = false; - int chroma_t5_mask_pad = 1; - - prediction_t prediction = PREDICTION_COUNT; - lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; - - sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f}; - bool force_sdxl_vae_conv_scale = false; - - float flow_shift = INFINITY; - - ArgOptions get_options() { - ArgOptions options; - options.string_options = { - {"-m", - "--model", - "path to full model", - &model_path}, - {"", - "--clip_l", - "path to the clip-l text encoder", &clip_l_path}, - {"", "--clip_g", - "path to the clip-g text encoder", - &clip_g_path}, - {"", - "--clip_vision", - "path to the clip-vision encoder", - &clip_vision_path}, - {"", - "--t5xxl", - "path to the t5xxl text encoder", - &t5xxl_path}, - {"", - "--llm", - "path to the llm text encoder. For example: (qwenvl2.5 for qwen-image, mistral-small3.2 for flux2, ...)", - &llm_path}, - {"", - "--llm_vision", - "path to the llm vit", - &llm_vision_path}, - {"", - "--qwen2vl", - "alias of --llm. Deprecated.", - &llm_path}, - {"", - "--qwen2vl_vision", - "alias of --llm_vision. Deprecated.", - &llm_vision_path}, - {"", - "--diffusion-model", - "path to the standalone diffusion model", - &diffusion_model_path}, - {"", - "--high-noise-diffusion-model", - "path to the standalone high noise diffusion model", - &high_noise_diffusion_model_path}, - {"", - "--vae", - "path to standalone vae model", - &vae_path}, - {"", - "--taesd", - "path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)", - &taesd_path}, - {"", - "--control-net", - "path to control net model", - &control_net_path}, - {"", - "--embd-dir", - "embeddings directory", - &embedding_dir}, - {"", - "--lora-model-dir", - "lora model directory", - &lora_model_dir}, - - {"", - "--tensor-type-rules", - "weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")", - &tensor_type_rules}, - {"", - "--photo-maker", - "path to PHOTOMAKER model", - &photo_maker_path}, - {"", - "--upscale-model", - "path to esrgan model.", - &esrgan_path}, - }; - - options.int_options = { - {"-t", - "--threads", - "number of threads to use during computation (default: -1). " - "If threads <= 0, then threads will be set to the number of CPU physical cores", - &n_threads}, - {"", - "--chroma-t5-mask-pad", - "t5 mask pad size of chroma", - &chroma_t5_mask_pad}, - }; - - options.float_options = { - {"", - "--vae-tile-overlap", - "tile overlap for vae tiling, in fraction of tile size (default: 0.5)", - &vae_tiling_params.target_overlap}, - {"", - "--flow-shift", - "shift value for Flow models like SD3.x or WAN (default: auto)", - &flow_shift}, - }; - - options.bool_options = { - {"", - "--vae-tiling", - "process vae in tiles to reduce memory usage", - true, &vae_tiling_params.enabled}, - {"", - "--force-sdxl-vae-conv-scale", - "force use of conv scale on sdxl vae", - true, &force_sdxl_vae_conv_scale}, - {"", - "--offload-to-cpu", - "place the weights in RAM to save VRAM, and automatically load them into VRAM when needed", - true, &offload_params_to_cpu}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, &control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, &clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, &vae_on_cpu}, - {"", - "--diffusion-fa", - "use flash attention in the diffusion model", - true, &diffusion_flash_attn}, - {"", - "--diffusion-conv-direct", - "use ggml_conv2d_direct in the diffusion model", - true, &diffusion_conv_direct}, - {"", - "--vae-conv-direct", - "use ggml_conv2d_direct in the vae model", - true, &vae_conv_direct}, - {"", - "--chroma-disable-dit-mask", - "disable dit mask for chroma", - false, &chroma_use_dit_mask}, - {"", - "--chroma-enable-t5-mask", - "enable t5 mask for chroma", - true, &chroma_use_t5_mask}, - }; - - auto on_type_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - wtype = str_to_sd_type(arg); - if (wtype == SD_TYPE_COUNT) { - fprintf(stderr, "error: invalid weight format %s\n", - arg); - return -1; - } - return 1; - }; - - auto on_rng_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - rng_type = str_to_rng_type(arg); - if (rng_type == RNG_TYPE_COUNT) { - fprintf(stderr, "error: invalid rng type %s\n", - arg); - return -1; - } - return 1; - }; - - auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - sampler_rng_type = str_to_rng_type(arg); - if (sampler_rng_type == RNG_TYPE_COUNT) { - fprintf(stderr, "error: invalid sampler rng type %s\n", - arg); - return -1; - } - return 1; - }; - - auto on_prediction_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - prediction = str_to_prediction(arg); - if (prediction == PREDICTION_COUNT) { - fprintf(stderr, "error: invalid prediction type %s\n", - arg); - return -1; - } - return 1; - }; - - auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - lora_apply_mode = str_to_lora_apply_mode(arg); - if (lora_apply_mode == LORA_APPLY_MODE_COUNT) { - fprintf(stderr, "error: invalid lora apply model %s\n", - arg); - return -1; - } - return 1; - }; - - auto on_tile_size_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - std::string tile_size_str = argv[index]; - size_t x_pos = tile_size_str.find('x'); - try { - if (x_pos != std::string::npos) { - std::string tile_x_str = tile_size_str.substr(0, x_pos); - std::string tile_y_str = tile_size_str.substr(x_pos + 1); - vae_tiling_params.tile_size_x = std::stoi(tile_x_str); - vae_tiling_params.tile_size_y = std::stoi(tile_y_str); - } else { - vae_tiling_params.tile_size_x = vae_tiling_params.tile_size_y = std::stoi(tile_size_str); - } - } catch (const std::invalid_argument&) { - return -1; - } catch (const std::out_of_range&) { - return -1; - } - return 1; - }; - - auto on_relative_tile_size_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - std::string rel_size_str = argv[index]; - size_t x_pos = rel_size_str.find('x'); - try { - if (x_pos != std::string::npos) { - std::string rel_x_str = rel_size_str.substr(0, x_pos); - std::string rel_y_str = rel_size_str.substr(x_pos + 1); - vae_tiling_params.rel_size_x = std::stof(rel_x_str); - vae_tiling_params.rel_size_y = std::stof(rel_y_str); - } else { - vae_tiling_params.rel_size_x = vae_tiling_params.rel_size_y = std::stof(rel_size_str); - } - } catch (const std::invalid_argument&) { - return -1; - } catch (const std::out_of_range&) { - return -1; - } - return 1; - }; - - options.manual_options = { - {"", - "--type", - "weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). " - "If not specified, the default is the type of the weight file", - on_type_arg}, - {"", - "--rng", - "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)", - on_rng_arg}, - {"", - "--sampler-rng", - "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng", - on_sampler_rng_arg}, - {"", - "--prediction", - "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow, flux2_flow]", - on_prediction_arg}, - {"", - "--lora-apply-mode", - "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. " - "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used." - "The immediately mode may have precision and compatibility issues with quantized parameters, " - "but it usually offers faster inference speed and, in some cases, lower memory usage. " - "The at_runtime mode, on the other hand, is exactly the opposite.", - on_lora_apply_mode_arg}, - {"", - "--vae-tile-size", - "tile size for vae tiling, format [X]x[Y] (default: 32x32)", - on_tile_size_arg}, - {"", - "--vae-relative-tile-size", - "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)", - on_relative_tile_size_arg}, - }; - - return options; - } - - bool process_and_check(SDMode mode) { - if (mode != UPSCALE && model_path.length() == 0 && diffusion_model_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); - return false; - } - - if (mode == UPSCALE) { - if (esrgan_path.length() == 0) { - fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n"); - return false; - } - } - - if (n_threads <= 0) { - n_threads = sd_get_num_physical_cores(); - } - - return true; - } - - std::string to_string() const { - std::ostringstream oss; - oss << "SDContextParams {\n" - << " n_threads: " << n_threads << ",\n" - << " model_path: \"" << model_path << "\",\n" - << " clip_l_path: \"" << clip_l_path << "\",\n" - << " clip_g_path: \"" << clip_g_path << "\",\n" - << " clip_vision_path: \"" << clip_vision_path << "\",\n" - << " t5xxl_path: \"" << t5xxl_path << "\",\n" - << " llm_path: \"" << llm_path << "\",\n" - << " llm_vision_path: \"" << llm_vision_path << "\",\n" - << " diffusion_model_path: \"" << diffusion_model_path << "\",\n" - << " high_noise_diffusion_model_path: \"" << high_noise_diffusion_model_path << "\",\n" - << " vae_path: \"" << vae_path << "\",\n" - << " taesd_path: \"" << taesd_path << "\",\n" - << " esrgan_path: \"" << esrgan_path << "\",\n" - << " control_net_path: \"" << control_net_path << "\",\n" - << " embedding_dir: \"" << embedding_dir << "\",\n" - << " wtype: " << sd_type_name(wtype) << ",\n" - << " tensor_type_rules: \"" << tensor_type_rules << "\",\n" - << " lora_model_dir: \"" << lora_model_dir << "\",\n" - << " photo_maker_path: \"" << photo_maker_path << "\",\n" - << " rng_type: " << sd_rng_type_name(rng_type) << ",\n" - << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" - << " flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n" - << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" - << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" - << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" - << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" - << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" - << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n" - << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" - << " prediction: " << sd_prediction_name(prediction) << ",\n" - << " lora_apply_mode: " << sd_lora_apply_mode_name(lora_apply_mode) << ",\n" - << " vae_tiling_params: { " - << vae_tiling_params.enabled << ", " - << vae_tiling_params.tile_size_x << ", " - << vae_tiling_params.tile_size_y << ", " - << vae_tiling_params.target_overlap << ", " - << vae_tiling_params.rel_size_x << ", " - << vae_tiling_params.rel_size_y << " },\n" - << " force_sdxl_vae_conv_scale: " << (force_sdxl_vae_conv_scale ? "true" : "false") << "\n" - << "}"; - return oss.str(); - } - - sd_ctx_params_t to_sd_ctx_params_t(bool vae_decode_only, bool free_params_immediately, bool taesd_preview) { - sd_ctx_params_t sd_ctx_params = { - model_path.c_str(), - clip_l_path.c_str(), - clip_g_path.c_str(), - clip_vision_path.c_str(), - t5xxl_path.c_str(), - llm_path.c_str(), - llm_vision_path.c_str(), - diffusion_model_path.c_str(), - high_noise_diffusion_model_path.c_str(), - vae_path.c_str(), - taesd_path.c_str(), - control_net_path.c_str(), - lora_model_dir.c_str(), - embedding_dir.c_str(), - photo_maker_path.c_str(), - tensor_type_rules.c_str(), - vae_decode_only, - free_params_immediately, - n_threads, - wtype, - rng_type, - sampler_rng_type, - prediction, - lora_apply_mode, - offload_params_to_cpu, - clip_on_cpu, - control_net_cpu, - vae_on_cpu, - diffusion_flash_attn, - taesd_preview, - diffusion_conv_direct, - vae_conv_direct, - force_sdxl_vae_conv_scale, - chroma_use_dit_mask, - chroma_use_t5_mask, - chroma_t5_mask_pad, - flow_shift, - }; - return sd_ctx_params; - } -}; - -template -static std::string vec_to_string(const std::vector& v) { - std::ostringstream oss; - oss << "["; - for (size_t i = 0; i < v.size(); i++) { - oss << v[i]; - if (i + 1 < v.size()) - oss << ", "; - } - oss << "]"; - return oss.str(); -} - -static std::string vec_str_to_string(const std::vector& v) { - std::ostringstream oss; - oss << "["; - for (size_t i = 0; i < v.size(); i++) { - oss << "\"" << v[i] << "\""; - if (i + 1 < v.size()) - oss << ", "; - } - oss << "]"; - return oss.str(); -} - -struct SDGenerationParams { - std::string prompt; - std::string negative_prompt; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; - int batch_count = 1; - std::string init_image_path; - std::string end_image_path; - std::string mask_image_path; - std::string control_image_path; - std::vector ref_image_paths; - std::string control_video_path; - bool auto_resize_ref_image = true; - bool increase_ref_index = false; - - std::vector skip_layers = {7, 8, 9}; - sd_sample_params_t sample_params; - - std::vector high_noise_skip_layers = {7, 8, 9}; - sd_sample_params_t high_noise_sample_params; - - std::string easycache_option; - sd_easycache_params_t easycache_params; - - float moe_boundary = 0.875f; - int video_frames = 1; - int fps = 16; - float vace_strength = 1.f; - - float strength = 0.75f; - float control_strength = 0.9f; - - int64_t seed = 42; - - // Photo Maker - std::string pm_id_images_dir; - std::string pm_id_embed_path; - float pm_style_strength = 20.f; - - int upscale_repeats = 1; - - SDGenerationParams() { - sd_sample_params_init(&sample_params); - sd_sample_params_init(&high_noise_sample_params); - } - - ArgOptions get_options() { - ArgOptions options; - options.string_options = { - {"-p", - "--prompt", - "the prompt to render", - &prompt}, - {"-n", - "--negative-prompt", - "the negative prompt (default: \"\")", - &negative_prompt}, - {"-i", - "--init-img", - "path to the init image", - &init_image_path}, - {"", - "--end-img", - "path to the end image, required by flf2v", - &end_image_path}, - {"", - "--mask", - "path to the mask image", - &mask_image_path}, - {"", - "--control-image", - "path to control image, control net", - &control_image_path}, - {"", - "--control-video", - "path to control video frames, It must be a directory path. The video frames inside should be stored as images in " - "lexicographical (character) order. For example, if the control video path is `frames`, the directory contain images " - "such as 00.png, 01.png, ... etc.", - &control_video_path}, - {"", - "--pm-id-images-dir", - "path to PHOTOMAKER input id images dir", - &pm_id_images_dir}, - {"", - "--pm-id-embed-path", - "path to PHOTOMAKER v2 id embed", - &pm_id_embed_path}, - }; - - options.int_options = { - {"-H", - "--height", - "image height, in pixel space (default: 512)", - &height}, - {"-W", - "--width", - "image width, in pixel space (default: 512)", - &width}, - {"", - "--steps", - "number of sample steps (default: 20)", - &sample_params.sample_steps}, - {"", - "--high-noise-steps", - "(high noise) number of sample steps (default: -1 = auto)", - &high_noise_sample_params.sample_steps}, - {"", - "--clip-skip", - "ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1). " - "<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x", - &clip_skip}, - {"-b", - "--batch-count", - "batch count", - &batch_count}, - {"", - "--video-frames", - "video frames (default: 1)", - &video_frames}, - {"", - "--fps", - "fps (default: 24)", - &fps}, - {"", - "--timestep-shift", - "shift timestep for NitroFusion models (default: 0). " - "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant", - &sample_params.shifted_timestep}, - {"", - "--upscale-repeats", - "Run the ESRGAN upscaler this many times (default: 1)", - &upscale_repeats}, - }; - - options.float_options = { - {"", - "--cfg-scale", - "unconditional guidance scale: (default: 7.0)", - &sample_params.guidance.txt_cfg}, - {"", - "--img-cfg-scale", - "image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)", - &sample_params.guidance.img_cfg}, - {"", - "--guidance", - "distilled guidance scale for models with guidance input (default: 3.5)", - &sample_params.guidance.distilled_guidance}, - {"", - "--slg-scale", - "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium", - &sample_params.guidance.slg.scale}, - {"", - "--skip-layer-start", - "SLG enabling point (default: 0.01)", - &sample_params.guidance.slg.layer_start}, - {"", - "--skip-layer-end", - "SLG disabling point (default: 0.2)", - &sample_params.guidance.slg.layer_end}, - {"", - "--eta", - "eta in DDIM, only for DDIM and TCD (default: 0)", - &sample_params.eta}, - {"", - "--high-noise-cfg-scale", - "(high noise) unconditional guidance scale: (default: 7.0)", - &high_noise_sample_params.guidance.txt_cfg}, - {"", - "--high-noise-img-cfg-scale", - "(high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)", - &high_noise_sample_params.guidance.img_cfg}, - {"", - "--high-noise-guidance", - "(high noise) distilled guidance scale for models with guidance input (default: 3.5)", - &high_noise_sample_params.guidance.distilled_guidance}, - {"", - "--high-noise-slg-scale", - "(high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)", - &high_noise_sample_params.guidance.slg.scale}, - {"", - "--high-noise-skip-layer-start", - "(high noise) SLG enabling point (default: 0.01)", - &high_noise_sample_params.guidance.slg.layer_start}, - {"", - "--high-noise-skip-layer-end", - "(high noise) SLG disabling point (default: 0.2)", - &high_noise_sample_params.guidance.slg.layer_end}, - {"", - "--high-noise-eta", - "(high noise) eta in DDIM, only for DDIM and TCD (default: 0)", - &high_noise_sample_params.eta}, - {"", - "--strength", - "strength for noising/unnoising (default: 0.75)", - &strength}, - {"", - "--pm-style-strength", - "", - &pm_style_strength}, - {"", - "--control-strength", - "strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image", - &control_strength}, - {"", - "--moe-boundary", - "timestep boundary for Wan2.2 MoE model. (default: 0.875). Only enabled if `--high-noise-steps` is set to -1", - &moe_boundary}, - {"", - "--vace-strength", - "wan vace strength", - &vace_strength}, - }; - - options.bool_options = { - {"", - "--increase-ref-index", - "automatically increase the indices of references images based on the order they are listed (starting with 1).", - true, - &increase_ref_index}, - {"", - "--disable-auto-resize-ref-image", - "disable auto resize of ref images", - false, - &auto_resize_ref_image}, - }; - - auto on_seed_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - seed = std::stoll(argv[index]); - return 1; - }; - - auto on_sample_method_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - sample_params.sample_method = str_to_sample_method(arg); - if (sample_params.sample_method == SAMPLE_METHOD_COUNT) { - fprintf(stderr, "error: invalid sample method %s\n", - arg); - return -1; - } - return 1; - }; - - auto on_high_noise_sample_method_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - high_noise_sample_params.sample_method = str_to_sample_method(arg); - if (high_noise_sample_params.sample_method == SAMPLE_METHOD_COUNT) { - fprintf(stderr, "error: invalid high noise sample method %s\n", - arg); - return -1; - } - return 1; - }; - - auto on_scheduler_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - const char* arg = argv[index]; - sample_params.scheduler = str_to_scheduler(arg); - if (sample_params.scheduler == SCHEDULER_COUNT) { - fprintf(stderr, "error: invalid scheduler %s\n", - arg); - return -1; - } - return 1; - }; - - auto on_skip_layers_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - std::string layers_str = argv[index]; - if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { - return -1; - } - - layers_str = layers_str.substr(1, layers_str.size() - 2); - - std::regex regex("[, ]+"); - std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); - std::sregex_token_iterator end; - std::vector tokens(iter, end); - std::vector layers; - for (const auto& token : tokens) { - try { - layers.push_back(std::stoi(token)); - } catch (const std::invalid_argument&) { - return -1; - } - } - skip_layers = layers; - return 1; - }; - - auto on_high_noise_skip_layers_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - std::string layers_str = argv[index]; - if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { - return -1; - } - - layers_str = layers_str.substr(1, layers_str.size() - 2); - - std::regex regex("[, ]+"); - std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); - std::sregex_token_iterator end; - std::vector tokens(iter, end); - std::vector layers; - for (const auto& token : tokens) { - try { - layers.push_back(std::stoi(token)); - } catch (const std::invalid_argument&) { - return -1; - } - } - high_noise_skip_layers = layers; - return 1; - }; - - auto on_ref_image_arg = [&](int argc, const char** argv, int index) { - if (++index >= argc) { - return -1; - } - ref_image_paths.push_back(argv[index]); - return 1; - }; - - auto on_easycache_arg = [&](int argc, const char** argv, int index) { - const std::string default_values = "0.2,0.15,0.95"; - auto looks_like_value = [](const std::string& token) { - if (token.empty()) { - return false; - } - if (token[0] != '-') { - return true; - } - if (token.size() == 1) { - return false; - } - unsigned char next = static_cast(token[1]); - return std::isdigit(next) || token[1] == '.'; - }; - - std::string option_value; - int consumed = 0; - if (index + 1 < argc) { - std::string next_arg = argv[index + 1]; - if (looks_like_value(next_arg)) { - option_value = argv_to_utf8(index + 1, argv); - consumed = 1; - } - } - if (option_value.empty()) { - option_value = default_values; - } - easycache_option = option_value; - return consumed; - }; - - options.manual_options = { - {"-s", - "--seed", - "RNG seed (default: 42, use random seed for < 0)", - on_seed_arg}, - {"", - "--sampling-method", - "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] " - "(default: euler for Flux/SD3/Wan, euler_a otherwise)", - on_sample_method_arg}, - {"", - "--high-noise-sampling-method", - "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd]" - " default: euler for Flux/SD3/Wan, euler_a otherwise", - on_high_noise_sample_method_arg}, - {"", - "--scheduler", - "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple, lcm], default: discrete", - on_scheduler_arg}, - {"", - "--skip-layers", - "layers to skip for SLG steps (default: [7,8,9])", - on_skip_layers_arg}, - {"", - "--high-noise-skip-layers", - "(high noise) layers to skip for SLG steps (default: [7,8,9])", - on_high_noise_skip_layers_arg}, - {"-r", - "--ref-image", - "reference image for Flux Kontext models (can be used multiple times)", - on_ref_image_arg}, - {"", - "--easycache", - "enable EasyCache for DiT models with optional \"threshold,start_percent,end_percent\" (default: 0.2,0.15,0.95)", - on_easycache_arg}, - - }; - - return options; - } - - bool process_and_check(SDMode mode) { - if (width <= 0) { - fprintf(stderr, "error: the width must be greater than 0\n"); - return false; - } - - if (height <= 0) { - fprintf(stderr, "error: the height must be greater than 0\n"); - return false; - } - - if (sample_params.sample_steps <= 0) { - fprintf(stderr, "error: the sample_steps must be greater than 0\n"); - return false; - } - - if (high_noise_sample_params.sample_steps <= 0) { - high_noise_sample_params.sample_steps = -1; - } - - if (strength < 0.f || strength > 1.f) { - fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); - return false; - } - - if (!easycache_option.empty()) { - float values[3] = {0.0f, 0.0f, 0.0f}; - std::stringstream ss(easycache_option); - std::string token; - int idx = 0; - while (std::getline(ss, token, ',')) { - auto trim = [](std::string& s) { - const char* whitespace = " \t\r\n"; - auto start = s.find_first_not_of(whitespace); - if (start == std::string::npos) { - s.clear(); - return; - } - auto end = s.find_last_not_of(whitespace); - s = s.substr(start, end - start + 1); - }; - trim(token); - if (token.empty()) { - fprintf(stderr, "error: invalid easycache option '%s'\n", easycache_option.c_str()); - return false; - } - if (idx >= 3) { - fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n"); - return false; - } - try { - values[idx] = std::stof(token); - } catch (const std::exception&) { - fprintf(stderr, "error: invalid easycache value '%s'\n", token.c_str()); - return false; - } - idx++; - } - if (idx != 3) { - fprintf(stderr, "error: easycache expects exactly 3 comma-separated values (threshold,start,end)\n"); - return false; - } - if (values[0] < 0.0f) { - fprintf(stderr, "error: easycache threshold must be non-negative\n"); - return false; - } - if (values[1] < 0.0f || values[1] >= 1.0f || values[2] <= 0.0f || values[2] > 1.0f || values[1] >= values[2]) { - fprintf(stderr, "error: easycache start/end percents must satisfy 0.0 <= start < end <= 1.0\n"); - return false; - } - easycache_params.enabled = true; - easycache_params.reuse_threshold = values[0]; - easycache_params.start_percent = values[1]; - easycache_params.end_percent = values[2]; - } else { - easycache_params.enabled = false; - } - - sample_params.guidance.slg.layers = skip_layers.data(); - sample_params.guidance.slg.layer_count = skip_layers.size(); - high_noise_sample_params.guidance.slg.layers = high_noise_skip_layers.data(); - high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size(); - - if (mode == VID_GEN && video_frames <= 0) { - return false; - } - - if (mode == VID_GEN && fps <= 0) { - return false; - } - - if (sample_params.shifted_timestep < 0 || sample_params.shifted_timestep > 1000) { - return false; - } - - if (upscale_repeats < 1) { - return false; - } - - if (mode == UPSCALE) { - if (init_image_path.length() == 0) { - fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n"); - return false; - } - } - - if (seed < 0) { - srand((int)time(nullptr)); - seed = rand(); - } - - return true; - } - - std::string to_string() const { - char* sample_params_str = sd_sample_params_to_str(&sample_params); - char* high_noise_sample_params_str = sd_sample_params_to_str(&high_noise_sample_params); - std::ostringstream oss; - oss << "SDGenerationParams {\n" - << " prompt: \"" << prompt << "\",\n" - << " negative_prompt: \"" << negative_prompt << "\",\n" - << " clip_skip: " << clip_skip << ",\n" - << " width: " << width << ",\n" - << " height: " << height << ",\n" - << " batch_count: " << batch_count << ",\n" - << " init_image_path: \"" << init_image_path << "\",\n" - << " end_image_path: \"" << end_image_path << "\",\n" - << " mask_image_path: \"" << mask_image_path << "\",\n" - << " control_image_path: \"" << control_image_path << "\",\n" - << " ref_image_paths: " << vec_str_to_string(ref_image_paths) << ",\n" - << " control_video_path: \"" << control_video_path << "\",\n" - << " auto_resize_ref_image: " << (auto_resize_ref_image ? "true" : "false") << ",\n" - << " increase_ref_index: " << (increase_ref_index ? "true" : "false") << ",\n" - << " pm_id_images_dir: \"" << pm_id_images_dir << "\",\n" - << " pm_id_embed_path: \"" << pm_id_embed_path << "\",\n" - << " pm_style_strength: " << pm_style_strength << ",\n" - << " skip_layers: " << vec_to_string(skip_layers) << ",\n" - << " sample_params: " << sample_params_str << ",\n" - << " high_noise_skip_layers: " << vec_to_string(high_noise_skip_layers) << ",\n" - << " high_noise_sample_params: " << high_noise_sample_params_str << ",\n" - << " easycache_option: \"" << easycache_option << "\",\n" - << " easycache: " - << (easycache_params.enabled ? "enabled" : "disabled") - << " (threshold=" << easycache_params.reuse_threshold - << ", start=" << easycache_params.start_percent - << ", end=" << easycache_params.end_percent << "),\n" - << " moe_boundary: " << moe_boundary << ",\n" - << " video_frames: " << video_frames << ",\n" - << " fps: " << fps << ",\n" - << " vace_strength: " << vace_strength << ",\n" - << " strength: " << strength << ",\n" - << " control_strength: " << control_strength << ",\n" - << " seed: " << seed << ",\n" - << " upscale_repeats: " << upscale_repeats << ",\n" - << "}"; - free(sample_params_str); - free(high_noise_sample_params_str); - return oss.str(); - } -}; - void print_usage(int argc, const char* argv[], const std::vector& options_list) { + std::cout << version_string() << "\n"; std::cout << "Usage: " << argv[0] << " [options]\n\n"; std::cout << "CLI Options:\n"; options_list[0].print(); @@ -1557,7 +211,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP exit(cli_params.normal_exit ? 0 : 1); } - if (!cli_params.process_and_check() || !ctx_params.process_and_check(cli_params.mode) || !gen_params.process_and_check(cli_params.mode)) { + if (!cli_params.process_and_check() || + !ctx_params.process_and_check(cli_params.mode) || + !gen_params.process_and_check(cli_params.mode, ctx_params.lora_model_dir)) { print_usage(argc, argv, options_vec); exit(1); } @@ -1576,7 +232,7 @@ static std::string sd_basename(const std::string& path) { } std::string get_image_params(const SDCliParams& cli_params, const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed) { - std::string parameter_string = gen_params.prompt + "\n"; + std::string parameter_string = gen_params.prompt_with_lora + "\n"; if (gen_params.negative_prompt.size() != 0) { parameter_string += "Negative prompt: " + gen_params.negative_prompt + "\n"; } @@ -1602,7 +258,15 @@ std::string get_image_params(const SDCliParams& cli_params, const SDContextParam parameter_string += "Sampler RNG: " + std::string(sd_rng_type_name(ctx_params.sampler_rng_type)) + ", "; } parameter_string += "Sampler: " + std::string(sd_sample_method_name(gen_params.sample_params.sample_method)); - if (gen_params.sample_params.scheduler != SCHEDULER_COUNT) { + if (!gen_params.custom_sigmas.empty()) { + parameter_string += ", Custom Sigmas: ["; + for (size_t i = 0; i < gen_params.custom_sigmas.size(); ++i) { + std::ostringstream oss; + oss << std::fixed << std::setprecision(4) << gen_params.custom_sigmas[i]; + parameter_string += oss.str() + (i == gen_params.custom_sigmas.size() - 1 ? "" : ", "); + } + parameter_string += "]"; + } else if (gen_params.sample_params.scheduler != SCHEDULER_COUNT) { // Only show schedule if not using custom sigmas parameter_string += " " + std::string(sd_scheduler_name(gen_params.sample_params.scheduler)); } parameter_string += ", "; @@ -1667,94 +331,6 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } -uint8_t* load_image(const char* image_path, int& width, int& height, int expected_width = 0, int expected_height = 0, int expected_channel = 3) { - int c = 0; - uint8_t* image_buffer = (uint8_t*)stbi_load(image_path, &width, &height, &c, expected_channel); - if (image_buffer == nullptr) { - fprintf(stderr, "load image from '%s' failed\n", image_path); - return nullptr; - } - if (c < expected_channel) { - fprintf(stderr, - "the number of channels for the input image must be >= %d," - "but got %d channels, image_path = %s\n", - expected_channel, - c, - image_path); - free(image_buffer); - return nullptr; - } - if (width <= 0) { - fprintf(stderr, "error: the width of image must be greater than 0, image_path = %s\n", image_path); - free(image_buffer); - return nullptr; - } - if (height <= 0) { - fprintf(stderr, "error: the height of image must be greater than 0, image_path = %s\n", image_path); - free(image_buffer); - return nullptr; - } - - // Resize input image ... - if ((expected_width > 0 && expected_height > 0) && (height != expected_height || width != expected_width)) { - float dst_aspect = (float)expected_width / (float)expected_height; - float src_aspect = (float)width / (float)height; - - int crop_x = 0, crop_y = 0; - int crop_w = width, crop_h = height; - - if (src_aspect > dst_aspect) { - crop_w = (int)(height * dst_aspect); - crop_x = (width - crop_w) / 2; - } else if (src_aspect < dst_aspect) { - crop_h = (int)(width / dst_aspect); - crop_y = (height - crop_h) / 2; - } - - if (crop_x != 0 || crop_y != 0) { - printf("crop input image from %dx%d to %dx%d, image_path = %s\n", width, height, crop_w, crop_h, image_path); - uint8_t* cropped_image_buffer = (uint8_t*)malloc(crop_w * crop_h * expected_channel); - if (cropped_image_buffer == nullptr) { - fprintf(stderr, "error: allocate memory for crop\n"); - free(image_buffer); - return nullptr; - } - for (int row = 0; row < crop_h; row++) { - uint8_t* src = image_buffer + ((crop_y + row) * width + crop_x) * expected_channel; - uint8_t* dst = cropped_image_buffer + (row * crop_w) * expected_channel; - memcpy(dst, src, crop_w * expected_channel); - } - - width = crop_w; - height = crop_h; - free(image_buffer); - image_buffer = cropped_image_buffer; - } - - printf("resize input image from %dx%d to %dx%d\n", width, height, expected_width, expected_height); - int resized_height = expected_height; - int resized_width = expected_width; - - uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * expected_channel); - if (resized_image_buffer == nullptr) { - fprintf(stderr, "error: allocate memory for resize input image\n"); - free(image_buffer); - return nullptr; - } - stbir_resize(image_buffer, width, height, 0, - resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8, - expected_channel, STBIR_ALPHA_CHANNEL_NONE, 0, - STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, - STBIR_FILTER_BOX, STBIR_FILTER_BOX, - STBIR_COLORSPACE_SRGB, nullptr); - width = resized_width; - height = resized_height; - free(image_buffer); - image_buffer = resized_image_buffer; - } - return image_buffer; -} - bool load_images_from_dir(const std::string dir, std::vector& images, int expected_width = 0, @@ -1789,7 +365,7 @@ bool load_images_from_dir(const std::string dir, } int width = 0; int height = 0; - uint8_t* image_buffer = load_image(path.c_str(), width, height, expected_width, expected_height); + uint8_t* image_buffer = load_image_from_file(path.c_str(), width, height, expected_width, expected_height); if (image_buffer == nullptr) { fprintf(stderr, "load image from '%s' failed\n", path.c_str()); return false; @@ -1822,11 +398,19 @@ void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy, } int main(int argc, const char* argv[]) { + if (argc > 1 && std::string(argv[1]) == "--version") { + std::cout << version_string() << "\n"; + return EXIT_SUCCESS; + } + SDCliParams cli_params; SDContextParams ctx_params; SDGenerationParams gen_params; parse_args(argc, argv, cli_params, ctx_params, gen_params); + if (cli_params.verbose || cli_params.version) { + std::cout << version_string() << "\n"; + } if (gen_params.video_frames > 4) { size_t last_dot_pos = cli_params.preview_path.find_last_of("."); std::string base_path = cli_params.preview_path; @@ -1917,7 +501,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; - init_image.data = load_image(gen_params.init_image_path.c_str(), width, height, gen_params.width, gen_params.height); + init_image.data = load_image_from_file(gen_params.init_image_path.c_str(), width, height, gen_params.width, gen_params.height); if (init_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", gen_params.init_image_path.c_str()); release_all_resources(); @@ -1930,7 +514,7 @@ int main(int argc, const char* argv[]) { int width = 0; int height = 0; - end_image.data = load_image(gen_params.end_image_path.c_str(), width, height, gen_params.width, gen_params.height); + end_image.data = load_image_from_file(gen_params.end_image_path.c_str(), width, height, gen_params.width, gen_params.height); if (end_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", gen_params.end_image_path.c_str()); release_all_resources(); @@ -1942,7 +526,7 @@ int main(int argc, const char* argv[]) { int c = 0; int width = 0; int height = 0; - mask_image.data = load_image(gen_params.mask_image_path.c_str(), width, height, gen_params.width, gen_params.height, 1); + mask_image.data = load_image_from_file(gen_params.mask_image_path.c_str(), width, height, gen_params.width, gen_params.height, 1); if (mask_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", gen_params.mask_image_path.c_str()); release_all_resources(); @@ -1961,7 +545,7 @@ int main(int argc, const char* argv[]) { if (gen_params.control_image_path.size() > 0) { int width = 0; int height = 0; - control_image.data = load_image(gen_params.control_image_path.c_str(), width, height, gen_params.width, gen_params.height); + control_image.data = load_image_from_file(gen_params.control_image_path.c_str(), width, height, gen_params.width, gen_params.height); if (control_image.data == nullptr) { fprintf(stderr, "load image from '%s' failed\n", gen_params.control_image_path.c_str()); release_all_resources(); @@ -1982,7 +566,7 @@ int main(int argc, const char* argv[]) { for (auto& path : gen_params.ref_image_paths) { int width = 0; int height = 0; - uint8_t* image_buffer = load_image(path.c_str(), width, height); + uint8_t* image_buffer = load_image_from_file(path.c_str(), width, height); if (image_buffer == nullptr) { fprintf(stderr, "load image from '%s' failed\n", path.c_str()); release_all_resources(); @@ -2062,6 +646,8 @@ int main(int argc, const char* argv[]) { if (cli_params.mode == IMG_GEN) { sd_img_gen_params_t img_gen_params = { + gen_params.lora_vec.data(), + static_cast(gen_params.lora_vec.size()), gen_params.prompt.c_str(), gen_params.negative_prompt.c_str(), gen_params.clip_skip, @@ -2093,6 +679,8 @@ int main(int argc, const char* argv[]) { num_results = gen_params.batch_count; } else if (cli_params.mode == VID_GEN) { sd_vid_gen_params_t vid_gen_params = { + gen_params.lora_vec.data(), + static_cast(gen_params.lora_vec.size()), gen_params.prompt.c_str(), gen_params.negative_prompt.c_str(), gen_params.clip_skip, @@ -2129,7 +717,8 @@ int main(int argc, const char* argv[]) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(ctx_params.esrgan_path.c_str(), ctx_params.offload_params_to_cpu, ctx_params.diffusion_conv_direct, - ctx_params.n_threads); + ctx_params.n_threads, + gen_params.upscale_tile_size); if (upscaler_ctx == nullptr) { printf("new_upscaler_ctx failed\n"); diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp index 1a683e396..682a72c81 100644 --- a/otherarch/sdcpp/model.cpp +++ b/otherarch/sdcpp/model.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include "gguf_reader.hpp" #include "model.h" @@ -317,12 +316,7 @@ bool is_zip_file(const std::string& file_path) { } bool is_gguf_file(const std::string& file_path) { - #ifdef _WIN32 - std::filesystem::path fpath = std::filesystem::u8path(file_path); - #else - std::filesystem::path fpath = std::filesystem::path(file_path); - #endif - std::ifstream file(fpath, std::ios::binary); + std::ifstream file(sd_get_u8path(file_path), std::ios::binary); if (!file.is_open()) { return false; } @@ -343,12 +337,7 @@ bool is_gguf_file(const std::string& file_path) { } bool is_safetensors_file(const std::string& file_path) { - #ifdef _WIN32 - std::filesystem::path fpath = std::filesystem::u8path(file_path); - #else - std::filesystem::path fpath = std::filesystem::path(file_path); - #endif - std::ifstream file(fpath, std::ios::binary); + std::ifstream file(sd_get_u8path(file_path), std::ios::binary); if (!file.is_open()) { return false; } @@ -531,12 +520,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const LOG_DEBUG("init from '%s', prefix = '%s'", file_path.c_str(), prefix.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - #ifdef _WIN32 - std::filesystem::path fpath = std::filesystem::u8path(file_path); - #else - std::filesystem::path fpath = std::filesystem::path(file_path); - #endif - std::ifstream file(fpath, std::ios::binary); + std::ifstream file(sd_get_u8path(file_path), std::ios::binary); if (!file.is_open()) { LOG_ERROR("failed to open '%s'", file_path.c_str()); file_paths_.pop_back(); @@ -1101,6 +1085,9 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) { return VERSION_FLUX2; } + if (tensor_storage.name.find("model.diffusion_model.double_blocks.0.img_mlp.gate_proj.weight") != std::string::npos) { + return VERSION_OVIS_IMAGE; + } if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) { return VERSION_Z_IMAGE; } @@ -1479,12 +1466,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } } else { // kcpp - #ifdef _WIN32 - std::filesystem::path fpath = std::filesystem::u8path(file_path); - #else - std::filesystem::path fpath = std::filesystem::path(file_path); - #endif - file.open(fpath, std::ios::binary); + file.open(sd_get_u8path(file_path), std::ios::binary); if (!file.is_open()) { LOG_ERROR("failed to open '%s'", file_path.c_str()); failed = true; diff --git a/otherarch/sdcpp/model.h b/otherarch/sdcpp/model.h index 4f82df8e3..598e4c6a5 100644 --- a/otherarch/sdcpp/model.h +++ b/otherarch/sdcpp/model.h @@ -45,6 +45,7 @@ enum SDVersion { VERSION_QWEN_IMAGE, VERSION_FLUX2, VERSION_Z_IMAGE, + VERSION_OVIS_IMAGE, VERSION_COUNT, }; @@ -90,6 +91,7 @@ static inline bool sd_version_is_flux(SDVersion version) { version == VERSION_FLUX_FILL || version == VERSION_FLUX_CONTROLS || version == VERSION_FLEX_2 || + version == VERSION_OVIS_IMAGE || version == VERSION_CHROMA_RADIANCE) { return true; } diff --git a/otherarch/sdcpp/rope.hpp b/otherarch/sdcpp/rope.hpp index 7a35926eb..4abc51469 100644 --- a/otherarch/sdcpp/rope.hpp +++ b/otherarch/sdcpp/rope.hpp @@ -72,11 +72,13 @@ namespace Rope { } // Generate IDs for image patches and text - __STATIC_INLINE__ std::vector> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num) { + __STATIC_INLINE__ std::vector> gen_flux_txt_ids(int bs, int context_len, int axes_dim_num, std::set arange_dims) { auto txt_ids = std::vector>(bs * context_len, std::vector(axes_dim_num, 0.0f)); - if (axes_dim_num == 4) { - for (int i = 0; i < bs * context_len; i++) { - txt_ids[i][3] = (i % context_len); + for (int dim = 0; dim < axes_dim_num; dim++) { + if (arange_dims.find(dim) != arange_dims.end()) { + for (int i = 0; i < bs * context_len; i++) { + txt_ids[i][dim] = (i % context_len); + } } } return txt_ids; @@ -211,10 +213,11 @@ namespace Rope { int bs, int axes_dim_num, int context_len, + std::set txt_arange_dims, const std::vector& ref_latents, bool increase_ref_index, float ref_index_scale) { - auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num); + auto txt_ids = gen_flux_txt_ids(bs, context_len, axes_dim_num, txt_arange_dims); auto img_ids = gen_flux_img_ids(h, w, patch_size, bs, axes_dim_num); auto ids = concat_ids(txt_ids, img_ids, bs); @@ -231,6 +234,7 @@ namespace Rope { int patch_size, int bs, int context_len, + std::set txt_arange_dims, const std::vector& ref_latents, bool increase_ref_index, float ref_index_scale, @@ -242,6 +246,7 @@ namespace Rope { bs, static_cast(axes_dim.size()), context_len, + txt_arange_dims, ref_latents, increase_ref_index, ref_index_scale); diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 460d0029a..0490bde5d 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -85,6 +85,10 @@ struct SDParams { bool vae_conv_direct = false; bool chroma_use_dit_mask = true; + + std::string lora_path; + sd_lora_t lora_spec; + uint32_t lora_count; }; //shared @@ -320,6 +324,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { sd_params->clip_l_path = clip1_filename; sd_params->clip_g_path = clip2_filename; sd_params->stacked_id_embeddings_path = photomaker_filename; + sd_params->lora_path = lorafilename; //if t5 is set, and model is a gguf, load it as a diffusion model path bool endswithgguf = (sd_params->model_path.rfind(".gguf") == sd_params->model_path.size() - 5); if((sd_params->t5xxl_path!="" || sd_params->clip_l_path!="" || sd_params->clip_g_path!="") && endswithgguf) @@ -414,10 +419,15 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { std::filesystem::path mpath(inputs.model_filename); sdmodelfilename = mpath.filename().string(); - if(lorafilename!="" && inputs.lora_multiplier>0) + sd_params->lora_spec = {}; + sd_params->lora_spec.path = sd_params->lora_path.c_str(); + sd_params->lora_spec.multiplier = inputs.lora_multiplier; + + if(sd_params->lora_path!="" && sd_params->lora_spec.multiplier>0) { printf("\nApply LoRA...\n"); - sd_ctx->sd->apply_lora_from_file(lorafilename,inputs.lora_multiplier); + sd_params->lora_count = 1; + sd_ctx->sd->apply_loras(&sd_params->lora_spec, sd_params->lora_count); } input_extraimage_buffers.reserve(max_extra_images); @@ -977,6 +987,11 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) params.vae_tiling_params.enabled = dotile; params.batch_count = 1; + // needs to be "reapplied" because sdcpp tracks previously applied LoRAs + // and weights, and apply/unapply the differences at each gen + params.loras = &sd_params->lora_spec; + params.lora_count = sd_params->lora_count; + params.ref_images = reference_imgs.data(); params.ref_images_count = reference_imgs.size(); params.pm_params.id_images = photomaker_imgs.data(); diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp index 10ce0c8b7..c802eace2 100644 --- a/otherarch/sdcpp/stable-diffusion.cpp +++ b/otherarch/sdcpp/stable-diffusion.cpp @@ -48,6 +48,7 @@ const char* model_version_to_str[] = { "Qwen Image", "Flux.2", "Z-Image", + "Ovis Image", }; const char* sampling_methods_str[] = { @@ -548,6 +549,13 @@ public: tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); + } else if (version == VERSION_OVIS_IMAGE) { + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + tensor_storage_map, + version, + "", + false); } else { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -624,18 +632,22 @@ public: "model.diffusion_model", version); } else { // SD1.x SD2.x SDXL + std::map embbeding_map; + for (int i = 0; i < sd_ctx_params->embedding_count; i++) { + embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); + } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, - SAFE_STR(sd_ctx_params->embedding_dir), + embbeding_map, version, PM_VERSION_2); } else { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, - SAFE_STR(sd_ctx_params->embedding_dir), + embbeding_map, version); } diffusion_model = std::make_shared(backend, @@ -818,6 +830,11 @@ public: ignore_tensors.insert("first_stage_model.quant"); ignore_tensors.insert("text_encoders.llm.visual."); } + if (version == VERSION_OVIS_IMAGE) { + ignore_tensors.insert("text_encoders.llm.vision_model."); + ignore_tensors.insert("text_encoders.llm.visual_tokenizer."); + ignore_tensors.insert("text_encoders.llm.vte."); + } if (version == VERSION_SVD) { ignore_tensors.insert("conditioner.embedders.3"); } @@ -1044,71 +1061,21 @@ public: return result < -1; } - // kcpp - void apply_lora_from_file(const std::string& lora_path, float multiplier) { - std::unordered_map lora_f2m; // lora_name -> multiplier - - lora_f2m[lora_path] = multiplier; - - LOG_DEBUG("lora %s:%.2f", lora_path.c_str(), multiplier); - - int64_t t0 = ggml_time_ms(); - if (apply_lora_immediately) { - LOG_INFO("apply lora immediately"); - apply_loras_immediately(lora_f2m); - } else { - LOG_INFO("apply at runtime"); - apply_loras_at_runtime(lora_f2m); - } - int64_t t1 = ggml_time_ms(); - - LOG_INFO("lora '%s' applied, taking %.2fs", - lora_path.c_str(), - (t1 - t0) * 1.0f / 1000); - } - std::shared_ptr load_lora_model_from_file(const std::string& lora_id, float multiplier, ggml_backend_t backend, LoraModel::filter_t lora_tensor_filter = nullptr) { - // kcpp: LoRA is passed as a path - #if 1 - std::string file_path = lora_id; - #ifdef _WIN32 - std::string lora_ident = std::filesystem::u8path(file_path).stem().u8string(); - #else - std::string lora_ident = std::filesystem::path(file_path).stem().string(); - #endif - - if (!file_exists(file_path)) { - LOG_WARN("can not find lora file %s", file_path.c_str()); - return nullptr; - } - auto lora = std::make_shared(lora_ident, backend, file_path, "", version); - #else - std::string lora_name = lora_id; - std::string high_noise_tag = "|high_noise|"; - bool is_high_noise = false; - if (starts_with(lora_name, high_noise_tag)) { - lora_name = lora_name.substr(high_noise_tag.size()); + std::string lora_path = lora_id; + static std::string high_noise_tag = "|high_noise|"; + bool is_high_noise = false; + if (starts_with(lora_path, high_noise_tag)) { + lora_path = lora_path.substr(high_noise_tag.size()); is_high_noise = true; - LOG_DEBUG("high noise lora: %s", lora_name.c_str()); + LOG_DEBUG("high noise lora: %s", lora_path.c_str()); } - std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); - std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); - std::string file_path; - if (file_exists(st_file_path)) { - file_path = st_file_path; - } else if (file_exists(ckpt_file_path)) { - file_path = ckpt_file_path; - } else { - LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); - return nullptr; - } - auto lora = std::make_shared(lora_id, backend, file_path, is_high_noise ? "model.high_noise_" : "", version); - #endif + auto lora = std::make_shared(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version); if (!lora->load_from_file(n_threads, lora_tensor_filter)) { - LOG_WARN("load lora tensors from %s failed", file_path.c_str()); + LOG_WARN("load lora tensors from %s failed", lora_path.c_str()); return nullptr; } @@ -1293,19 +1260,16 @@ public: } } - std::string apply_loras_from_prompt(const std::string& prompt) { - auto result_pair = extract_and_remove_lora(prompt); - std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - - for (auto& kv : lora_f2m) { - LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); + void apply_loras(const sd_lora_t* loras, uint32_t lora_count) { + std::unordered_map lora_f2m; + for (int i = 0; i < lora_count; i++) { + std::string lora_id = SAFE_STR(loras[i].path); + if (loras[i].is_high_noise) { + lora_id = "|high_noise|" + lora_id; + } + lora_f2m[lora_id] = loras[i].multiplier; + LOG_DEBUG("lora %s:%.2f", lora_id.c_str(), loras[i].multiplier); } - #if 1 // kcpp - //only use hardcoded lora - if (!lora_f2m.empty()) { - printf("\nWarning: not applying LoRAs requested by prompt!\n"); - } - #else int64_t t0 = ggml_time_ms(); if (apply_lora_immediately) { apply_loras_immediately(lora_f2m); @@ -1315,10 +1279,7 @@ public: int64_t t1 = ggml_time_ms(); if (!lora_f2m.empty()) { LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str()); } - #endif - return result_pair.second; } ggml_tensor* id_encoder(ggml_context* work_ctx, @@ -1483,10 +1444,17 @@ public: uint32_t dim = latents->ne[ggml_n_dims(latents) - 1]; if (preview_mode == PREVIEW_PROJ) { + int64_t patch_sz = 1; const float(*latent_rgb_proj)[channel] = nullptr; float* latent_rgb_bias = nullptr; - if (dim == 48) { + if (dim == 128) { + if (sd_version_is_flux2(version)) { + latent_rgb_proj = flux2_latent_rgb_proj; + latent_rgb_bias = flux2_latent_rgb_bias; + patch_sz = 2; + } + } else if (dim == 48) { if (sd_version_is_wan(version)) { latent_rgb_proj = wan_22_latent_rgb_proj; latent_rgb_bias = wan_22_latent_rgb_bias; @@ -1539,12 +1507,15 @@ public: frames = latents->ne[2]; } - uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t)); + uint32_t img_width = width * patch_sz; + uint32_t img_height = height * patch_sz; - preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim); + uint8_t* data = (uint8_t*)malloc(frames * img_width * img_height * channel * sizeof(uint8_t)); + + preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, patch_sz); sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); for (int i = 0; i < frames; i++) { - images[i] = {width, height, channel, data + i * width * height * channel}; + images[i] = {img_width, img_height, channel, data + i * img_width * img_height * channel}; } step_callback(step, frames, images, is_noisy, step_callback_data); free(data); @@ -2055,6 +2026,18 @@ public: return vae_scale_factor; } + int get_diffusion_model_down_factor() { + int down_factor = 8; // unet + if (sd_version_is_dit(version)) { + if (sd_version_is_wan(version)) { + down_factor = 2; + } else { + down_factor = 1; + } + } + return down_factor; + } + int get_latent_channel() { int latent_channel = 4; if (sd_version_is_dit(version)) { @@ -2682,7 +2665,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "taesd_path: %s\n" "control_net_path: %s\n" "lora_model_dir: %s\n" - "embedding_dir: %s\n" "photo_maker_path: %s\n" "tensor_type_rules: %s\n" "vae_decode_only: %s\n" @@ -2713,7 +2695,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { SAFE_STR(sd_ctx_params->taesd_path), SAFE_STR(sd_ctx_params->control_net_path), SAFE_STR(sd_ctx_params->lora_model_dir), - SAFE_STR(sd_ctx_params->embedding_dir), SAFE_STR(sd_ctx_params->photo_maker_path), SAFE_STR(sd_ctx_params->tensor_type_rules), BOOL_STR(sd_ctx_params->vae_decode_only), @@ -2747,6 +2728,8 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) { sample_params->scheduler = SCHEDULER_COUNT; sample_params->sample_method = SAMPLE_METHOD_COUNT; sample_params->sample_steps = 20; + sample_params->custom_sigmas = nullptr; + sample_params->custom_sigmas_count = 0; } char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { @@ -2964,8 +2947,6 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, int sample_steps = sigmas.size() - 1; int64_t t0 = ggml_time_ms(); - // Apply lora - prompt = sd_ctx->sd->apply_loras_from_prompt(prompt); // Photo Maker std::string prompt_text_only; @@ -3294,22 +3275,19 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g sd_ctx->sd->vae_tiling_params = sd_img_gen_params->vae_tiling_params; int width = sd_img_gen_params->width; int height = sd_img_gen_params->height; - int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); - if (sd_version_is_dit(sd_ctx->sd->version)) { - if (width % 16 || height % 16) { - LOG_ERROR("Image dimensions must be must be a multiple of 16 on each axis for %s models. (Got %dx%d)", - model_version_to_str[sd_ctx->sd->version], - width, - height); - return nullptr; - } - } else if (width % 64 || height % 64) { - LOG_ERROR("Image dimensions must be must be a multiple of 64 on each axis for %s models. (Got %dx%d)", - model_version_to_str[sd_ctx->sd->version], - width, - height); - return nullptr; + + int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); + int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); + int spatial_multiple = vae_scale_factor * diffusion_model_down_factor; + + int width_offset = align_up_offset(width, spatial_multiple); + int height_offset = align_up_offset(height, spatial_multiple); + if (width_offset > 0 || height_offset > 0) { + width += width_offset; + height += height_offset; + LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_img_gen_params->width, sd_img_gen_params->height, width, height, spatial_multiple); } + LOG_DEBUG("generate_image %dx%d", width, height); if (sd_ctx == nullptr || sd_img_gen_params == nullptr) { return nullptr; @@ -3337,17 +3315,30 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g size_t t0 = ggml_time_ms(); + // Apply lora + sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count); + enum sample_method_t sample_method = sd_img_gen_params->sample_params.sample_method; if (sample_method == SAMPLE_METHOD_COUNT) { sample_method = sd_get_default_sample_method(sd_ctx); } LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - int sample_steps = sd_img_gen_params->sample_params.sample_steps; - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps, - sd_ctx->sd->get_image_seq_len(height, width), - sd_img_gen_params->sample_params.scheduler, - sd_ctx->sd->version); + int sample_steps = sd_img_gen_params->sample_params.sample_steps; + std::vector sigmas; + if (sd_img_gen_params->sample_params.custom_sigmas_count > 0) { + sigmas = std::vector(sd_img_gen_params->sample_params.custom_sigmas, + sd_img_gen_params->sample_params.custom_sigmas + sd_img_gen_params->sample_params.custom_sigmas_count); + if (sample_steps != sigmas.size() - 1) { + sample_steps = static_cast(sigmas.size()) - 1; + LOG_WARN("sample_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps); + } + } else { + sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps, + sd_ctx->sd->get_image_seq_len(height, width), + sd_img_gen_params->sample_params.scheduler, + sd_ctx->sd->version); + } ggml_tensor* init_latent = nullptr; ggml_tensor* concat_latent = nullptr; @@ -3580,9 +3571,19 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s int frames = sd_vid_gen_params->video_frames; frames = (frames - 1) / 4 * 4 + 1; int sample_steps = sd_vid_gen_params->sample_params.sample_steps; - LOG_INFO("generate_video %dx%dx%d", width, height, frames); - int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); + int vae_scale_factor = sd_ctx->sd->get_vae_scale_factor(); + int diffusion_model_down_factor = sd_ctx->sd->get_diffusion_model_down_factor(); + int spatial_multiple = vae_scale_factor * diffusion_model_down_factor; + + int width_offset = align_up_offset(width, spatial_multiple); + int height_offset = align_up_offset(height, spatial_multiple); + if (width_offset > 0 || height_offset > 0) { + width += width_offset; + height += height_offset; + LOG_WARN("align up %dx%d to %dx%d (multiple=%d)", sd_vid_gen_params->width, sd_vid_gen_params->height, width, height, spatial_multiple); + } + LOG_INFO("generate_video %dx%dx%d", width, height, frames); enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method; if (sample_method == SAMPLE_METHOD_COUNT) { @@ -3600,7 +3601,29 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s if (high_noise_sample_steps > 0) { total_steps += high_noise_sample_steps; } - std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, 0, sd_vid_gen_params->sample_params.scheduler, sd_ctx->sd->version); + + std::vector sigmas; + if (sd_vid_gen_params->sample_params.custom_sigmas_count > 0) { + sigmas = std::vector(sd_vid_gen_params->sample_params.custom_sigmas, + sd_vid_gen_params->sample_params.custom_sigmas + sd_vid_gen_params->sample_params.custom_sigmas_count); + if (total_steps != sigmas.size() - 1) { + total_steps = static_cast(sigmas.size()) - 1; + LOG_WARN("total_steps != custom_sigmas_count - 1, set total_steps to %d", total_steps); + if (sample_steps >= total_steps) { + sample_steps = total_steps; + LOG_WARN("total_steps != custom_sigmas_count - 1, set sample_steps to %d", sample_steps); + } + if (high_noise_sample_steps > 0) { + high_noise_sample_steps = total_steps - sample_steps; + LOG_WARN("total_steps != custom_sigmas_count - 1, set high_noise_sample_steps to %d", high_noise_sample_steps); + } + } + } else { + sigmas = sd_ctx->sd->denoiser->get_sigmas(total_steps, + 0, + sd_vid_gen_params->sample_params.scheduler, + sd_ctx->sd->version); + } if (high_noise_sample_steps < 0) { // timesteps ∝ sigmas for Flow models (like wan2.2 a14b) @@ -3636,7 +3659,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s int64_t t0 = ggml_time_ms(); // Apply lora - prompt = sd_ctx->sd->apply_loras_from_prompt(prompt); + sd_ctx->sd->apply_loras(sd_vid_gen_params->loras, sd_vid_gen_params->lora_count); ggml_tensor* init_latent = nullptr; ggml_tensor* clip_vision_output = nullptr; diff --git a/otherarch/sdcpp/stable-diffusion.h b/otherarch/sdcpp/stable-diffusion.h index e34cdec17..e4abc8dcd 100644 --- a/otherarch/sdcpp/stable-diffusion.h +++ b/otherarch/sdcpp/stable-diffusion.h @@ -150,6 +150,11 @@ typedef struct { float rel_size_y; } sd_tiling_params_t; +typedef struct { + const char* name; + const char* path; +} sd_embedding_t; + typedef struct { const char* model_path; const char* clip_l_path; @@ -164,7 +169,8 @@ typedef struct { const char* taesd_path; const char* control_net_path; const char* lora_model_dir; - const char* embedding_dir; + const sd_embedding_t* embeddings; + uint32_t embedding_count; const char* photo_maker_path; const char* tensor_type_rules; bool vae_decode_only; @@ -219,6 +225,8 @@ typedef struct { int sample_steps; float eta; int shifted_timestep; + float* custom_sigmas; + int custom_sigmas_count; } sd_sample_params_t; typedef struct { @@ -236,6 +244,14 @@ typedef struct { } sd_easycache_params_t; typedef struct { + bool is_high_noise; + float multiplier; + const char* path; +} sd_lora_t; + +typedef struct { + const sd_lora_t* loras; + uint32_t lora_count; const char* prompt; const char* negative_prompt; int clip_skip; @@ -259,6 +275,8 @@ typedef struct { } sd_img_gen_params_t; typedef struct { + const sd_lora_t* loras; + uint32_t lora_count; const char* prompt; const char* negative_prompt; int clip_skip; @@ -331,7 +349,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t; SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, bool offload_params_to_cpu, bool direct, - int n_threads); + int n_threads, + int tile_size); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, @@ -353,6 +372,9 @@ SD_API bool preprocess_canny(sd_image_t image, float strong, bool inverse); +SD_API const char* sd_commit(void); +SD_API const char* sd_version(void); + #ifdef __cplusplus } #endif diff --git a/otherarch/sdcpp/upscaler.cpp b/otherarch/sdcpp/upscaler.cpp index 62c0d29ad..29ac981e6 100644 --- a/otherarch/sdcpp/upscaler.cpp +++ b/otherarch/sdcpp/upscaler.cpp @@ -9,12 +9,15 @@ struct UpscalerGGML { std::shared_ptr esrgan_upscaler; std::string esrgan_path; int n_threads; - bool direct = false; + bool direct = false; + int tile_size = 128; UpscalerGGML(int n_threads, - bool direct = false) + bool direct = false, + int tile_size = 128) : n_threads(n_threads), - direct(direct) { + direct(direct), + tile_size(tile_size) { } bool load_from_file(const std::string& esrgan_path, @@ -51,7 +54,7 @@ struct UpscalerGGML { backend = ggml_backend_cpu_init(); } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, model_loader.get_tensor_storage_map()); + esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map()); if (direct) { esrgan_upscaler->set_conv2d_direct_enabled(true); } @@ -113,14 +116,15 @@ struct upscaler_ctx_t { upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, bool offload_params_to_cpu, bool direct, - int n_threads) { + int n_threads, + int tile_size) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == nullptr) { return nullptr; } std::string esrgan_path(esrgan_path_c_str); - upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct); + upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct, tile_size); if (upscaler_ctx->upscaler == nullptr) { return nullptr; } diff --git a/otherarch/sdcpp/util.cpp b/otherarch/sdcpp/util.cpp index 9d58a7ec2..fd0c60624 100644 --- a/otherarch/sdcpp/util.cpp +++ b/otherarch/sdcpp/util.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -98,18 +99,9 @@ bool is_directory(const std::string& path) { return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY)); } -std::string get_full_path(const std::string& dir, const std::string& filename) { - std::string full_path = dir + "\\" + filename; - - WIN32_FIND_DATA find_file_data; - HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data); - - if (hFind != INVALID_HANDLE_VALUE) { - FindClose(hFind); - return full_path; - } else { - return ""; - } +std::string sd_get_u8path(const std::string& file_path) +{ + return std::filesystem::u8path(file_path).string(); } #else // Unix @@ -126,24 +118,9 @@ bool is_directory(const std::string& path) { return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode)); } -// TODO: add windows version -std::string get_full_path(const std::string& dir, const std::string& filename) { - DIR* dp = opendir(dir.c_str()); - - if (dp != nullptr) { - struct dirent* entry; - - while ((entry = readdir(dp)) != nullptr) { - if (strcasecmp(entry->d_name, filename.c_str()) == 0) { - closedir(dp); - return dir + "/" + entry->d_name; - } - } - - closedir(dp); - } - - return ""; +std::string sd_get_u8path(const std::string& file_path) +{ + return std::filesystem::path(file_path).string(); } #endif diff --git a/otherarch/sdcpp/util.h b/otherarch/sdcpp/util.h index ac2283c62..b92b76071 100644 --- a/otherarch/sdcpp/util.h +++ b/otherarch/sdcpp/util.h @@ -22,13 +22,14 @@ int round_up_to(int value, int base); bool file_exists(const std::string& filename); bool is_directory(const std::string& path); -std::string get_full_path(const std::string& dir, const std::string& filename); std::u32string utf8_to_utf32(const std::string& utf8_str); std::string utf32_to_utf8(const std::u32string& utf32_str); std::u32string unicode_value_to_utf32(int unicode_value); // std::string sd_basename(const std::string& path); +std::string sd_get_u8path(const std::string& file_path); + typedef struct { uint32_t width; uint32_t height; diff --git a/otherarch/sdcpp/version.cpp b/otherarch/sdcpp/version.cpp new file mode 100644 index 000000000..97dc8426b --- /dev/null +++ b/otherarch/sdcpp/version.cpp @@ -0,0 +1,20 @@ +#include "stable-diffusion.h" + +#ifndef SDCPP_BUILD_COMMIT +#define SDCPP_BUILD_COMMIT unknown +#endif + +#ifndef SDCPP_BUILD_VERSION +#define SDCPP_BUILD_VERSION unknown +#endif + +#define STRINGIZE2(x) #x +#define STRINGIZE(x) STRINGIZE2(x) + +const char* sd_commit(void) { + return STRINGIZE(SDCPP_BUILD_COMMIT); +} + +const char* sd_version(void) { + return STRINGIZE(SDCPP_BUILD_VERSION); +}