#include #include #include #include #include #include #include #include #include #include // #include "preprocessing.hpp" #include "stable-diffusion.h" #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION #include "stb_image_write.h" #define STB_IMAGE_RESIZE_IMPLEMENTATION #include "stb_image_resize.h" #define SAFE_STR(s) ((s) ? (s) : "") #define BOOL_STR(b) ((b) ? "true" : "false") const char* modes_str[] = { "img_gen", "vid_gen", "convert", }; #define SD_ALL_MODES_STR "img_gen, vid_gen, convert" enum SDMode { IMG_GEN, VID_GEN, CONVERT, MODE_COUNT }; struct SDParams { int n_threads = -1; SDMode mode = IMG_GEN; std::string model_path; std::string clip_l_path; std::string clip_g_path; std::string t5xxl_path; std::string diffusion_model_path; std::string vae_path; std::string taesd_path; std::string esrgan_path; std::string control_net_path; std::string embedding_dir; std::string stacked_id_embed_dir; std::string input_id_images_path; sd_type_t wtype = SD_TYPE_COUNT; std::string tensor_type_rules; std::string lora_model_dir; std::string output_path = "output.png"; std::string input_path; std::string mask_path; std::string control_image_path; std::vector ref_image_paths; std::string prompt; std::string negative_prompt; float min_cfg = 1.0f; float cfg_scale = 7.0f; float img_cfg_scale = INFINITY; float guidance = 3.5f; float eta = 0.f; float style_ratio = 20.f; int clip_skip = -1; // <= 0 represents unspecified int width = 512; int height = 512; int batch_count = 1; int video_frames = 6; int motion_bucket_id = 127; int fps = 6; float augmentation_level = 0.f; sample_method_t sample_method = EULER_A; schedule_t schedule = DEFAULT; int sample_steps = 20; float strength = 0.75f; float control_strength = 0.9f; rng_type_t rng_type = CUDA_RNG; int64_t seed = 42; bool verbose = false; bool vae_tiling = false; bool control_net_cpu = false; bool normalize_input = false; bool clip_on_cpu = false; bool vae_on_cpu = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; bool vae_conv_direct = false; bool canny_preprocess = false; bool color = false; int upscale_repeats = 1; std::vector skip_layers = {7, 8, 9}; float slg_scale = 0.f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; bool chroma_use_dit_mask = true; bool chroma_use_t5_mask = false; int chroma_t5_mask_pad = 1; }; void print_params(SDParams params) { printf("Option: \n"); printf(" n_threads: %d\n", params.n_threads); printf(" mode: %s\n", modes_str[params.mode]); printf(" model_path: %s\n", params.model_path.c_str()); printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified"); printf(" clip_l_path: %s\n", params.clip_l_path.c_str()); printf(" clip_g_path: %s\n", params.clip_g_path.c_str()); printf(" t5xxl_path: %s\n", params.t5xxl_path.c_str()); printf(" diffusion_model_path: %s\n", params.diffusion_model_path.c_str()); printf(" vae_path: %s\n", params.vae_path.c_str()); printf(" taesd_path: %s\n", params.taesd_path.c_str()); printf(" esrgan_path: %s\n", params.esrgan_path.c_str()); printf(" control_net_path: %s\n", params.control_net_path.c_str()); printf(" embedding_dir: %s\n", params.embedding_dir.c_str()); printf(" stacked_id_embed_dir: %s\n", params.stacked_id_embed_dir.c_str()); printf(" input_id_images_path: %s\n", params.input_id_images_path.c_str()); printf(" style ratio: %.2f\n", params.style_ratio); printf(" normalize input image : %s\n", params.normalize_input ? "true" : "false"); printf(" output_path: %s\n", params.output_path.c_str()); printf(" init_img: %s\n", params.input_path.c_str()); printf(" mask_img: %s\n", params.mask_path.c_str()); printf(" control_image: %s\n", params.control_image_path.c_str()); printf(" ref_images_paths:\n"); for (auto& path : params.ref_image_paths) { printf(" %s\n", path.c_str()); }; printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false"); printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false"); printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false"); printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false"); printf(" diffusion Conv2d direct:%s\n", params.diffusion_conv_direct ? "true" : "false"); printf(" vae Conv2d direct:%s\n", params.vae_conv_direct ? "true" : "false"); printf(" strength(control): %.2f\n", params.control_strength); printf(" prompt: %s\n", params.prompt.c_str()); printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); printf(" min_cfg: %.2f\n", params.min_cfg); printf(" cfg_scale: %.2f\n", params.cfg_scale); printf(" img_cfg_scale: %.2f\n", params.img_cfg_scale); printf(" slg_scale: %.2f\n", params.slg_scale); printf(" guidance: %.2f\n", params.guidance); printf(" eta: %.2f\n", params.eta); printf(" clip_skip: %d\n", params.clip_skip); printf(" width: %d\n", params.width); printf(" height: %d\n", params.height); printf(" sample_method: %s\n", sd_sample_method_name(params.sample_method)); printf(" schedule: %s\n", sd_schedule_name(params.schedule)); printf(" sample_steps: %d\n", params.sample_steps); printf(" strength(img2img): %.2f\n", params.strength); printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); printf(" seed: %ld\n", params.seed); printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false"); printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false"); printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad); } void print_usage(int argc, const char* argv[]) { printf("usage: %s [arguments]\n", argv[0]); printf("\n"); printf("arguments:\n"); printf(" -h, --help show this help message and exit\n"); printf(" -M, --mode [MODE] run mode, one of: [img_gen, convert], default: img_gen\n"); printf(" -t, --threads N number of threads to use during computation (default: -1)\n"); printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n"); printf(" -m, --model [MODEL] path to full model\n"); printf(" --diffusion-model path to the standalone diffusion model\n"); printf(" --clip_l path to the clip-l text encoder\n"); printf(" --clip_g path to the clip-g text encoder\n"); printf(" --t5xxl path to the t5xxl text encoder\n"); printf(" --vae [VAE] path to vae\n"); printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); printf(" --control-net [CONTROL_PATH] path to control net model\n"); printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n"); printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n"); printf(" --input-id-images-dir [DIR] path to PHOTOMAKER input id images dir\n"); printf(" --normalize-input normalize PHOTOMAKER input id images\n"); printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n"); printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"); printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n"); printf(" If not specified, the default is the type of the weight file\n"); printf(" --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: \"^vae\\.=f16,model\\.=q8_0\")\n"); printf(" --lora-model-dir [DIR] lora model directory\n"); printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n"); printf(" --mask [MASK] path to the mask image, required by img2img with mask\n"); printf(" --control-image [IMAGE] path to image condition, control net\n"); printf(" -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times) \n"); printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n"); printf(" -p, --prompt [PROMPT] the prompt to render\n"); printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); printf(" --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n"); printf(" --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)\n"); printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n"); printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n"); printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n"); printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n"); printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"); printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20)\n"); printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"); printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); printf(" -W, --width W image width, in pixel space (default: 512)\n"); printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); printf(" sampling method (default: \"euler_a\")\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); printf(" -b, --batch-count COUNT number of images to generate\n"); printf(" --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)\n"); printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); printf(" --vae-tiling process vae in tiles to reduce memory usage\n"); printf(" --vae-on-cpu keep vae in cpu (for low vram)\n"); printf(" --clip-on-cpu keep clip in cpu (for low vram)\n"); printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n"); printf(" Might lower quality, since it implies converting k and v to f16.\n"); printf(" This might crash if it is not supported by the backend.\n"); printf(" --diffusion-conv-direct use Conv2d direct in the diffusion model"); printf(" This might crash if it is not supported by the backend.\n"); printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)"); printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color colors the logging tags according to level\n"); printf(" --chroma-disable-dit-mask disable dit mask for chroma\n"); printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n"); printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n"); printf(" -v, --verbose print extra info\n"); } struct StringOption { std::string short_name; std::string long_name; std::string desc; std::string* target; }; struct IntOption { std::string short_name; std::string long_name; std::string desc; int* target; }; struct FloatOption { std::string short_name; std::string long_name; std::string desc; float* target; }; struct BoolOption { std::string short_name; std::string long_name; std::string desc; bool keep_true; bool* target; }; struct ManualOption { std::string short_name; std::string long_name; std::string desc; std::function cb; }; struct ArgOptions { std::vector string_options; std::vector int_options; std::vector float_options; std::vector bool_options; std::vector manual_options; }; bool parse_options(int argc, const char** argv, ArgOptions& options) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { arg = argv[i]; for (auto& option : options.string_options) { if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { if (++i >= argc) { invalid_arg = true; break; } *option.target = std::string(argv[i]); } } if (invalid_arg) { break; } for (auto& option : options.int_options) { if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { if (++i >= argc) { invalid_arg = true; break; } *option.target = std::stoi(argv[i]); } } if (invalid_arg) { break; } for (auto& option : options.float_options) { if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { if (++i >= argc) { invalid_arg = true; break; } *option.target = std::stof(argv[i]); } } if (invalid_arg) { break; } for (auto& option : options.bool_options) { if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { if (option.keep_true) { *option.target = true; } else { *option.target = false; } } } if (invalid_arg) { break; } for (auto& option : options.manual_options) { if ((option.short_name.size() > 0 && arg == option.short_name) || (option.long_name.size() > 0 && arg == option.long_name)) { int ret = option.cb(argc, argv, i); if (ret < 0) { invalid_arg = true; break; } i += ret; } } if (invalid_arg) { break; } } if (invalid_arg) { fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); return false; } return true; } void parse_args(int argc, const char** argv, SDParams& params) { ArgOptions options; options.string_options = { {"-m", "--model", "", ¶ms.model_path}, {"", "--clip_l", "", ¶ms.clip_l_path}, {"", "--clip_g", "", ¶ms.clip_g_path}, {"", "--t5xxl", "", ¶ms.t5xxl_path}, {"", "--diffusion-model", "", ¶ms.diffusion_model_path}, {"", "--vae", "", ¶ms.vae_path}, {"", "--taesd", "", ¶ms.taesd_path}, {"", "--control-net", "", ¶ms.control_net_path}, {"", "--embd-dir", "", ¶ms.embedding_dir}, {"", "--stacked-id-embd-dir", "", ¶ms.stacked_id_embed_dir}, {"", "--lora-model-dir", "", ¶ms.lora_model_dir}, {"-i", "--init-img", "", ¶ms.input_path}, {"", "--tensor-type-rules", "", ¶ms.tensor_type_rules}, {"", "--input-id-images-dir", "", ¶ms.input_id_images_path}, {"", "--mask", "", ¶ms.mask_path}, {"", "--control-image", "", ¶ms.control_image_path}, {"-o", "--output", "", ¶ms.output_path}, {"-p", "--prompt", "", ¶ms.prompt}, {"-n", "--negative-prompt", "", ¶ms.negative_prompt}, {"", "--upscale-model", "", ¶ms.esrgan_path}, }; options.int_options = { {"-t", "--threads", "", ¶ms.n_threads}, {"", "--upscale-repeats", "", ¶ms.upscale_repeats}, {"-H", "--height", "", ¶ms.height}, {"-W", "--width", "", ¶ms.width}, {"", "--steps", "", ¶ms.sample_steps}, {"", "--clip-skip", "", ¶ms.clip_skip}, {"-b", "--batch-count", "", ¶ms.batch_count}, {"", "--chroma-t5-mask-pad", "", ¶ms.chroma_t5_mask_pad}, }; options.float_options = { {"", "--cfg-scale", "", ¶ms.cfg_scale}, {"", "--img-cfg-scale", "", ¶ms.img_cfg_scale}, {"", "--guidance", "", ¶ms.guidance}, {"", "--eta", "", ¶ms.eta}, {"", "--strength", "", ¶ms.strength}, {"", "--style-ratio", "", ¶ms.style_ratio}, {"", "--control-strength", "", ¶ms.control_strength}, {"", "--slg-scale", "", ¶ms.slg_scale}, {"", "--skip-layer-start", "", ¶ms.skip_layer_start}, {"", "--skip-layer-end", "", ¶ms.skip_layer_end}, }; options.bool_options = { {"", "--vae-tiling", "", true, ¶ms.vae_tiling}, {"", "--control-net-cpu", "", true, ¶ms.control_net_cpu}, {"", "--normalize-input", "", true, ¶ms.normalize_input}, {"", "--clip-on-cpu", "", true, ¶ms.clip_on_cpu}, {"", "--vae-on-cpu", "", true, ¶ms.vae_on_cpu}, {"", "--diffusion-fa", "", true, ¶ms.diffusion_flash_attn}, {"", "--diffusion-conv-direct", "", true, ¶ms.diffusion_conv_direct}, {"", "--vae-conv-direct", "", true, ¶ms.vae_conv_direct}, {"", "--canny", "", true, ¶ms.canny_preprocess}, {"-v", "--verbos", "", true, ¶ms.verbose}, {"", "--color", "", true, ¶ms.color}, {"", "--chroma-disable-dit-mask", "", false, ¶ms.chroma_use_dit_mask}, {"", "--chroma-enable-t5-mask", "", true, ¶ms.chroma_use_t5_mask}, }; auto on_mode_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* mode = argv[index]; if (mode != NULL) { int mode_found = -1; for (int i = 0; i < MODE_COUNT; i++) { if (!strcmp(mode, modes_str[i])) { mode_found = i; } } if (mode_found == -1) { fprintf(stderr, "error: invalid mode %s, must be one of [%s]\n", mode, SD_ALL_MODES_STR); exit(1); } params.mode = (SDMode)mode_found; } return 1; }; auto on_type_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; params.wtype = str_to_sd_type(arg); if (params.wtype == SD_TYPE_COUNT) { fprintf(stderr, "error: invalid weight format %s\n", arg); return -1; } return 1; }; auto on_rng_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; params.rng_type = str_to_rng_type(arg); if (params.rng_type == RNG_TYPE_COUNT) { fprintf(stderr, "error: invalid rng type %s\n", arg); return -1; } return 1; }; auto on_schedule_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; params.schedule = str_to_schedule(arg); if (params.schedule == SCHEDULE_COUNT) { fprintf(stderr, "error: invalid schedule %s\n", arg); return -1; } return 1; }; auto on_sample_method_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } const char* arg = argv[index]; params.sample_method = str_to_sample_method(arg); if (params.sample_method == SAMPLE_METHOD_COUNT) { fprintf(stderr, "error: invalid sample method %s\n", arg); return -1; } return 1; }; auto on_seed_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } params.seed = std::stoll(argv[index]); return 1; }; auto on_help_arg = [&](int argc, const char** argv, int index) { print_usage(argc, argv); exit(0); return 0; }; auto on_skip_layers_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } std::string layers_str = argv[index]; if (layers_str[0] != '[' || layers_str[layers_str.size() - 1] != ']') { return -1; } layers_str = layers_str.substr(1, layers_str.size() - 2); std::regex regex("[, ]+"); std::sregex_token_iterator iter(layers_str.begin(), layers_str.end(), regex, -1); std::sregex_token_iterator end; std::vector tokens(iter, end); std::vector layers; for (const auto& token : tokens) { try { layers.push_back(std::stoi(token)); } catch (const std::invalid_argument& e) { return -1; } } params.skip_layers = layers; return 1; }; auto on_ref_image_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; } params.ref_image_paths.push_back(argv[index]); return 1; }; options.manual_options = { {"-M", "--mode", "", on_mode_arg}, {"", "--type", "", on_type_arg}, {"", "--rng", "", on_rng_arg}, {"-s", "--seed", "", on_seed_arg}, {"", "--sampling-method", "", on_sample_method_arg}, {"", "--schedule", "", on_schedule_arg}, {"", "--skip-layers", "", on_skip_layers_arg}, {"-r", "--ref-image", "", on_ref_image_arg}, {"-h", "--help", "", on_help_arg}, }; if (!parse_options(argc, argv, options)) { print_usage(argc, argv); exit(1); } if (params.n_threads <= 0) { params.n_threads = sd_get_num_physical_cores(); } if (params.mode != CONVERT && params.mode != VID_GEN && params.prompt.length() == 0) { fprintf(stderr, "error: the following arguments are required: prompt\n"); print_usage(argc, argv); exit(1); } if (params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) { fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n"); print_usage(argc, argv); exit(1); } if (params.output_path.length() == 0) { fprintf(stderr, "error: the following arguments are required: output_path\n"); print_usage(argc, argv); exit(1); } if (params.width <= 0) { fprintf(stderr, "error: the width must be greater than 0\n"); exit(1); } if (params.height <= 0) { fprintf(stderr, "error: the height must be greater than 0\n"); exit(1); } if (params.sample_steps <= 0) { fprintf(stderr, "error: the sample_steps must be greater than 0\n"); exit(1); } if (params.strength < 0.f || params.strength > 1.f) { fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); exit(1); } if (params.mode != CONVERT && params.tensor_type_rules.size() > 0) { fprintf(stderr, "warning: --tensor-type-rules is currently supported only for conversion\n"); } if (params.upscale_repeats < 1) { fprintf(stderr, "error: upscale multiplier must be at least 1\n"); exit(1); } if (params.seed < 0) { srand((int)time(NULL)); params.seed = rand(); } if (params.mode == CONVERT) { if (params.output_path == "output.png") { params.output_path = "output.gguf"; } } if (!isfinite(params.img_cfg_scale)) { params.img_cfg_scale = params.cfg_scale; } } static std::string sd_basename(const std::string& path) { size_t pos = path.find_last_of('/'); if (pos != std::string::npos) { return path.substr(pos + 1); } pos = path.find_last_of('\\'); if (pos != std::string::npos) { return path.substr(pos + 1); } return path; } std::string get_image_params(SDParams params, int64_t seed) { std::string parameter_string = params.prompt + "\n"; if (params.negative_prompt.size() != 0) { parameter_string += "Negative prompt: " + params.negative_prompt + "\n"; } parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", "; parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", "; if (params.slg_scale != 0 && params.skip_layers.size() != 0) { parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", "; parameter_string += "Skip layers: ["; for (const auto& layer : params.skip_layers) { parameter_string += std::to_string(layer) + ", "; } parameter_string += "], "; parameter_string += "Skip layer start: " + std::to_string(params.skip_layer_start) + ", "; parameter_string += "Skip layer end: " + std::to_string(params.skip_layer_end) + ", "; } parameter_string += "Guidance: " + std::to_string(params.guidance) + ", "; parameter_string += "Eta: " + std::to_string(params.eta) + ", "; parameter_string += "Seed: " + std::to_string(seed) + ", "; parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", "; parameter_string += "Model: " + sd_basename(params.model_path) + ", "; parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", "; parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_method)); if (params.schedule != DEFAULT) { parameter_string += " " + std::string(sd_schedule_name(params.schedule)); } parameter_string += ", "; for (const auto& te : {params.clip_l_path, params.clip_g_path, params.t5xxl_path}) { if (!te.empty()) { parameter_string += "TE: " + sd_basename(te) + ", "; } } if (!params.diffusion_model_path.empty()) { parameter_string += "Unet: " + sd_basename(params.diffusion_model_path) + ", "; } if (!params.vae_path.empty()) { parameter_string += "VAE: " + sd_basename(params.vae_path) + ", "; } if (params.clip_skip != -1) { parameter_string += "Clip skip: " + std::to_string(params.clip_skip) + ", "; } parameter_string += "Version: stable-diffusion.cpp"; return parameter_string; } /* Enables Printing the log level tag in color using ANSI escape codes */ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { SDParams* params = (SDParams*)data; int tag_color; const char* level_str; FILE* out_stream = (level == SD_LOG_ERROR) ? stderr : stdout; if (!log || (!params->verbose && level <= SD_LOG_DEBUG)) { return; } switch (level) { case SD_LOG_DEBUG: tag_color = 37; level_str = "DEBUG"; break; case SD_LOG_INFO: tag_color = 34; level_str = "INFO"; break; case SD_LOG_WARN: tag_color = 35; level_str = "WARN"; break; case SD_LOG_ERROR: tag_color = 31; level_str = "ERROR"; break; default: /* Potential future-proofing */ tag_color = 33; level_str = "?????"; break; } if (params->color == true) { fprintf(out_stream, "\033[%d;1m[%-5s]\033[0m ", tag_color, level_str); } else { fprintf(out_stream, "[%-5s] ", level_str); } fputs(log, out_stream); fflush(out_stream); } int main(int argc, const char* argv[]) { SDParams params; parse_args(argc, argv, params); sd_guidance_params_t guidance_params = {params.cfg_scale, params.img_cfg_scale, params.min_cfg, params.guidance, { params.skip_layers.data(), params.skip_layers.size(), params.skip_layer_start, params.skip_layer_end, params.slg_scale, }}; sd_set_log_callback(sd_log_cb, (void*)¶ms); if (params.verbose) { print_params(params); printf("%s", sd_get_system_info()); } if (params.mode == CONVERT) { bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype, params.tensor_type_rules.c_str()); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str()); return 1; } else { printf("convert '%s'/'%s' to '%s' success\n", params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str()); return 0; } } if (params.mode == VID_GEN) { fprintf(stderr, "SVD support is broken, do not use it!!!\n"); return 1; } bool vae_decode_only = true; uint8_t* input_image_buffer = NULL; uint8_t* control_image_buffer = NULL; uint8_t* mask_image_buffer = NULL; std::vector ref_images; if (params.input_path.size() > 0) { vae_decode_only = false; int c = 0; int width = 0; int height = 0; input_image_buffer = stbi_load(params.input_path.c_str(), &width, &height, &c, 3); if (input_image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); return 1; } if (c < 3) { fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c); free(input_image_buffer); return 1; } if (width <= 0) { fprintf(stderr, "error: the width of image must be greater than 0\n"); free(input_image_buffer); return 1; } if (height <= 0) { fprintf(stderr, "error: the height of image must be greater than 0\n"); free(input_image_buffer); return 1; } // Resize input image ... if (params.height != height || params.width != width) { printf("resize input image from %dx%d to %dx%d\n", width, height, params.width, params.height); int resized_height = params.height; int resized_width = params.width; uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * 3); if (resized_image_buffer == NULL) { fprintf(stderr, "error: allocate memory for resize input image\n"); free(input_image_buffer); return 1; } stbir_resize(input_image_buffer, width, height, 0, resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8, 3 /*RGB channel*/, STBIR_ALPHA_CHANNEL_NONE, 0, STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_FILTER_BOX, STBIR_FILTER_BOX, STBIR_COLORSPACE_SRGB, nullptr); // Save resized result free(input_image_buffer); input_image_buffer = resized_image_buffer; } } else if (params.ref_image_paths.size() > 0) { vae_decode_only = false; for (auto& path : params.ref_image_paths) { int c = 0; int width = 0; int height = 0; uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3); if (image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", path.c_str()); return 1; } if (c < 3) { fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c); free(image_buffer); return 1; } if (width <= 0) { fprintf(stderr, "error: the width of image must be greater than 0\n"); free(image_buffer); return 1; } if (height <= 0) { fprintf(stderr, "error: the height of image must be greater than 0\n"); free(image_buffer); return 1; } ref_images.push_back({(uint32_t)width, (uint32_t)height, 3, image_buffer}); } } sd_ctx_params_t sd_ctx_params = { params.model_path.c_str(), params.clip_l_path.c_str(), params.clip_g_path.c_str(), params.t5xxl_path.c_str(), params.diffusion_model_path.c_str(), params.vae_path.c_str(), params.taesd_path.c_str(), params.control_net_path.c_str(), params.lora_model_dir.c_str(), params.embedding_dir.c_str(), params.stacked_id_embed_dir.c_str(), vae_decode_only, params.vae_tiling, true, params.n_threads, params.wtype, params.rng_type, params.schedule, params.clip_on_cpu, params.control_net_cpu, params.vae_on_cpu, params.diffusion_flash_attn, params.diffusion_conv_direct, params.vae_conv_direct, params.chroma_use_dit_mask, params.chroma_use_t5_mask, params.chroma_t5_mask_pad, }; sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); return 1; } sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, 3, input_image_buffer}; sd_image_t* control_image = NULL; if (params.control_net_path.size() > 0 && params.control_image_path.size() > 0) { int c = 0; control_image_buffer = stbi_load(params.control_image_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); if (control_image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str()); return 1; } control_image = new sd_image_t{(uint32_t)params.width, (uint32_t)params.height, 3, control_image_buffer}; if (params.canny_preprocess) { // apply preprocessor control_image->data = preprocess_canny(control_image->data, control_image->width, control_image->height, 0.08f, 0.08f, 0.8f, 1.0f, false); } } std::vector default_mask_image_vec(params.width * params.height, 255); if (params.mask_path != "") { int c = 0; mask_image_buffer = stbi_load(params.mask_path.c_str(), ¶ms.width, ¶ms.height, &c, 1); } else { mask_image_buffer = default_mask_image_vec.data(); } sd_image_t mask_image = {(uint32_t)params.width, (uint32_t)params.height, 1, mask_image_buffer}; sd_image_t* results; int expected_num_results = 1; if (params.mode == IMG_GEN) { sd_img_gen_params_t img_gen_params = { params.prompt.c_str(), params.negative_prompt.c_str(), params.clip_skip, guidance_params, input_image, ref_images.data(), (int)ref_images.size(), mask_image, params.width, params.height, params.sample_method, params.sample_steps, params.eta, params.strength, params.seed, params.batch_count, control_image, params.control_strength, params.style_ratio, params.normalize_input, params.input_id_images_path.c_str(), }; kcpp_img_gen_params_t extra_params; extra_params.photomaker_reference_count = 0; extra_params.photomaker_references = nullptr; results = generate_image(sd_ctx, &img_gen_params, &extra_params); expected_num_results = params.batch_count; } else if (params.mode == VID_GEN) { sd_vid_gen_params_t vid_gen_params = { input_image, params.width, params.height, guidance_params, params.sample_method, params.sample_steps, params.strength, params.seed, params.video_frames, params.motion_bucket_id, params.fps, params.augmentation_level, }; results = generate_video(sd_ctx, &vid_gen_params); expected_num_results = params.video_frames; } if (results == NULL) { printf("generate failed\n"); free_sd_ctx(sd_ctx); return 1; } int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.n_threads, params.diffusion_conv_direct); if (upscaler_ctx == NULL) { printf("new_upscaler_ctx failed\n"); } else { for (int i = 0; i < params.batch_count; i++) { if (results[i].data == NULL) { continue; } sd_image_t current_image = results[i]; for (int u = 0; u < params.upscale_repeats; ++u) { sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor); if (upscaled_image.data == NULL) { printf("upscale failed\n"); break; } free(current_image.data); current_image = upscaled_image; } results[i] = current_image; // Set the final upscaled image as the result } } } std::string dummy_name, ext, lc_ext; bool is_jpg; size_t last = params.output_path.find_last_of("."); size_t last_path = std::min(params.output_path.find_last_of("/"), params.output_path.find_last_of("\\")); if (last != std::string::npos // filename has extension && (last_path == std::string::npos || last > last_path)) { dummy_name = params.output_path.substr(0, last); ext = lc_ext = params.output_path.substr(last); std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower); is_jpg = lc_ext == ".jpg" || lc_ext == ".jpeg" || lc_ext == ".jpe"; } else { dummy_name = params.output_path; ext = lc_ext = ""; is_jpg = false; } // appending ".png" to absent or unknown extension if (!is_jpg && lc_ext != ".png") { dummy_name += ext; ext = ".png"; } for (int i = 0; i < expected_num_results; i++) { if (results[i].data == NULL) { continue; } std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext; if (is_jpg) { stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, results[i].data, 90); printf("save result JPEG image to '%s'\n", final_image_path.c_str()); } else { stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, results[i].data, 0, get_image_params(params, params.seed + i).c_str()); printf("save result PNG image to '%s'\n", final_image_path.c_str()); } free(results[i].data); results[i].data = NULL; } free(results); free_sd_ctx(sd_ctx); free(control_image_buffer); free(input_image_buffer); return 0; }