mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-13 15:39:11 +00:00
sync sd.cpp to e370258
This commit is contained in:
parent
49535fdfed
commit
fab2ff0687
3 changed files with 276 additions and 146 deletions
|
|
@ -41,13 +41,15 @@ const char* modes_str[] = {
|
|||
"img_gen",
|
||||
"vid_gen",
|
||||
"convert",
|
||||
"upscale",
|
||||
};
|
||||
#define SD_ALL_MODES_STR "img_gen, vid_gen, convert"
|
||||
#define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale"
|
||||
|
||||
enum SDMode {
|
||||
IMG_GEN,
|
||||
VID_GEN,
|
||||
CONVERT,
|
||||
UPSCALE,
|
||||
MODE_COUNT
|
||||
};
|
||||
|
||||
|
|
@ -82,6 +84,7 @@ struct SDParams {
|
|||
|
||||
std::string prompt;
|
||||
std::string negative_prompt;
|
||||
|
||||
int clip_skip = -1; // <= 0 represents unspecified
|
||||
int width = 512;
|
||||
int height = 512;
|
||||
|
|
@ -125,6 +128,8 @@ struct SDParams {
|
|||
int chroma_t5_mask_pad = 1;
|
||||
float flow_shift = INFINITY;
|
||||
|
||||
prediction_t prediction = DEFAULT_PRED;
|
||||
|
||||
sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
|
||||
|
||||
SDParams() {
|
||||
|
|
@ -186,6 +191,7 @@ void print_params(SDParams params) {
|
|||
printf(" sample_params: %s\n", SAFE_STR(sample_params_str));
|
||||
printf(" high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str));
|
||||
printf(" moe_boundary: %.3f\n", params.moe_boundary);
|
||||
printf(" prediction: %s\n", sd_prediction_name(params.prediction));
|
||||
printf(" flow_shift: %.2f\n", params.flow_shift);
|
||||
printf(" strength(img2img): %.2f\n", params.strength);
|
||||
printf(" rng: %s\n", sd_rng_type_name(params.rng_type));
|
||||
|
|
@ -208,7 +214,7 @@ void print_usage(int argc, const char* argv[]) {
|
|||
printf("\n");
|
||||
printf("arguments:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, convert], default: img_gen\n");
|
||||
printf(" -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen\n");
|
||||
printf(" -t, --threads N number of threads to use during computation (default: -1)\n");
|
||||
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
|
||||
printf(" --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed\n");
|
||||
|
|
@ -225,7 +231,7 @@ void print_usage(int argc, const char* argv[]) {
|
|||
printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
|
||||
printf(" --control-net [CONTROL_PATH] path to control net model\n");
|
||||
printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n");
|
||||
printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
|
||||
printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
|
||||
printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n");
|
||||
printf(" --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)\n");
|
||||
printf(" If not specified, the default is the type of the weight file\n");
|
||||
|
|
@ -279,6 +285,7 @@ void print_usage(int argc, const char* argv[]) {
|
|||
printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
|
||||
printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
|
||||
printf(" -b, --batch-count COUNT number of images to generate\n");
|
||||
printf(" --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override.\n");
|
||||
printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
|
||||
printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
|
||||
printf(" --vae-tiling process vae in tiles to reduce memory usage\n");
|
||||
|
|
@ -649,6 +656,20 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
|||
return 1;
|
||||
};
|
||||
|
||||
auto on_prediction_arg = [&](int argc, const char** argv, int index) {
|
||||
if (++index >= argc) {
|
||||
return -1;
|
||||
}
|
||||
const char* arg = argv[index];
|
||||
params.prediction = str_to_prediction(arg);
|
||||
if (params.prediction == PREDICTION_COUNT) {
|
||||
fprintf(stderr, "error: invalid prediction type %s\n",
|
||||
arg);
|
||||
return -1;
|
||||
}
|
||||
return 1;
|
||||
};
|
||||
|
||||
auto on_sample_method_arg = [&](int argc, const char** argv, int index) {
|
||||
if (++index >= argc) {
|
||||
return -1;
|
||||
|
|
@ -805,6 +826,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
|||
{"", "--rng", "", on_rng_arg},
|
||||
{"-s", "--seed", "", on_seed_arg},
|
||||
{"", "--sampling-method", "", on_sample_method_arg},
|
||||
{"", "--prediction", "", on_prediction_arg},
|
||||
{"", "--scheduler", "", on_schedule_arg},
|
||||
{"", "--skip-layers", "", on_skip_layers_arg},
|
||||
{"", "--high-noise-sampling-method", "", on_high_noise_sample_method_arg},
|
||||
|
|
@ -825,13 +847,13 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
|||
params.n_threads = sd_get_num_physical_cores();
|
||||
}
|
||||
|
||||
if (params.mode != CONVERT && params.mode != VID_GEN && params.prompt.length() == 0) {
|
||||
if ((params.mode == IMG_GEN || params.mode == VID_GEN) && params.prompt.length() == 0) {
|
||||
fprintf(stderr, "error: the following arguments are required: prompt\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) {
|
||||
if (params.mode != UPSCALE && params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) {
|
||||
fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
|
|
@ -891,6 +913,17 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
|||
exit(1);
|
||||
}
|
||||
|
||||
if (params.mode == UPSCALE) {
|
||||
if (params.esrgan_path.length() == 0) {
|
||||
fprintf(stderr, "error: upscale mode needs an upscaler model (--upscale-model)\n");
|
||||
exit(1);
|
||||
}
|
||||
if (params.init_image_path.length() == 0) {
|
||||
fprintf(stderr, "error: upscale mode needs an init image (--init-img)\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (params.seed < 0) {
|
||||
srand((int)time(NULL));
|
||||
params.seed = rand();
|
||||
|
|
@ -901,14 +934,6 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
|||
params.output_path = "output.gguf";
|
||||
}
|
||||
}
|
||||
|
||||
if (!isfinite(params.sample_params.guidance.img_cfg)) {
|
||||
params.sample_params.guidance.img_cfg = params.sample_params.guidance.txt_cfg;
|
||||
}
|
||||
|
||||
if (!isfinite(params.high_noise_sample_params.guidance.img_cfg)) {
|
||||
params.high_noise_sample_params.guidance.img_cfg = params.high_noise_sample_params.guidance.txt_cfg;
|
||||
}
|
||||
}
|
||||
|
||||
static std::string sd_basename(const std::string& path) {
|
||||
|
|
@ -1349,6 +1374,7 @@ int main(int argc, const char* argv[]) {
|
|||
params.n_threads,
|
||||
params.wtype,
|
||||
params.rng_type,
|
||||
params.prediction,
|
||||
params.offload_params_to_cpu,
|
||||
params.clip_on_cpu,
|
||||
params.control_net_cpu,
|
||||
|
|
@ -1362,76 +1388,92 @@ int main(int argc, const char* argv[]) {
|
|||
params.flow_shift,
|
||||
};
|
||||
|
||||
sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
|
||||
sd_image_t* results = nullptr;
|
||||
int num_results = 0;
|
||||
|
||||
if (sd_ctx == NULL) {
|
||||
printf("new_sd_ctx_t failed\n");
|
||||
release_all_resources();
|
||||
return 1;
|
||||
}
|
||||
if (params.mode == UPSCALE) {
|
||||
num_results = 1;
|
||||
results = (sd_image_t*)calloc(num_results, sizeof(sd_image_t));
|
||||
if (results == NULL) {
|
||||
printf("failed to allocate results array\n");
|
||||
release_all_resources();
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.sample_params.sample_method == SAMPLE_METHOD_DEFAULT) {
|
||||
params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx);
|
||||
}
|
||||
results[0] = init_image;
|
||||
init_image.data = NULL;
|
||||
} else {
|
||||
sd_ctx_t* sd_ctx = new_sd_ctx(&sd_ctx_params);
|
||||
|
||||
sd_image_t* results;
|
||||
int num_results = 1;
|
||||
if (params.mode == IMG_GEN) {
|
||||
sd_img_gen_params_t img_gen_params = {
|
||||
params.prompt.c_str(),
|
||||
params.negative_prompt.c_str(),
|
||||
params.clip_skip,
|
||||
init_image,
|
||||
ref_images.data(),
|
||||
(int)ref_images.size(),
|
||||
params.increase_ref_index,
|
||||
mask_image,
|
||||
params.width,
|
||||
params.height,
|
||||
params.sample_params,
|
||||
params.strength,
|
||||
params.seed,
|
||||
params.batch_count,
|
||||
control_image,
|
||||
params.control_strength,
|
||||
{
|
||||
pmid_images.data(),
|
||||
(int)pmid_images.size(),
|
||||
params.pm_id_embed_path.c_str(),
|
||||
params.pm_style_strength,
|
||||
}, // pm_params
|
||||
params.vae_tiling_params,
|
||||
};
|
||||
if (sd_ctx == NULL) {
|
||||
printf("new_sd_ctx_t failed\n");
|
||||
release_all_resources();
|
||||
return 1;
|
||||
}
|
||||
|
||||
results = generate_image(sd_ctx, &img_gen_params);
|
||||
num_results = params.batch_count;
|
||||
} else if (params.mode == VID_GEN) {
|
||||
sd_vid_gen_params_t vid_gen_params = {
|
||||
params.prompt.c_str(),
|
||||
params.negative_prompt.c_str(),
|
||||
params.clip_skip,
|
||||
init_image,
|
||||
end_image,
|
||||
control_frames.data(),
|
||||
(int)control_frames.size(),
|
||||
params.width,
|
||||
params.height,
|
||||
params.sample_params,
|
||||
params.high_noise_sample_params,
|
||||
params.moe_boundary,
|
||||
params.strength,
|
||||
params.seed,
|
||||
params.video_frames,
|
||||
params.vace_strength,
|
||||
};
|
||||
if (params.sample_params.sample_method == SAMPLE_METHOD_DEFAULT) {
|
||||
params.sample_params.sample_method = sd_get_default_sample_method(sd_ctx);
|
||||
}
|
||||
|
||||
results = generate_video(sd_ctx, &vid_gen_params, &num_results);
|
||||
}
|
||||
if (params.mode == IMG_GEN) {
|
||||
sd_img_gen_params_t img_gen_params = {
|
||||
params.prompt.c_str(),
|
||||
params.negative_prompt.c_str(),
|
||||
params.clip_skip,
|
||||
init_image,
|
||||
ref_images.data(),
|
||||
(int)ref_images.size(),
|
||||
params.increase_ref_index,
|
||||
mask_image,
|
||||
params.width,
|
||||
params.height,
|
||||
params.sample_params,
|
||||
params.strength,
|
||||
params.seed,
|
||||
params.batch_count,
|
||||
control_image,
|
||||
params.control_strength,
|
||||
{
|
||||
pmid_images.data(),
|
||||
(int)pmid_images.size(),
|
||||
params.pm_id_embed_path.c_str(),
|
||||
params.pm_style_strength,
|
||||
}, // pm_params
|
||||
params.vae_tiling_params,
|
||||
};
|
||||
|
||||
results = generate_image(sd_ctx, &img_gen_params);
|
||||
num_results = params.batch_count;
|
||||
} else if (params.mode == VID_GEN) {
|
||||
sd_vid_gen_params_t vid_gen_params = {
|
||||
params.prompt.c_str(),
|
||||
params.negative_prompt.c_str(),
|
||||
params.clip_skip,
|
||||
init_image,
|
||||
end_image,
|
||||
control_frames.data(),
|
||||
(int)control_frames.size(),
|
||||
params.width,
|
||||
params.height,
|
||||
params.sample_params,
|
||||
params.high_noise_sample_params,
|
||||
params.moe_boundary,
|
||||
params.strength,
|
||||
params.seed,
|
||||
params.video_frames,
|
||||
params.vace_strength,
|
||||
};
|
||||
|
||||
results = generate_video(sd_ctx, &vid_gen_params, &num_results);
|
||||
}
|
||||
|
||||
if (results == NULL) {
|
||||
printf("generate failed\n");
|
||||
free_sd_ctx(sd_ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (results == NULL) {
|
||||
printf("generate failed\n");
|
||||
free_sd_ctx(sd_ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth
|
||||
|
|
@ -1444,7 +1486,7 @@ int main(int argc, const char* argv[]) {
|
|||
if (upscaler_ctx == NULL) {
|
||||
printf("new_upscaler_ctx failed\n");
|
||||
} else {
|
||||
for (int i = 0; i < params.batch_count; i++) {
|
||||
for (int i = 0; i < num_results; i++) {
|
||||
if (results[i].data == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -1530,7 +1572,6 @@ int main(int argc, const char* argv[]) {
|
|||
results[i].data = NULL;
|
||||
}
|
||||
free(results);
|
||||
free_sd_ctx(sd_ctx);
|
||||
|
||||
release_all_resources();
|
||||
|
||||
|
|
|
|||
|
|
@ -836,64 +836,102 @@ public:
|
|||
ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM");
|
||||
}
|
||||
|
||||
// check is_using_v_parameterization_for_sd2
|
||||
if (sd_version_is_sd2(version)) {
|
||||
if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
|
||||
is_using_v_parameterization = true;
|
||||
}
|
||||
} else if (sd_version_is_sdxl(version)) {
|
||||
if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
|
||||
// CosXL models
|
||||
// TODO: get sigma_min and sigma_max values from file
|
||||
is_using_edm_v_parameterization = true;
|
||||
}
|
||||
if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
|
||||
is_using_v_parameterization = true;
|
||||
}
|
||||
} else if (version == VERSION_SVD) {
|
||||
// TODO: V_PREDICTION_EDM
|
||||
is_using_v_parameterization = true;
|
||||
}
|
||||
|
||||
if (sd_version_is_sd3(version)) {
|
||||
LOG_INFO("running in FLOW mode");
|
||||
float shift = sd_ctx_params->flow_shift;
|
||||
if (shift == INFINITY) {
|
||||
shift = 3.0;
|
||||
}
|
||||
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
|
||||
} else if (sd_version_is_flux(version)) {
|
||||
LOG_INFO("running in Flux FLOW mode");
|
||||
float shift = 1.0f; // TODO: validate
|
||||
for (auto pair : model_loader.tensor_storages_types) {
|
||||
if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
|
||||
shift = 1.15f;
|
||||
if (sd_ctx_params->prediction != DEFAULT_PRED) {
|
||||
switch (sd_ctx_params->prediction) {
|
||||
case EPS_PRED:
|
||||
LOG_INFO("running in eps-prediction mode");
|
||||
break;
|
||||
case V_PRED:
|
||||
LOG_INFO("running in v-prediction mode");
|
||||
denoiser = std::make_shared<CompVisVDenoiser>();
|
||||
break;
|
||||
case EDM_V_PRED:
|
||||
LOG_INFO("running in v-prediction EDM mode");
|
||||
denoiser = std::make_shared<EDMVDenoiser>();
|
||||
break;
|
||||
case SD3_FLOW_PRED: {
|
||||
LOG_INFO("running in FLOW mode");
|
||||
float shift = sd_ctx_params->flow_shift;
|
||||
if (shift == INFINITY) {
|
||||
shift = 3.0;
|
||||
}
|
||||
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
|
||||
break;
|
||||
}
|
||||
case FLUX_FLOW_PRED: {
|
||||
LOG_INFO("running in Flux FLOW mode");
|
||||
float shift = sd_ctx_params->flow_shift;
|
||||
if (shift == INFINITY) {
|
||||
shift = 3.0;
|
||||
}
|
||||
denoiser = std::make_shared<FluxFlowDenoiser>(shift);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
LOG_ERROR("Unknown parametrization %i", sd_ctx_params->prediction);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
denoiser = std::make_shared<FluxFlowDenoiser>(shift);
|
||||
} else if (sd_version_is_wan(version)) {
|
||||
LOG_INFO("running in FLOW mode");
|
||||
float shift = sd_ctx_params->flow_shift;
|
||||
if (shift == INFINITY) {
|
||||
shift = 5.0;
|
||||
}
|
||||
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
|
||||
} else if (sd_version_is_qwen_image(version)) {
|
||||
LOG_INFO("running in FLOW mode");
|
||||
float shift = sd_ctx_params->flow_shift;
|
||||
if (shift == INFINITY) {
|
||||
shift = 3.0;
|
||||
}
|
||||
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
|
||||
} else if (is_using_v_parameterization) {
|
||||
LOG_INFO("running in v-prediction mode");
|
||||
denoiser = std::make_shared<CompVisVDenoiser>();
|
||||
} else if (is_using_edm_v_parameterization) {
|
||||
LOG_INFO("running in v-prediction EDM mode");
|
||||
denoiser = std::make_shared<EDMVDenoiser>();
|
||||
} else {
|
||||
LOG_INFO("running in eps-prediction mode");
|
||||
if (sd_version_is_sd2(version)) {
|
||||
// check is_using_v_parameterization_for_sd2
|
||||
if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
|
||||
is_using_v_parameterization = true;
|
||||
}
|
||||
} else if (sd_version_is_sdxl(version)) {
|
||||
if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
|
||||
// CosXL models
|
||||
// TODO: get sigma_min and sigma_max values from file
|
||||
is_using_edm_v_parameterization = true;
|
||||
}
|
||||
if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
|
||||
is_using_v_parameterization = true;
|
||||
}
|
||||
} else if (version == VERSION_SVD) {
|
||||
// TODO: V_PREDICTION_EDM
|
||||
is_using_v_parameterization = true;
|
||||
}
|
||||
|
||||
if (sd_version_is_sd3(version)) {
|
||||
LOG_INFO("running in FLOW mode");
|
||||
float shift = sd_ctx_params->flow_shift;
|
||||
if (shift == INFINITY) {
|
||||
shift = 3.0;
|
||||
}
|
||||
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
|
||||
} else if (sd_version_is_flux(version)) {
|
||||
LOG_INFO("running in Flux FLOW mode");
|
||||
float shift = 1.0f; // TODO: validate
|
||||
for (auto pair : model_loader.tensor_storages_types) {
|
||||
if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
|
||||
shift = 1.15f;
|
||||
break;
|
||||
}
|
||||
}
|
||||
denoiser = std::make_shared<FluxFlowDenoiser>(shift);
|
||||
} else if (sd_version_is_wan(version)) {
|
||||
LOG_INFO("running in FLOW mode");
|
||||
float shift = sd_ctx_params->flow_shift;
|
||||
if (shift == INFINITY) {
|
||||
shift = 5.0;
|
||||
}
|
||||
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
|
||||
} else if (sd_version_is_qwen_image(version)) {
|
||||
LOG_INFO("running in FLOW mode");
|
||||
float shift = sd_ctx_params->flow_shift;
|
||||
if (shift == INFINITY) {
|
||||
shift = 3.0;
|
||||
}
|
||||
denoiser = std::make_shared<DiscreteFlowDenoiser>(shift);
|
||||
} else if (is_using_v_parameterization) {
|
||||
LOG_INFO("running in v-prediction mode");
|
||||
denoiser = std::make_shared<CompVisVDenoiser>();
|
||||
} else if (is_using_edm_v_parameterization) {
|
||||
LOG_INFO("running in v-prediction EDM mode");
|
||||
denoiser = std::make_shared<EDMVDenoiser>();
|
||||
} else {
|
||||
LOG_INFO("running in eps-prediction mode");
|
||||
}
|
||||
}
|
||||
|
||||
auto comp_vis_denoiser = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser);
|
||||
|
|
@ -1281,7 +1319,7 @@ public:
|
|||
std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
|
||||
|
||||
float cfg_scale = guidance.txt_cfg;
|
||||
float img_cfg_scale = guidance.img_cfg;
|
||||
float img_cfg_scale = isfinite(guidance.img_cfg) ? guidance.img_cfg : guidance.txt_cfg;
|
||||
float slg_scale = guidance.slg.scale;
|
||||
|
||||
if (img_cfg_scale != cfg_scale && !sd_version_is_inpaint_or_unet_edit(version)) {
|
||||
|
|
@ -1325,11 +1363,12 @@ public:
|
|||
}
|
||||
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
|
||||
|
||||
int64_t t0 = ggml_time_us();
|
||||
|
||||
auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
|
||||
if (step == 1) {
|
||||
if (step == 1 || step == -1) {
|
||||
pretty_progress(0, (int)steps, 0);
|
||||
}
|
||||
int64_t t0 = ggml_time_us();
|
||||
|
||||
std::vector<float> scaling = denoiser->get_scalings(sigma);
|
||||
GGML_ASSERT(scaling.size() == 3);
|
||||
|
|
@ -1483,8 +1522,9 @@ public:
|
|||
}
|
||||
|
||||
int64_t t1 = ggml_time_us();
|
||||
if (step > 0) {
|
||||
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
|
||||
if (step > 0 || step == -(int)steps) {
|
||||
int showstep = std::abs(step);
|
||||
pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
|
||||
// LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
|
||||
}
|
||||
if (denoise_mask != nullptr) {
|
||||
|
|
@ -1625,19 +1665,19 @@ public:
|
|||
if (vae_tiling_params.enabled && !encode_video) {
|
||||
// TODO wan2.2 vae support?
|
||||
int C = sd_version_is_dit(version) ? 16 : 4;
|
||||
int NE2, NE3;
|
||||
int ne2;
|
||||
int ne3;
|
||||
if (sd_version_is_qwen_image(version)) {
|
||||
NE2 = x->ne[3];
|
||||
NE3 = C;
|
||||
}
|
||||
else {
|
||||
ne2 = 1;
|
||||
ne3 = C * x->ne[3];
|
||||
} else {
|
||||
if (!use_tiny_autoencoder) {
|
||||
C *= 2;
|
||||
}
|
||||
NE2 = C;
|
||||
NE3 = x->ne[3];
|
||||
ne2 = C;
|
||||
ne3 = x->ne[3];
|
||||
}
|
||||
result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, NE2, NE3);
|
||||
result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
|
||||
}
|
||||
|
||||
if (sd_version_is_qwen_image(version)) {
|
||||
|
|
@ -1911,6 +1951,31 @@ enum scheduler_t str_to_schedule(const char* str) {
|
|||
return SCHEDULE_COUNT;
|
||||
}
|
||||
|
||||
const char* prediction_to_str[] = {
|
||||
"default",
|
||||
"eps",
|
||||
"v",
|
||||
"edm_v",
|
||||
"sd3_flow",
|
||||
"flux_flow",
|
||||
};
|
||||
|
||||
const char* sd_prediction_name(enum prediction_t prediction) {
|
||||
if (prediction < PREDICTION_COUNT) {
|
||||
return prediction_to_str[prediction];
|
||||
}
|
||||
return NONE_STR;
|
||||
}
|
||||
|
||||
enum prediction_t str_to_prediction(const char* str) {
|
||||
for (int i = 0; i < PREDICTION_COUNT; i++) {
|
||||
if (!strcmp(str, prediction_to_str[i])) {
|
||||
return (enum prediction_t)i;
|
||||
}
|
||||
}
|
||||
return PREDICTION_COUNT;
|
||||
}
|
||||
|
||||
void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
|
||||
*sd_ctx_params = {};
|
||||
sd_ctx_params->vae_decode_only = true;
|
||||
|
|
@ -1918,6 +1983,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
|
|||
sd_ctx_params->n_threads = sd_get_num_physical_cores();
|
||||
sd_ctx_params->wtype = SD_TYPE_COUNT;
|
||||
sd_ctx_params->rng_type = CUDA_RNG;
|
||||
sd_ctx_params->prediction = DEFAULT_PRED;
|
||||
sd_ctx_params->offload_params_to_cpu = false;
|
||||
sd_ctx_params->keep_clip_on_cpu = false;
|
||||
sd_ctx_params->keep_control_net_on_cpu = false;
|
||||
|
|
@ -1957,6 +2023,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||
"n_threads: %d\n"
|
||||
"wtype: %s\n"
|
||||
"rng_type: %s\n"
|
||||
"prediction: %s\n"
|
||||
"offload_params_to_cpu: %s\n"
|
||||
"keep_clip_on_cpu: %s\n"
|
||||
"keep_control_net_on_cpu: %s\n"
|
||||
|
|
@ -1985,6 +2052,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
|
|||
sd_ctx_params->n_threads,
|
||||
sd_type_name(sd_ctx_params->wtype),
|
||||
sd_rng_type_name(sd_ctx_params->rng_type),
|
||||
sd_prediction_name(sd_ctx_params->prediction),
|
||||
BOOL_STR(sd_ctx_params->offload_params_to_cpu),
|
||||
BOOL_STR(sd_ctx_params->keep_clip_on_cpu),
|
||||
BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
|
||||
|
|
@ -2031,7 +2099,9 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
|
|||
"eta: %.2f, "
|
||||
"shifted_timestep: %d)",
|
||||
sample_params->guidance.txt_cfg,
|
||||
sample_params->guidance.img_cfg,
|
||||
isfinite(sample_params->guidance.img_cfg)
|
||||
? sample_params->guidance.img_cfg
|
||||
: sample_params->guidance.txt_cfg,
|
||||
sample_params->guidance.distilled_guidance,
|
||||
sample_params->guidance.slg.layer_count,
|
||||
sample_params->guidance.slg.layer_start,
|
||||
|
|
@ -2193,6 +2263,10 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
|
|||
seed = rand();
|
||||
}
|
||||
|
||||
if (!isfinite(guidance.img_cfg)) {
|
||||
guidance.img_cfg = guidance.txt_cfg;
|
||||
}
|
||||
|
||||
// for (auto v : sigmas) {
|
||||
// std::cout << v << " ";
|
||||
// }
|
||||
|
|
|
|||
|
|
@ -64,6 +64,16 @@ enum scheduler_t {
|
|||
SCHEDULE_COUNT
|
||||
};
|
||||
|
||||
enum prediction_t {
|
||||
DEFAULT_PRED,
|
||||
EPS_PRED,
|
||||
V_PRED,
|
||||
EDM_V_PRED,
|
||||
SD3_FLOW_PRED,
|
||||
FLUX_FLOW_PRED,
|
||||
PREDICTION_COUNT
|
||||
};
|
||||
|
||||
// same as enum ggml_type
|
||||
enum sd_type_t {
|
||||
SD_TYPE_F32 = 0,
|
||||
|
|
@ -146,6 +156,7 @@ typedef struct {
|
|||
int n_threads;
|
||||
enum sd_type_t wtype;
|
||||
enum rng_type_t rng_type;
|
||||
enum prediction_t prediction;
|
||||
bool offload_params_to_cpu;
|
||||
bool keep_clip_on_cpu;
|
||||
bool keep_control_net_on_cpu;
|
||||
|
|
@ -255,6 +266,8 @@ SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
|
|||
SD_API enum sample_method_t str_to_sample_method(const char* str);
|
||||
SD_API const char* sd_schedule_name(enum scheduler_t scheduler);
|
||||
SD_API enum scheduler_t str_to_schedule(const char* str);
|
||||
SD_API const char* sd_prediction_name(enum prediction_t prediction);
|
||||
SD_API enum prediction_t str_to_prediction(const char* str);
|
||||
|
||||
SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
|
||||
SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
|
||||
|
|
@ -285,6 +298,8 @@ SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx,
|
|||
sd_image_t input_image,
|
||||
uint32_t upscale_factor);
|
||||
|
||||
SD_API int get_upscale_factor(upscaler_ctx_t* upscaler_ctx);
|
||||
|
||||
SD_API bool convert(const char* input_path,
|
||||
const char* vae_path,
|
||||
const char* output_path,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue