mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00

updated sdcpp, also set euler as default sampler

This commit is contained in:
parent e93c2427b4
commit 2ba5949054

27 changed files with 1514 additions and 521 deletions

otherarch/sdcpp/stable-diffusion.cpp
@@ -28,11 +28,8 @@ const char* model_version_to_str[] = {
     "SD 2.x",
     "SDXL",
     "SVD",
-    "SD3 2B",
-    "Flux Dev",
-    "Flux Schnell",
-    "SD3.5 8B",
-    "SD3.5 2B"};
+    "SD3.x",
+    "Flux"};
 
 const char* sampling_methods_str[] = {
     "Euler A",
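
The renamed version strings above reflect the new family-level helpers (sd_version_is_sd3, sd_version_is_flux, sd_version_is_dit) that the rest of this diff substitutes for long equality chains. A minimal sketch of what those predicates presumably expand to, inferred from the conditions they replace in the hunks below (the enum names come from the removed code and are not verified against the new headers):

    // Hedged sketch: family predicates over the SDVersion enum, matching
    // the OR-chains they replace throughout this diff.
    static inline bool sd_version_is_sd3(SDVersion version) {
        return version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B;
    }
    static inline bool sd_version_is_flux(SDVersion version) {
        return version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL;
    }
    // DiT-family models (SD3 + Flux) share the T5 conditioner and SLG support.
    static inline bool sd_version_is_dit(SDVersion version) {
        return sd_version_is_sd3(version) || sd_version_is_flux(version);
    }
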
@@ -93,6 +90,7 @@ public:
     std::shared_ptr<ControlNet> control_net;
     std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
     std::shared_ptr<LoraModel> pmid_lora;
+    std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
 
     std::string taesd_path;
     bool use_tiny_autoencoder = false;
@@ -153,7 +151,8 @@ public:
                        schedule_t schedule,
                        bool clip_on_cpu,
                        bool control_net_cpu,
-                       bool vae_on_cpu) {
+                       bool vae_on_cpu,
+                       bool diffusion_flash_attn) {
         use_tiny_autoencoder = taesd_path.size() > 0;
         std::string taesd_path_fixed = taesd_path;
 #ifdef SD_USE_CUBLAS
@@ -166,7 +165,12 @@ public:
 #endif
 #ifdef SD_USE_VULKAN
         LOG_DEBUG("Using Vulkan backend");
-        backend = ggml_backend_vk_init(0);
+        for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
+            backend = ggml_backend_vk_init(device);
+        }
+        if (!backend) {
+            LOG_WARN("Failed to initialize Vulkan backend");
+        }
 #endif
 #ifdef SD_USE_SYCL
         LOG_DEBUG("Using SYCL backend");
@@ -177,13 +181,7 @@ public:
             LOG_DEBUG("Using CPU backend");
             backend = ggml_backend_cpu_init();
         }
-#ifdef SD_USE_FLASH_ATTENTION
-#if defined(SD_USE_CUBLAS) || defined(SD_USE_METAL) || defined(SD_USE_SYCL) || defined(SD_USE_VULKAN)
-        LOG_WARN("Flash Attention not supported with GPU Backend");
-#else
-        LOG_INFO("Flash Attention enabled");
-#endif
-#endif
 
         ModelLoader model_loader;
 
         vae_tiling = vae_tiling_;
@@ -285,16 +283,18 @@ public:
             conditioner_wtype = wtype;
             diffusion_model_wtype = wtype;
             vae_wtype = wtype;
+            model_loader.set_wtype_override(wtype);
         }
 
         if (version == VERSION_SDXL) {
             vae_wtype = GGML_TYPE_F32;
+            model_loader.set_wtype_override(GGML_TYPE_F32, "vae.");
         }
 
-        LOG_INFO("Weight type: %s", ggml_type_name(model_wtype));
-        LOG_INFO("Conditioner weight type: %s", ggml_type_name(conditioner_wtype));
-        LOG_INFO("Diffusion model weight type: %s", ggml_type_name(diffusion_model_wtype));
-        LOG_INFO("VAE weight type: %s", ggml_type_name(vae_wtype));
+        LOG_INFO("Weight type: %s", model_wtype != SD_TYPE_COUNT ? ggml_type_name(model_wtype) : "??");
+        LOG_INFO("Conditioner weight type: %s", conditioner_wtype != SD_TYPE_COUNT ? ggml_type_name(conditioner_wtype) : "??");
+        LOG_INFO("Diffusion model weight type: %s", diffusion_model_wtype != SD_TYPE_COUNT ? ggml_type_name(diffusion_model_wtype) : "??");
+        LOG_INFO("VAE weight type: %s", vae_wtype != SD_TYPE_COUNT ? ggml_type_name(vae_wtype) : "??");
 
         LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
 
@@ -307,30 +307,30 @@ public:
                          "try specifying SDXL VAE FP16 Fix with the --vae parameter. "
                          "You can find it here: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors");
             }
-        } else if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
+        } else if (sd_version_is_sd3(version)) {
             scale_factor = 1.5305f;
-        } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
+        } else if (sd_version_is_flux(version)) {
             scale_factor = 0.3611;
             // TODO: shift_factor
         }
 
         if (version == VERSION_SVD) {
-            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, conditioner_wtype);
+            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types);
             clip_vision->alloc_params_buffer();
             clip_vision->get_param_tensors(tensors);
 
-            diffusion_model = std::make_shared<UNetModel>(backend, diffusion_model_wtype, version);
+            diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version);
             diffusion_model->alloc_params_buffer();
             diffusion_model->get_param_tensors(tensors);
 
-            first_stage_model = std::make_shared<AutoEncoderKL>(backend, vae_wtype, vae_decode_only, true, version);
+            first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, true, version);
             LOG_DEBUG("vae_decode_only %d", vae_decode_only);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else {
             clip_backend = backend;
             bool use_t5xxl = false;
-            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B || version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
+            if (sd_version_is_dit(version)) {
                 use_t5xxl = true;
             }
             if (!ggml_backend_is_cpu(backend) && use_t5xxl && conditioner_wtype != GGML_TYPE_F32) {
@@ -341,16 +341,27 @@ public:
                 LOG_INFO("CLIP: Using CPU backend");
                 clip_backend = ggml_backend_cpu_init();
             }
-            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
-                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, conditioner_wtype);
-                diffusion_model = std::make_shared<MMDiTModel>(backend, diffusion_model_wtype, version);
-            } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
-                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, conditioner_wtype);
-                diffusion_model = std::make_shared<FluxModel>(backend, diffusion_model_wtype, version);
-            } else {
-                cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, conditioner_wtype, embeddings_path, version);
-                diffusion_model = std::make_shared<UNetModel>(backend, diffusion_model_wtype, version);
+            if (diffusion_flash_attn) {
+                LOG_INFO("Using flash attention in the diffusion model");
+            }
+            if (sd_version_is_sd3(version)) {
+                if (diffusion_flash_attn) {
+                    LOG_WARN("flash attention in this diffusion model is currently unsupported!");
+                }
+                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                diffusion_model = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
+            } else if (sd_version_is_flux(version)) {
+                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, diffusion_flash_attn);
+            } else {
+                if (id_embeddings_path.find("v2") != std::string::npos) {
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
+                } else {
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version);
+                }
+                diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
             }
 
             cond_stage_model->alloc_params_buffer();
             cond_stage_model->get_param_tensors(tensors);
 
@@ -364,11 +375,11 @@ public:
             } else {
                 vae_backend = backend;
             }
-            first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, vae_wtype, vae_decode_only, false, version);
+            first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         } else {
-            tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, vae_wtype, vae_decode_only);
+            tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only);
         }
         // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
 
@@ -380,12 +391,17 @@ public:
             } else {
                 controlnet_backend = backend;
             }
-            control_net = std::make_shared<ControlNet>(controlnet_backend, diffusion_model_wtype, version);
+            control_net = std::make_shared<ControlNet>(controlnet_backend, model_loader.tensor_storages_types, version);
         }
 
-        pmid_model = std::make_shared<PhotoMakerIDEncoder>(clip_backend, model_wtype, version);
+        if (id_embeddings_path.find("v2") != std::string::npos) {
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version, PM_VERSION_2);
+            LOG_INFO("using PhotoMaker Version 2");
+        } else {
+            pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend, model_loader.tensor_storages_types, "pmid", version);
+        }
         if (id_embeddings_path.size() > 0) {
-            pmid_lora = std::make_shared<LoraModel>(backend, model_wtype, id_embeddings_path, "");
+            pmid_lora = std::make_shared<LoraModel>(backend, id_embeddings_path, "");
             if (!pmid_lora->load_from_file(true)) {
                 LOG_WARN("load photomaker lora tensors from %s failed", id_embeddings_path.c_str());
                 return false;
@@ -402,14 +418,8 @@ public:
                 LOG_ERROR(" pmid model params buffer allocation failed");
                 return false;
             }
-            // LOG_INFO("pmid param memory buffer size = %.2fMB ",
-            //          pmid_model->params_buffer_size / 1024.0 / 1024.0);
             pmid_model->get_param_tensors(tensors, "pmid");
         }
-        // if(stacked_id){
-        //     pmid_model.init_params(GGML_TYPE_F32);
-        //     pmid_model.map_by_name(tensors, "pmid.");
-        // }
     }
 
     struct ggml_init_params params;
@@ -539,14 +549,17 @@ public:
             is_using_v_parameterization = true;
         }
 
-        if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
+        if (sd_version_is_sd3(version)) {
             LOG_INFO("running in FLOW mode");
             denoiser = std::make_shared<DiscreteFlowDenoiser>();
-        } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
+        } else if (sd_version_is_flux(version)) {
             LOG_INFO("running in Flux FLOW mode");
-            float shift = 1.15f;
-            if (version == VERSION_FLUX_SCHNELL) {
-                shift = 1.0f;  // TODO: validate
-            }
+            float shift = 1.0f;  // TODO: validate
+            for (auto pair : model_loader.tensor_storages_types) {
+                if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
+                    shift = 1.15f;
+                    break;
+                }
+            }
             denoiser = std::make_shared<FluxFlowDenoiser>(shift);
         } else if (is_using_v_parameterization) {
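
Two things are worth noting in the shift detection above. The tensor model.diffusion_model.guidance_in.in_layer.weight only exists in guidance-distilled Flux checkpoints (Flux Dev), so probing the loaded tensor names distinguishes Dev (shift 1.15) from Schnell (shift 1.0) without trusting the file name. The shift itself feeds FluxFlowDenoiser's sigma schedule; a hedged sketch of the conventional flow-matching shift it presumably applies (the standard Flux/SD3 formula, assumed rather than read out of this tree):

    // Hedged sketch: sigma shift used by flow-matching samplers.
    // shift = 1.0 leaves the schedule untouched; larger values spend
    // more of the step budget at high noise levels.
    float flow_shift_sigma(float sigma, float shift) {
        return shift * sigma / (1.0f + (shift - 1.0f) * sigma);
    }
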
@@ -647,7 +660,7 @@ public:
             LOG_WARN("can not find %s for lora %s", st_file_path.c_str(), lora_path.c_str());
             return;
         }
-        LoraModel lora(backend, model_wtype, file_path);
+        LoraModel lora(backend, file_path);
         if (!lora.load_from_file()) {
             LOG_WARN("load lora tensors from %s failed", file_path.c_str());
             return;
@@ -677,7 +690,7 @@ public:
             LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
             return;
         }
-        LoraModel lora(backend, model_wtype, file_path);
+        LoraModel lora(backend, file_path);
         if (!lora.load_from_file()) {
             LOG_WARN("load lora tensors from %s failed", file_path.c_str());
             return;
@@ -724,10 +737,10 @@ public:
     ggml_tensor* id_encoder(ggml_context* work_ctx,
                             ggml_tensor* init_img,
                             ggml_tensor* prompts_embeds,
+                            ggml_tensor* id_embeds,
                             std::vector<bool>& class_tokens_mask) {
         ggml_tensor* res = NULL;
-        pmid_model->compute(n_threads, init_img, prompts_embeds, class_tokens_mask, &res, work_ctx);
-
+        pmid_model->compute(n_threads, init_img, prompts_embeds, id_embeds, class_tokens_mask, &res, work_ctx);
         return res;
     }
 
@@ -822,7 +835,11 @@ public:
                         sample_method_t method,
                         const std::vector<float>& sigmas,
                         int start_merge_step,
-                        SDCondition id_cond) {
+                        SDCondition id_cond,
+                        std::vector<int> skip_layers = {},
+                        float slg_scale = 0,
+                        float skip_layer_start = 0.01,
+                        float skip_layer_end = 0.2) {
         size_t steps = sigmas.size() - 1;
         // noise = load_tensor_from_file(work_ctx, "./rand0.bin");
         // print_ggml_tensor(noise);
@@ -833,13 +850,24 @@ public:
         struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);
 
         bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
+        bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0;
 
         // denoise wrapper
         struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
         struct ggml_tensor* out_uncond = NULL;
+        struct ggml_tensor* out_skip = NULL;
+
         if (has_unconditioned) {
             out_uncond = ggml_dup_tensor(work_ctx, x);
         }
+        if (has_skiplayer) {
+            if (sd_version_is_dit(version)) {
+                out_skip = ggml_dup_tensor(work_ctx, x);
+            } else {
+                has_skiplayer = false;
+                LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
+            }
+        }
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@@ -920,6 +948,28 @@ public:
                                          &out_uncond);
                 negative_data = (float*)out_uncond->data;
             }
+
+            int step_count = sigmas.size();
+            bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count);
+            float* skip_layer_data = NULL;
+            if (is_skiplayer_step) {
+                LOG_DEBUG("Skipping layers at step %d\n", step);
+                // skip layer (same as conditionned)
+                diffusion_model->compute(n_threads,
+                                         noised_input,
+                                         timesteps,
+                                         cond.c_crossattn,
+                                         cond.c_concat,
+                                         cond.c_vector,
+                                         guidance_tensor,
+                                         -1,
+                                         controls,
+                                         control_strength,
+                                         &out_skip,
+                                         NULL,
+                                         skip_layers);
+                skip_layer_data = (float*)out_skip->data;
+            }
             float* vec_denoised = (float*)denoised->data;
             float* vec_input = (float*)input->data;
             float* positive_data = (float*)out_cond->data;
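
The gating arithmetic above is easy to sanity-check with concrete numbers (illustrative values, not defaults pulled from elsewhere in the tree):

    // Worked example of the skip-layer window:
    int step_count = 20;              // sigmas.size()
    float skip_layer_start = 0.01f;   // defaults from the new signature
    float skip_layer_end = 0.2f;
    // step > (int)(0.01f * 20) == 0   and   step < (int)(0.2f * 20) == 4
    // => the extra skip-layer forward pass runs only at steps 1, 2 and 3,
    //    costing one additional diffusion evaluation per gated step.
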
@@ -936,6 +986,9 @@ public:
                     latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
                 }
             }
+            if (is_skiplayer_step) {
+                latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
+            }
             // v = latent_result, eps = latent_result
             // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
             vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
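
Read together with the CFG branch just above it, the per-element update now composes two corrections; a hedged scalar restatement of what the loop computes (function and parameter names invented for clarity, not from the source):

    // latent = uncond + cfg_scale * (cond - uncond)     // classifier-free guidance
    //        + slg_scale * (cond - cond_skip_layers)    // skip-layer guidance
    float guided_latent(float cond, float uncond, float skip,
                        float cfg_scale, float slg_scale,
                        bool has_uncond, bool is_slg_step) {
        float r = has_uncond ? uncond + cfg_scale * (cond - uncond) : cond;
        if (is_slg_step) {
            r += (cond - skip) * slg_scale;
        }
        return r;
    }
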
@@ -999,9 +1052,9 @@ public:
         if (use_tiny_autoencoder) {
             C = 4;
         } else {
-            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
+            if (sd_version_is_sd3(version)) {
                 C = 32;
-            } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) {
+            } else if (sd_version_is_flux(version)) {
                 C = 32;
             }
         }
@@ -1096,7 +1149,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                     enum schedule_t s,
                     bool keep_clip_on_cpu,
                     bool keep_control_net_cpu,
-                    bool keep_vae_on_cpu) {
+                    bool keep_vae_on_cpu,
+                    bool diffusion_flash_attn) {
     sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
     if (sd_ctx == NULL) {
         return NULL;
@@ -1137,7 +1191,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                                    s,
                                    keep_clip_on_cpu,
                                    keep_control_net_cpu,
-                                   keep_vae_on_cpu)) {
+                                   keep_vae_on_cpu,
+                                   diffusion_flash_attn)) {
         delete sd_ctx->sd;
         sd_ctx->sd = NULL;
         free(sd_ctx);
@@ -1172,7 +1227,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                           float control_strength,
                           float style_ratio,
                           bool normalize_input,
-                          std::string input_id_images_path) {
+                          std::string input_id_images_path,
+                          std::vector<int> skip_layers = {},
+                          float slg_scale = 0,
+                          float skip_layer_start = 0.01,
+                          float skip_layer_end = 0.2) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1228,11 +1287,15 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     }
     // preprocess input id images
     std::vector<sd_image_t*> input_id_images;
+    bool pmv2 = sd_ctx->sd->pmid_model->get_version() == PM_VERSION_2;
     if (sd_ctx->sd->pmid_model && input_id_images_path.size() > 0) {
         std::vector<std::string> img_files = get_files_from_dir(input_id_images_path);
         for (std::string img_file : img_files) {
             int c = 0;
             int width, height;
+            if (ends_with(img_file, "safetensors")) {
+                continue;
+            }
             uint8_t* input_image_buffer = stbi_load(img_file.c_str(), &width, &height, &c, 3);
             if (input_image_buffer == NULL) {
                 LOG_ERROR("PhotoMaker load image from '%s' failed", img_file.c_str());
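
The new safetensors filter keeps PhotoMaker's image scan from tripping over a v2 id_embeds dump sitting in the same folder. The ends_with helper it relies on already exists in this codebase (it is called in the hunk above); an equivalent sketch for reference, illustrative rather than the verbatim implementation:

    // Hedged sketch of an ends_with() equivalent:
    static bool ends_with(const std::string& s, const std::string& suffix) {
        return s.size() >= suffix.size() &&
               s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
    }
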
@@ -1270,20 +1333,25 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
             else
                 sd_mul_images_to_tensor(init_image->data, init_img, i, NULL, NULL);
         }
-        t0 = ggml_time_ms();
-        auto cond_tup = sd_ctx->sd->cond_stage_model->get_learned_condition_with_trigger(work_ctx,
-                                                                                         sd_ctx->sd->n_threads, prompt,
-                                                                                         clip_skip,
-                                                                                         width,
-                                                                                         height,
-                                                                                         num_input_images,
-                                                                                         sd_ctx->sd->diffusion_model->get_adm_in_channels());
-        id_cond = std::get<0>(cond_tup);
-        class_tokens_mask = std::get<1>(cond_tup);  //
-
-        id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, class_tokens_mask);
+        t0 = ggml_time_ms();
+        auto cond_tup = sd_ctx->sd->cond_stage_model->get_learned_condition_with_trigger(work_ctx,
+                                                                                         sd_ctx->sd->n_threads, prompt,
+                                                                                         clip_skip,
+                                                                                         width,
+                                                                                         height,
+                                                                                         num_input_images,
+                                                                                         sd_ctx->sd->diffusion_model->get_adm_in_channels());
+        id_cond = std::get<0>(cond_tup);
+        class_tokens_mask = std::get<1>(cond_tup);  //
+        struct ggml_tensor* id_embeds = NULL;
+        if (pmv2) {
+            // id_embeds = sd_ctx->sd->pmid_id_embeds->get();
+            id_embeds = load_tensor_from_file(work_ctx, path_join(input_id_images_path, "id_embeds.bin"));
+            // print_ggml_tensor(id_embeds, true, "id_embeds:");
+        }
+        id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask);
         t1 = ggml_time_ms();
-        LOG_INFO("Photomaker ID Stacking, taking %d ms", t1 - t0);
+        LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
         if (sd_ctx->sd->free_params_immediately) {
             sd_ctx->sd->pmid_model->free_params_buffer();
         }
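
The format-string change above is a genuine portability fix: t1 - t0 is an int64_t, and printing it with %d is undefined behavior wherever int is 32-bit. A minimal self-contained illustration of the PRId64 idiom (a generic example, not the project's logger):

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main() {
        std::int64_t elapsed_ms = 1234;
        // "%" PRId64 expands to the correct conversion ("ld" or "lld") per platform.
        std::printf("Photomaker ID Stacking, taking %" PRId64 " ms\n", elapsed_ms);
        return 0;
    }
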
@@ -1348,9 +1416,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     // Sample
     std::vector<struct ggml_tensor*> final_latents;  // collect latents to decode
     int C = 4;
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
+    if (sd_version_is_sd3(sd_ctx->sd->version)) {
         C = 16;
-    } else if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
+    } else if (sd_version_is_flux(sd_ctx->sd->version)) {
         C = 16;
     }
     int W = width / 8;
@@ -1387,7 +1455,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                     sample_method,
                                                     sigmas,
                                                     start_merge_step,
-                                                    id_cond);
+                                                    id_cond,
+                                                    skip_layers,
+                                                    slg_scale,
+                                                    skip_layer_start,
+                                                    skip_layer_end);
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
         int64_t sampling_end = ggml_time_ms();
@@ -1453,7 +1525,13 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                    float control_strength,
                    float style_ratio,
                    bool normalize_input,
-                   const char* input_id_images_path_c_str) {
+                   const char* input_id_images_path_c_str,
+                   int* skip_layers = NULL,
+                   size_t skip_layers_count = 0,
+                   float slg_scale = 0,
+                   float skip_layer_start = 0.01,
+                   float skip_layer_end = 0.2) {
+    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
     LOG_DEBUG("txt2img %dx%d", width, height);
     if (sd_ctx == NULL) {
         return NULL;
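
A hedged call-site sketch for the new arguments. The layer list {7, 8, 9} with slg_scale 2.5 follows the commonly cited SD3.5-Medium recommendation; that pairing is an assumption here, nothing in this diff prescribes it:

    // Values a caller might append to an existing txt2img(...) call:
    int skip_layers[] = {7, 8, 9};                 // assumed SD3.5-Medium layers
    size_t skip_layers_count = sizeof(skip_layers) / sizeof(skip_layers[0]);
    float slg_scale = 2.5f;                        // assumed recommended strength
    float skip_layer_start = 0.01f;                // defaults from the signature above
    float skip_layer_end = 0.2f;
    // ...then pass these five values as the trailing txt2img arguments.
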
@@ -1461,10 +1539,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
 
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
+    if (sd_version_is_sd3(sd_ctx->sd->version)) {
         params.mem_size *= 3;
     }
-    if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
+    if (sd_version_is_flux(sd_ctx->sd->version)) {
         params.mem_size *= 4;
     }
     if (sd_ctx->sd->stacked_id) {
@@ -1487,17 +1565,17 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
     std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
 
     int C = 4;
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
+    if (sd_version_is_sd3(sd_ctx->sd->version)) {
         C = 16;
-    } else if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
+    } else if (sd_version_is_flux(sd_ctx->sd->version)) {
         C = 16;
     }
     int W = width / 8;
     int H = height / 8;
     ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
+    if (sd_version_is_sd3(sd_ctx->sd->version)) {
         ggml_set_f32(init_latent, 0.0609f);
-    } else if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
+    } else if (sd_version_is_flux(sd_ctx->sd->version)) {
         ggml_set_f32(init_latent, 0.1159f);
     } else {
         ggml_set_f32(init_latent, 0.f);
@@ -1521,7 +1599,11 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                                               control_strength,
                                               style_ratio,
                                               normalize_input,
-                                              input_id_images_path_c_str);
+                                              input_id_images_path_c_str,
+                                              skip_layers_vec,
+                                              slg_scale,
+                                              skip_layer_start,
+                                              skip_layer_end);
 
     size_t t1 = ggml_time_ms();
 
@@ -1548,7 +1630,13 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                    float control_strength,
                    float style_ratio,
                    bool normalize_input,
-                   const char* input_id_images_path_c_str) {
+                   const char* input_id_images_path_c_str,
+                   int* skip_layers = NULL,
+                   size_t skip_layers_count = 0,
+                   float slg_scale = 0,
+                   float skip_layer_start = 0.01,
+                   float skip_layer_end = 0.2) {
+    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
     LOG_DEBUG("img2img %dx%d", width, height);
     if (sd_ctx == NULL) {
         return NULL;
@@ -1556,10 +1644,10 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
 
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-    if (sd_ctx->sd->version == VERSION_SD3_2B || sd_ctx->sd->version == VERSION_SD3_5_8B || sd_ctx->sd->version == VERSION_SD3_5_2B) {
+    if (sd_version_is_sd3(sd_ctx->sd->version)) {
         params.mem_size *= 2;
     }
-    if (sd_ctx->sd->version == VERSION_FLUX_DEV || sd_ctx->sd->version == VERSION_FLUX_SCHNELL) {
+    if (sd_version_is_flux(sd_ctx->sd->version)) {
         params.mem_size *= 3;
     }
     if (sd_ctx->sd->stacked_id) {
@@ -1622,7 +1710,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                               control_strength,
                                               style_ratio,
                                               normalize_input,
-                                              input_id_images_path_c_str);
+                                              input_id_images_path_c_str,
+                                              skip_layers_vec,
+                                              slg_scale,
+                                              skip_layer_start,
+                                              skip_layer_end);
 
     size_t t2 = ggml_time_ms();
 