added support for flux kontext

Concedo 2025-06-28 11:37:19 +08:00
parent 0bd648ffa4
commit ed289227e5
6 changed files with 202 additions and 70 deletions


@@ -678,7 +678,7 @@ public:
 int64_t t0 = ggml_time_ms();
 struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
-diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out);
+diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, std::vector<struct ggml_tensor*>(), &out);
 diffusion_model->free_compute_buffer();
 double result = 0.f;
@@ -892,11 +892,12 @@ public:
 const std::vector<float>& sigmas,
 int start_merge_step,
 SDCondition id_cond,
-std::vector<int> skip_layers = {},
-float slg_scale = 0,
-float skip_layer_start = 0.01,
-float skip_layer_end = 0.2,
-ggml_tensor* noise_mask = nullptr) {
+std::vector<int> skip_layers = {},
+float slg_scale = 0,
+float skip_layer_start = 0.01,
+float skip_layer_end = 0.2,
+std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+ggml_tensor* noise_mask = NULL) {
 LOG_DEBUG("Sample");
 struct ggml_init_params params;
 size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@@ -982,6 +983,7 @@ public:
 -1,
 controls,
 control_strength,
+kontext_imgs,
 &out_cond);
 } else {
 diffusion_model->compute(n_threads,
@@ -994,6 +996,7 @@ public:
 -1,
 controls,
 control_strength,
+kontext_imgs,
 &out_cond);
 }
@@ -1014,6 +1017,7 @@ public:
 -1,
 controls,
 control_strength,
+kontext_imgs,
 &out_uncond);
 negative_data = (float*)out_uncond->data;
 }
@@ -1034,6 +1038,7 @@ public:
 -1,
 controls,
 control_strength,
+kontext_imgs,
 &out_skip,
 NULL,
 skip_layers);
@@ -1411,11 +1416,12 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
 float style_ratio,
 bool normalize_input,
 std::string input_id_images_path,
-std::vector<int> skip_layers = {},
-float slg_scale = 0,
-float skip_layer_start = 0.01,
-float skip_layer_end = 0.2,
-ggml_tensor* masked_image = NULL,
+std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+std::vector<int> skip_layers = {},
+float slg_scale = 0,
+float skip_layer_start = 0.01,
+float skip_layer_end = 0.2,
+ggml_tensor* masked_image = NULL,
 const sd_image_t* photomaker_reference = nullptr) {
 if (seed < 0) {
 // Generally, when using the provided command line, the seed is always >0.
@@ -1707,6 +1713,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
 slg_scale,
 skip_layer_start,
 skip_layer_end,
+kontext_imgs,
 noise_mask);
 // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
@@ -1776,6 +1783,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
 float style_ratio,
 bool normalize_input,
 const char* input_id_images_path_c_str,
+sd_image_t* kontext_imgs,
+int kontext_img_count,
 int* skip_layers = NULL,
 size_t skip_layers_count = 0,
 float slg_scale = 0,
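
For context, here is a hypothetical caller-side helper (not part of this commit) showing how the two new txt2img()/img2img() arguments might be filled in. It assumes the sd_image_t layout from the public stable-diffusion.h header (width, height, channel, data) and stb_image for decoding, as used by the project's example code; the helper name is illustrative only.

// Hypothetical helper, not in this commit: load a kontext reference image
// from disk and append it to the array passed via the two new parameters.
#include <cstdint>
#include <vector>
#include "stable-diffusion.h"   // assumed public header providing sd_image_t
#include "stb_image.h"          // assumed available, as in the project examples

static bool load_kontext_image(const char* path, std::vector<sd_image_t>& out) {
    int w = 0, h = 0, c = 0;
    // Force 3 channels: the new code builds a (w, h, 3, 1) F32 tensor per image.
    uint8_t* pixels = stbi_load(path, &w, &h, &c, 3);
    if (pixels == NULL) {
        return false;
    }
    sd_image_t img;
    img.width   = (uint32_t)w;
    img.height  = (uint32_t)h;
    img.channel = 3;
    img.data    = pixels;  // caller frees with stbi_image_free() after generation
    out.push_back(img);
    return true;
}

// The vector is then handed to txt2img()/img2img() right after
// input_id_images_path_c_str:
//     refs.data()        -> sd_image_t* kontext_imgs
//     (int)refs.size()   -> int kontext_img_count
// Passing NULL / 0 keeps the previous behaviour (no kontext latents are encoded).
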
@@ -1835,6 +1844,22 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
 if (sd_version_is_inpaint(sd_ctx->sd->version)) {
 LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
 }
+std::vector<struct ggml_tensor*> kontext_latents = std::vector<struct ggml_tensor*>();
+if (kontext_imgs) {
+for (int i = 0; i < kontext_img_count; i++) {
+ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kontext_imgs[i].width, kontext_imgs[i].height, 3, 1);
+sd_image_to_tensor(kontext_imgs[i].data, img);
+ggml_tensor* latent = NULL;
+if (!sd_ctx->sd->use_tiny_autoencoder) {
+ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
+latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+} else {
+latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
+}
+kontext_latents.push_back(latent);
+}
+}
 sd_image_t* result_images = generate_image(sd_ctx,
 work_ctx,
@@ -1856,6 +1881,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
 style_ratio,
 normalize_input,
 input_id_images_path_c_str,
+kontext_latents,
 skip_layers_vec,
 slg_scale,
 skip_layer_start,
@@ -1891,6 +1917,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
 float style_ratio,
 bool normalize_input,
 const char* input_id_images_path_c_str,
+sd_image_t* kontext_imgs,
+int kontext_img_count,
 int* skip_layers = NULL,
 size_t skip_layers_count = 0,
 float slg_scale = 0,
@@ -2006,6 +2034,23 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
 } else {
 init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
 }
+std::vector<struct ggml_tensor*> kontext_latents = std::vector<struct ggml_tensor*>();
+if (kontext_imgs) {
+for (int i = 0; i < kontext_img_count; i++) {
+ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+sd_image_to_tensor(kontext_imgs[i].data, img);
+ggml_tensor* latent = NULL;
+if (!sd_ctx->sd->use_tiny_autoencoder) {
+ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
+latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+} else {
+latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
+}
+kontext_latents.push_back(latent);
+}
+}
 // print_ggml_tensor(init_latent, true);
 size_t t1 = ggml_time_ms();
 LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
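
The encoding loop added above is duplicated between txt2img() and img2img(); the only difference is that txt2img() sizes the tensor from each reference image while img2img() reuses the generation width/height. A possible shared helper, sketched here but not part of the commit (the helper name and the tensor_w/tensor_h convention are invented for illustration), could look like this:

// Hypothetical refactor sketch, not in this commit: one helper for both call sites.
static std::vector<struct ggml_tensor*> encode_kontext_imgs(sd_ctx_t* sd_ctx,
                                                            struct ggml_context* work_ctx,
                                                            sd_image_t* kontext_imgs,
                                                            int kontext_img_count,
                                                            int tensor_w,    // <= 0: use each image's own width
                                                            int tensor_h) {  // <= 0: use each image's own height
    std::vector<struct ggml_tensor*> latents;
    if (!kontext_imgs) {
        return latents;  // NULL means no kontext references were supplied
    }
    for (int i = 0; i < kontext_img_count; i++) {
        int w = tensor_w > 0 ? tensor_w : (int)kontext_imgs[i].width;
        int h = tensor_h > 0 ? tensor_h : (int)kontext_imgs[i].height;
        ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, w, h, 3, 1);
        sd_image_to_tensor(kontext_imgs[i].data, img);
        ggml_tensor* latent = NULL;
        if (!sd_ctx->sd->use_tiny_autoencoder) {
            // Regular VAE path: encode to moments, then derive the latent.
            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
            latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
        } else {
            // TAESD path: encode_first_stage() already returns the latent.
            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
        }
        latents.push_back(latent);
    }
    return latents;
}

// txt2img() would call encode_kontext_imgs(sd_ctx, work_ctx, kontext_imgs, kontext_img_count, 0, 0);
// img2img() would call encode_kontext_imgs(sd_ctx, work_ctx, kontext_imgs, kontext_img_count, width, height);
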
@@ -2038,6 +2083,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
 style_ratio,
 normalize_input,
 input_id_images_path_c_str,
+kontext_latents,
 skip_layers_vec,
 slg_scale,
 skip_layer_start,