mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-12 01:54:37 +00:00
added support for flux kontext
This commit is contained in:
parent
0bd648ffa4
commit
ed289227e5
6 changed files with 202 additions and 70 deletions
|
@ -16,6 +16,7 @@ struct DiffusionModel {
|
|||
int num_video_frames = -1,
|
||||
std::vector<struct ggml_tensor*> controls = {},
|
||||
float control_strength = 0.f,
|
||||
std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
|
||||
struct ggml_tensor** output = NULL,
|
||||
struct ggml_context* output_ctx = NULL,
|
||||
std::vector<int> skip_layers = std::vector<int>()) = 0;
|
||||
|
@ -71,6 +72,7 @@ struct UNetModel : public DiffusionModel {
|
|||
int num_video_frames = -1,
|
||||
std::vector<struct ggml_tensor*> controls = {},
|
||||
float control_strength = 0.f,
|
||||
std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
|
||||
struct ggml_tensor** output = NULL,
|
||||
struct ggml_context* output_ctx = NULL,
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
|
@ -121,6 +123,7 @@ struct MMDiTModel : public DiffusionModel {
|
|||
int num_video_frames = -1,
|
||||
std::vector<struct ggml_tensor*> controls = {},
|
||||
float control_strength = 0.f,
|
||||
std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
|
||||
struct ggml_tensor** output = NULL,
|
||||
struct ggml_context* output_ctx = NULL,
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
|
@ -172,10 +175,11 @@ struct FluxModel : public DiffusionModel {
|
|||
int num_video_frames = -1,
|
||||
std::vector<struct ggml_tensor*> controls = {},
|
||||
float control_strength = 0.f,
|
||||
std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
|
||||
struct ggml_tensor** output = NULL,
|
||||
struct ggml_context* output_ctx = NULL,
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, output, output_ctx, skip_layers);
|
||||
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, kontext_imgs, output, output_ctx, skip_layers);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -672,11 +672,11 @@ namespace Flux {
|
|||
}
|
||||
|
||||
// Generate IDs for image patches and text
|
||||
std::vector<std::vector<float>> gen_ids(int h, int w, int patch_size, int bs, int context_len) {
|
||||
std::vector<std::vector<float>> gen_ids(int h, int w, int patch_size, int index = 0) {
|
||||
int h_len = (h + (patch_size / 2)) / patch_size;
|
||||
int w_len = (w + (patch_size / 2)) / patch_size;
|
||||
|
||||
std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, 0.0));
|
||||
std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, (float)index));
|
||||
|
||||
std::vector<float> row_ids = linspace(0, h_len - 1, h_len);
|
||||
std::vector<float> col_ids = linspace(0, w_len - 1, w_len);
|
||||
|
@ -688,10 +688,22 @@ namespace Flux {
|
|||
}
|
||||
}
|
||||
|
||||
std::vector<std::vector<float>> img_ids_repeated(bs * img_ids.size(), std::vector<float>(3));
|
||||
for (int i = 0; i < bs; ++i) {
|
||||
for (int j = 0; j < img_ids.size(); ++j) {
|
||||
img_ids_repeated[i * img_ids.size() + j] = img_ids[j];
|
||||
return img_ids;
|
||||
}
|
||||
|
||||
// Generate positional embeddings
|
||||
std::vector<float> gen_pe(std::vector<struct ggml_tensor*> imgs, struct ggml_tensor* context, int patch_size, int theta, const std::vector<int>& axes_dim) {
|
||||
int context_len = context->ne[1];
|
||||
int bs = imgs[0]->ne[3];
|
||||
|
||||
std::vector<std::vector<float>> img_ids;
|
||||
for (int i = 0; i < imgs.size(); i++) {
|
||||
auto x = imgs[i];
|
||||
if (x) {
|
||||
int h = x->ne[1];
|
||||
int w = x->ne[0];
|
||||
std::vector<std::vector<float>> img_ids_i = gen_ids(h, w, patch_size, i);
|
||||
img_ids.insert(img_ids.end(), img_ids_i.begin(), img_ids_i.end());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -702,17 +714,10 @@ namespace Flux {
|
|||
ids[i * (context_len + img_ids.size()) + j] = txt_ids[j];
|
||||
}
|
||||
for (int j = 0; j < img_ids.size(); ++j) {
|
||||
ids[i * (context_len + img_ids.size()) + context_len + j] = img_ids_repeated[i * img_ids.size() + j];
|
||||
ids[i * (context_len + img_ids.size()) + context_len + j] = img_ids[j];
|
||||
}
|
||||
}
|
||||
|
||||
return ids;
|
||||
}
|
||||
|
||||
|
||||
// Generate positional embeddings
|
||||
std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, int theta, const std::vector<int>& axes_dim) {
|
||||
std::vector<std::vector<float>> ids = gen_ids(h, w, patch_size, bs, context_len);
|
||||
std::vector<std::vector<float>> trans_ids = transpose(ids);
|
||||
size_t pos_len = ids.size();
|
||||
int num_axes = axes_dim.size();
|
||||
|
@ -925,7 +930,7 @@ namespace Flux {
|
|||
}
|
||||
|
||||
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x,
|
||||
std::vector<struct ggml_tensor*> imgs,
|
||||
struct ggml_tensor* timestep,
|
||||
struct ggml_tensor* context,
|
||||
struct ggml_tensor* c_concat,
|
||||
|
@ -933,7 +938,8 @@ namespace Flux {
|
|||
struct ggml_tensor* guidance,
|
||||
struct ggml_tensor* pe,
|
||||
struct ggml_tensor* arange = NULL,
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
std::vector<int> skip_layers = std::vector<int>(),
|
||||
SDVersion version = VERSION_FLUX) {
|
||||
// Forward pass of DiT.
|
||||
// x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
|
||||
// timestep: (N,) tensor of diffusion timesteps
|
||||
|
@ -944,18 +950,31 @@ namespace Flux {
|
|||
// pe: (L, d_head/2, 2, 2)
|
||||
// return: (N, C, H, W)
|
||||
|
||||
auto x = imgs[0];
|
||||
GGML_ASSERT(x->ne[3] == 1);
|
||||
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t C = x->ne[2];
|
||||
int64_t patch_size = 2;
|
||||
int pad_h = (patch_size - H % patch_size) % patch_size;
|
||||
int pad_w = (patch_size - W % patch_size) % patch_size;
|
||||
x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w]
|
||||
// Padding needed to round each spatial axis up to a multiple of patch_size.
// ne[0] is the width (W) and ne[1] is the height (H), so pad_h must be derived
// from ne[1] and pad_w from ne[0]; these values size the unpatchify call below.
int pad_h = (patch_size - x->ne[1] % patch_size) % patch_size;
int pad_w = (patch_size - x->ne[0] % patch_size) % patch_size;
|
||||
|
||||
// img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
|
||||
auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size]
|
||||
ggml_tensor* img = NULL; // [N, h*w, C * patch_size * patch_size]
|
||||
int64_t patchified_img_size;
|
||||
// Pad and patchify every input image, then join the resulting token sequences
// along dim 1 so the model sees them as one sequence.
for (auto& x : imgs) {
    // ne[0] is the width and ne[1] is the height: pad_w comes from ne[0] and
    // pad_h from ne[1], matching the (pad_w, pad_h) order ggml_pad is called with.
    int pad_h = (patch_size - x->ne[1] % patch_size) % patch_size;
    int pad_w = (patch_size - x->ne[0] % patch_size) % patch_size;
    ggml_tensor* pad_x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);
    pad_x = patchify(ctx, pad_x, patch_size);
    if (img) {
        img = ggml_concat(ctx, img, pad_x, 1);
    } else {
        img = pad_x;
        // Token count of the first image; the model output is later cut back
        // to this many rows before unpatchify.
        patchified_img_size = img->ne[1];
    }
}
|
||||
|
||||
if (c_concat != NULL) {
|
||||
ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
|
||||
|
@ -971,6 +990,7 @@ namespace Flux {
|
|||
}
|
||||
|
||||
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, arange, skip_layers); // [N, h*w, C * patch_size * patch_size]
|
||||
out = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], patchified_img_size, out->nb[1], 0));
|
||||
|
||||
// rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
|
||||
out = unpatchify(ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size, patch_size); // [N, C, H + pad_h, W + pad_w]
|
||||
|
@ -1056,6 +1076,7 @@ namespace Flux {
|
|||
struct ggml_tensor* c_concat,
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
GGML_ASSERT(x->ne[3] == 1);
|
||||
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
|
||||
|
@ -1067,6 +1088,9 @@ namespace Flux {
|
|||
if (c_concat != NULL) {
|
||||
c_concat = to_backend(c_concat);
|
||||
}
|
||||
for (auto &img : kontext_imgs){
|
||||
img = to_backend(img);
|
||||
}
|
||||
if (flux_params.is_chroma) {
|
||||
const char* SD_CHROMA_ENABLE_GUIDANCE = getenv("SD_CHROMA_ENABLE_GUIDANCE");
|
||||
bool disable_guidance = true;
|
||||
|
@ -1107,8 +1131,10 @@ namespace Flux {
|
|||
if (flux_params.guidance_embed || flux_params.is_chroma) {
|
||||
guidance = to_backend(guidance);
|
||||
}
|
||||
auto imgs = kontext_imgs;
|
||||
imgs.insert(imgs.begin(), x);
|
||||
|
||||
pe_vec = flux.gen_pe(x->ne[1], x->ne[0], 2, x->ne[3], context->ne[1], flux_params.theta, flux_params.axes_dim);
|
||||
pe_vec = flux.gen_pe(imgs, context, 2, flux_params.theta, flux_params.axes_dim);
|
||||
int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
|
||||
// LOG_DEBUG("pos_len %d", pos_len);
|
||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
|
||||
|
@ -1118,7 +1144,7 @@ namespace Flux {
|
|||
set_backend_tensor_data(pe, pe_vec.data());
|
||||
|
||||
struct ggml_tensor* out = flux.forward(compute_ctx,
|
||||
x,
|
||||
imgs,
|
||||
timesteps,
|
||||
context,
|
||||
c_concat,
|
||||
|
@ -1140,6 +1166,7 @@ namespace Flux {
|
|||
struct ggml_tensor* c_concat,
|
||||
struct ggml_tensor* y,
|
||||
struct ggml_tensor* guidance,
|
||||
std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
|
||||
struct ggml_tensor** output = NULL,
|
||||
struct ggml_context* output_ctx = NULL,
|
||||
std::vector<int> skip_layers = std::vector<int>()) {
|
||||
|
@ -1149,7 +1176,7 @@ namespace Flux {
|
|||
// y: [N, adm_in_channels] or [1, adm_in_channels]
|
||||
// guidance: [N, ]
|
||||
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||
return build_graph(x, timesteps, context, c_concat, y, guidance, skip_layers);
|
||||
return build_graph(x, timesteps, context, c_concat, y, guidance, kontext_imgs, skip_layers);
|
||||
};
|
||||
|
||||
GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
|
||||
|
@ -1189,7 +1216,7 @@ namespace Flux {
|
|||
struct ggml_tensor* out = NULL;
|
||||
|
||||
int t0 = ggml_time_ms();
|
||||
compute(8, x, timesteps, context, NULL, y, guidance, &out, work_ctx);
|
||||
compute(8, x, timesteps, context, NULL, y, guidance, std::vector<struct ggml_tensor*>(), &out, work_ctx);
|
||||
int t1 = ggml_time_ms();
|
||||
|
||||
print_ggml_tensor(out);
|
||||
|
|
|
@ -87,6 +87,8 @@ struct SDParams {
|
|||
std::string mask_path;
|
||||
std::string control_image_path;
|
||||
|
||||
std::vector<std::string> kontext_image_paths;
|
||||
|
||||
std::string prompt;
|
||||
std::string negative_prompt;
|
||||
float min_cfg = 1.0f;
|
||||
|
@ -242,6 +244,7 @@ void print_usage(int argc, const char* argv[]) {
|
|||
printf(" --canny apply canny preprocessor (edge detection)\n");
|
||||
printf(" --color Colors the logging tags according to level\n");
|
||||
printf(" -v, --verbose print extra info\n");
|
||||
// The long option is spelled with a hyphen so the help text matches the
// "--kontext-img" spelling that parse_args actually accepts.
printf(" -ki, --kontext-img [PATH] Reference image for Flux Kontext models (can be used multiple times) \n");
|
||||
}
|
||||
|
||||
void parse_args(int argc, const char** argv, SDParams& params) {
|
||||
|
@ -626,6 +629,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
|
|||
break;
|
||||
}
|
||||
params.skip_layer_end = std::stof(argv[i]);
|
||||
} else if (arg == "-ki" || arg == "--kontext-img") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.kontext_image_paths.push_back(argv[i]);
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv);
|
||||
|
@ -818,8 +827,40 @@ int main(int argc, const char* argv[]) {
|
|||
fprintf(stderr, "SVD support is broken, do not use it!!!\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool vae_decode_only = true;
|
||||
|
||||
std::vector<sd_image_t> kontext_imgs;
|
||||
for (auto& path : params.kontext_image_paths) {
|
||||
vae_decode_only = false;
|
||||
int c = 0;
|
||||
int width = 0;
|
||||
int height = 0;
|
||||
uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3);
|
||||
if (image_buffer == NULL) {
|
||||
fprintf(stderr, "load image from '%s' failed\n", path.c_str());
|
||||
return 1;
|
||||
}
|
||||
if (c < 3) {
|
||||
fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
|
||||
free(image_buffer);
|
||||
return 1;
|
||||
}
|
||||
if (width <= 0) {
|
||||
fprintf(stderr, "error: the width of image must be greater than 0\n");
|
||||
free(image_buffer);
|
||||
return 1;
|
||||
}
|
||||
if (height <= 0) {
|
||||
fprintf(stderr, "error: the height of image must be greater than 0\n");
|
||||
free(image_buffer);
|
||||
return 1;
|
||||
}
|
||||
kontext_imgs.push_back({(uint32_t)width,
|
||||
(uint32_t)height,
|
||||
3,
|
||||
image_buffer});
|
||||
}
|
||||
|
||||
uint8_t* input_image_buffer = NULL;
|
||||
uint8_t* control_image_buffer = NULL;
|
||||
uint8_t* mask_image_buffer = NULL;
|
||||
|
@ -960,6 +1001,7 @@ int main(int argc, const char* argv[]) {
|
|||
params.style_ratio,
|
||||
params.normalize_input,
|
||||
params.input_id_images_path.c_str(),
|
||||
kontext_imgs.data(), kontext_imgs.size(),
|
||||
params.skip_layers.data(),
|
||||
params.skip_layers.size(),
|
||||
params.slg_scale,
|
||||
|
@ -1030,6 +1072,7 @@ int main(int argc, const char* argv[]) {
|
|||
params.style_ratio,
|
||||
params.normalize_input,
|
||||
params.input_id_images_path.c_str(),
|
||||
kontext_imgs.data(), kontext_imgs.size(),
|
||||
params.skip_layers.data(),
|
||||
params.skip_layers.size(),
|
||||
params.slg_scale,
|
||||
|
@ -1098,7 +1141,7 @@ int main(int argc, const char* argv[]) {
|
|||
continue;
|
||||
}
|
||||
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
|
||||
if(is_jpg) {
|
||||
if (is_jpg) {
|
||||
stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
|
||||
results[i].data, 90);
|
||||
printf("save result JPEG image to '%s'\n", final_image_path.c_str());
|
||||
|
|
|
@ -593,6 +593,12 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
|
|||
}
|
||||
}
|
||||
|
||||
std::vector<sd_image_t> kontext_imgs;
|
||||
if(extra_image_data!="" && loadedsdver==SDVersion::VERSION_FLUX && !sd_loaded_chroma())
|
||||
{
|
||||
kontext_imgs.push_back(extraimage_reference);
|
||||
}
|
||||
|
||||
if (sd_params->mode == TXT2IMG) {
|
||||
|
||||
if(!sd_is_quiet && sddebugmode==1)
|
||||
|
@ -631,6 +637,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
|
|||
sd_params->style_ratio,
|
||||
sd_params->normalize_input,
|
||||
sd_params->input_id_images_path.c_str(),
|
||||
kontext_imgs.data(), kontext_imgs.size(),
|
||||
sd_params->skip_layers.data(),
|
||||
sd_params->skip_layers.size(),
|
||||
sd_params->slg_scale,
|
||||
|
@ -755,6 +762,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
|
|||
sd_params->style_ratio,
|
||||
sd_params->normalize_input,
|
||||
sd_params->input_id_images_path.c_str(),
|
||||
kontext_imgs.data(), kontext_imgs.size(),
|
||||
sd_params->skip_layers.data(),
|
||||
sd_params->skip_layers.size(),
|
||||
sd_params->slg_scale,
|
||||
|
|
|
@ -678,7 +678,7 @@ public:
|
|||
|
||||
int64_t t0 = ggml_time_ms();
|
||||
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
|
||||
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out);
|
||||
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, std::vector<struct ggml_tensor*>(), &out);
|
||||
diffusion_model->free_compute_buffer();
|
||||
|
||||
double result = 0.f;
|
||||
|
@ -896,7 +896,8 @@ public:
|
|||
float slg_scale = 0,
|
||||
float skip_layer_start = 0.01,
|
||||
float skip_layer_end = 0.2,
|
||||
ggml_tensor* noise_mask = nullptr) {
|
||||
std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
|
||||
ggml_tensor* noise_mask = NULL) {
|
||||
LOG_DEBUG("Sample");
|
||||
struct ggml_init_params params;
|
||||
size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
|
||||
|
@ -982,6 +983,7 @@ public:
|
|||
-1,
|
||||
controls,
|
||||
control_strength,
|
||||
kontext_imgs,
|
||||
&out_cond);
|
||||
} else {
|
||||
diffusion_model->compute(n_threads,
|
||||
|
@ -994,6 +996,7 @@ public:
|
|||
-1,
|
||||
controls,
|
||||
control_strength,
|
||||
kontext_imgs,
|
||||
&out_cond);
|
||||
}
|
||||
|
||||
|
@ -1014,6 +1017,7 @@ public:
|
|||
-1,
|
||||
controls,
|
||||
control_strength,
|
||||
kontext_imgs,
|
||||
&out_uncond);
|
||||
negative_data = (float*)out_uncond->data;
|
||||
}
|
||||
|
@ -1034,6 +1038,7 @@ public:
|
|||
-1,
|
||||
controls,
|
||||
control_strength,
|
||||
kontext_imgs,
|
||||
&out_skip,
|
||||
NULL,
|
||||
skip_layers);
|
||||
|
@ -1411,6 +1416,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
|
|||
float style_ratio,
|
||||
bool normalize_input,
|
||||
std::string input_id_images_path,
|
||||
std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
|
||||
std::vector<int> skip_layers = {},
|
||||
float slg_scale = 0,
|
||||
float skip_layer_start = 0.01,
|
||||
|
@ -1707,6 +1713,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
|
|||
slg_scale,
|
||||
skip_layer_start,
|
||||
skip_layer_end,
|
||||
kontext_imgs,
|
||||
noise_mask);
|
||||
|
||||
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
|
||||
|
@ -1776,6 +1783,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
|
|||
float style_ratio,
|
||||
bool normalize_input,
|
||||
const char* input_id_images_path_c_str,
|
||||
sd_image_t* kontext_imgs,
|
||||
int kontext_img_count,
|
||||
int* skip_layers = NULL,
|
||||
size_t skip_layers_count = 0,
|
||||
float slg_scale = 0,
|
||||
|
@ -1835,6 +1844,22 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
|
|||
if (sd_version_is_inpaint(sd_ctx->sd->version)) {
|
||||
LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
|
||||
}
|
||||
std::vector<struct ggml_tensor*> kontext_latents = std::vector<struct ggml_tensor*>();
|
||||
if (kontext_imgs) {
|
||||
for (int i = 0; i < kontext_img_count; i++) {
|
||||
ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kontext_imgs[i].width, kontext_imgs[i].height, 3, 1);
|
||||
sd_image_to_tensor(kontext_imgs[i].data, img);
|
||||
|
||||
ggml_tensor* latent = NULL;
|
||||
if (!sd_ctx->sd->use_tiny_autoencoder) {
|
||||
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
|
||||
latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
|
||||
} else {
|
||||
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
|
||||
}
|
||||
kontext_latents.push_back(latent);
|
||||
}
|
||||
}
|
||||
|
||||
sd_image_t* result_images = generate_image(sd_ctx,
|
||||
work_ctx,
|
||||
|
@ -1856,6 +1881,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
|
|||
style_ratio,
|
||||
normalize_input,
|
||||
input_id_images_path_c_str,
|
||||
kontext_latents,
|
||||
skip_layers_vec,
|
||||
slg_scale,
|
||||
skip_layer_start,
|
||||
|
@ -1891,6 +1917,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
|
|||
float style_ratio,
|
||||
bool normalize_input,
|
||||
const char* input_id_images_path_c_str,
|
||||
sd_image_t* kontext_imgs,
|
||||
int kontext_img_count,
|
||||
int* skip_layers = NULL,
|
||||
size_t skip_layers_count = 0,
|
||||
float slg_scale = 0,
|
||||
|
@ -2006,6 +2034,23 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
|
|||
} else {
|
||||
init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
|
||||
}
|
||||
std::vector<struct ggml_tensor*> kontext_latents = std::vector<struct ggml_tensor*>();
|
||||
// Encode each Kontext reference image into a latent for the diffusion model.
if (kontext_imgs) {
    for (int i = 0; i < kontext_img_count; i++) {
        // Size the tensor from the reference image's own dimensions, as the
        // txt2img path does. `width`/`height` are presumably the generation
        // size and need not match the reference image (TODO confirm), in which
        // case copying its pixel buffer into a width x height tensor is wrong.
        ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kontext_imgs[i].width, kontext_imgs[i].height, 3, 1);
        sd_image_to_tensor(kontext_imgs[i].data, img);

        ggml_tensor* latent = NULL;
        if (!sd_ctx->sd->use_tiny_autoencoder) {
            // Full VAE path: encode_first_stage returns moments that still
            // have to be turned into a latent.
            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
            latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
        } else {
            // Tiny autoencoder path: the encoder output is used as the latent directly.
            latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
        }
        kontext_latents.push_back(latent);
    }
}
|
||||
|
||||
// print_ggml_tensor(init_latent, true);
|
||||
size_t t1 = ggml_time_ms();
|
||||
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
|
||||
|
@ -2038,6 +2083,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
|
|||
style_ratio,
|
||||
normalize_input,
|
||||
input_id_images_path_c_str,
|
||||
kontext_latents,
|
||||
skip_layers_vec,
|
||||
slg_scale,
|
||||
skip_layer_start,
|
||||
|
|
|
@ -176,6 +176,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
|
|||
float style_strength,
|
||||
bool normalize_input,
|
||||
const char* input_id_images_path,
|
||||
sd_image_t* kontext_imgs,
|
||||
int kontext_img_count,
|
||||
int* skip_layers,
|
||||
size_t skip_layers_count,
|
||||
float slg_scale,
|
||||
|
@ -204,6 +206,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
|
|||
float style_strength,
|
||||
bool normalize_input,
|
||||
const char* input_id_images_path,
|
||||
sd_image_t* kontext_imgs,
|
||||
int kontext_img_count,
|
||||
int* skip_layers,
|
||||
size_t skip_layers_count,
|
||||
float slg_scale,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue