Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 17:44:38 +00:00)

Commit a1175cf34f: merged leejet changes
Parent: b6edb79648
5 changed files with 296 additions and 176 deletions
@@ -13,13 +13,13 @@ struct DiffusionModel {
         struct ggml_tensor* c_concat,
         struct ggml_tensor* y,
         struct ggml_tensor* guidance,
-        int num_video_frames = -1,
-        std::vector<struct ggml_tensor*> controls = {},
-        float control_strength = 0.f,
-        std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+        std::vector<ggml_tensor*> ref_latents = {},
+        int num_video_frames = -1,
+        std::vector<struct ggml_tensor*> controls = {},
+        float control_strength = 0.f,
         struct ggml_tensor** output = NULL,
         struct ggml_context* output_ctx = NULL,
         std::vector<int> skip_layers = std::vector<int>()) = 0;
     virtual void alloc_params_buffer() = 0;
     virtual void free_params_buffer() = 0;
     virtual void free_compute_buffer() = 0;

@@ -69,13 +69,13 @@ struct UNetModel : public DiffusionModel {
         struct ggml_tensor* c_concat,
         struct ggml_tensor* y,
         struct ggml_tensor* guidance,
-        int num_video_frames = -1,
-        std::vector<struct ggml_tensor*> controls = {},
-        float control_strength = 0.f,
-        std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+        std::vector<ggml_tensor*> ref_latents = {},
+        int num_video_frames = -1,
+        std::vector<struct ggml_tensor*> controls = {},
+        float control_strength = 0.f,
         struct ggml_tensor** output = NULL,
         struct ggml_context* output_ctx = NULL,
         std::vector<int> skip_layers = std::vector<int>()) {
         (void)skip_layers;  // SLG doesn't work with UNet models
         return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
     }

@@ -120,13 +120,13 @@ struct MMDiTModel : public DiffusionModel {
         struct ggml_tensor* c_concat,
         struct ggml_tensor* y,
         struct ggml_tensor* guidance,
-        int num_video_frames = -1,
-        std::vector<struct ggml_tensor*> controls = {},
-        float control_strength = 0.f,
-        std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+        std::vector<ggml_tensor*> ref_latents = {},
+        int num_video_frames = -1,
+        std::vector<struct ggml_tensor*> controls = {},
+        float control_strength = 0.f,
         struct ggml_tensor** output = NULL,
         struct ggml_context* output_ctx = NULL,
         std::vector<int> skip_layers = std::vector<int>()) {
         return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
     }
 };

@@ -172,14 +172,14 @@ struct FluxModel : public DiffusionModel {
         struct ggml_tensor* c_concat,
         struct ggml_tensor* y,
         struct ggml_tensor* guidance,
-        int num_video_frames = -1,
-        std::vector<struct ggml_tensor*> controls = {},
-        float control_strength = 0.f,
-        std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+        std::vector<ggml_tensor*> ref_latents = {},
+        int num_video_frames = -1,
+        std::vector<struct ggml_tensor*> controls = {},
+        float control_strength = 0.f,
         struct ggml_tensor** output = NULL,
         struct ggml_context* output_ctx = NULL,
         std::vector<int> skip_layers = std::vector<int>()) {
-        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, kontext_imgs, output, output_ctx, skip_layers);
+        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, ref_latents, output, output_ctx, skip_layers);
     }
 };

@@ -672,51 +672,81 @@ namespace Flux {
         }

         // Generate IDs for image patches and text
-        std::vector<std::vector<float>> gen_ids(int h, int w, int patch_size, int index = 0) {
+        std::vector<std::vector<float>> gen_txt_ids(int bs, int context_len) {
+            return std::vector<std::vector<float>>(bs * context_len, std::vector<float>(3, 0.0));
+        }
+
+        std::vector<std::vector<float>> gen_img_ids(int h, int w, int patch_size, int bs, int index = 0, int h_offset = 0, int w_offset = 0) {
             int h_len = (h + (patch_size / 2)) / patch_size;
             int w_len = (w + (patch_size / 2)) / patch_size;

-            std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, (float)index));
+            std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, 0.0));

-            std::vector<float> row_ids = linspace(0, h_len - 1, h_len);
-            std::vector<float> col_ids = linspace(0, w_len - 1, w_len);
+            std::vector<float> row_ids = linspace(h_offset, h_len - 1 + h_offset, h_len);
+            std::vector<float> col_ids = linspace(w_offset, w_len - 1 + w_offset, w_len);

             for (int i = 0; i < h_len; ++i) {
                 for (int j = 0; j < w_len; ++j) {
+                    img_ids[i * w_len + j][0] = index;
                     img_ids[i * w_len + j][1] = row_ids[i];
                     img_ids[i * w_len + j][2] = col_ids[j];
                 }
             }

-            return img_ids;
+            std::vector<std::vector<float>> img_ids_repeated(bs * img_ids.size(), std::vector<float>(3));
+            for (int i = 0; i < bs; ++i) {
+                for (int j = 0; j < img_ids.size(); ++j) {
+                    img_ids_repeated[i * img_ids.size() + j] = img_ids[j];
+                }
+            }
+            return img_ids_repeated;
+        }
+
+        std::vector<std::vector<float>> concat_ids(const std::vector<std::vector<float>>& a,
+                                                   const std::vector<std::vector<float>>& b,
+                                                   int bs) {
+            size_t a_len = a.size() / bs;
+            size_t b_len = b.size() / bs;
+            std::vector<std::vector<float>> ids(a.size() + b.size(), std::vector<float>(3));
+            for (int i = 0; i < bs; ++i) {
+                for (int j = 0; j < a_len; ++j) {
+                    ids[i * (a_len + b_len) + j] = a[i * a_len + j];
+                }
+                for (int j = 0; j < b_len; ++j) {
+                    ids[i * (a_len + b_len) + a_len + j] = b[i * b_len + j];
+                }
+            }
+            return ids;
+        }
+
+        std::vector<std::vector<float>> gen_ids(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents) {
+            auto txt_ids = gen_txt_ids(bs, context_len);
+            auto img_ids = gen_img_ids(h, w, patch_size, bs);
+
+            auto ids = concat_ids(txt_ids, img_ids, bs);
+            uint64_t curr_h_offset = 0;
+            uint64_t curr_w_offset = 0;
+            for (ggml_tensor* ref : ref_latents) {
+                uint64_t h_offset = 0;
+                uint64_t w_offset = 0;
+                if (ref->ne[1] + curr_h_offset > ref->ne[0] + curr_w_offset) {
+                    w_offset = curr_w_offset;
+                } else {
+                    h_offset = curr_h_offset;
+                }
+
+                auto ref_ids = gen_img_ids(ref->ne[1], ref->ne[0], patch_size, bs, 1, h_offset, w_offset);
+                ids = concat_ids(ids, ref_ids, bs);
+
+                curr_h_offset = std::max(curr_h_offset, ref->ne[1] + h_offset);
+                curr_w_offset = std::max(curr_w_offset, ref->ne[0] + w_offset);
+            }
+            return ids;
         }

         // Generate positional embeddings
-        std::vector<float> gen_pe(std::vector<struct ggml_tensor*> imgs, struct ggml_tensor* context, int patch_size, int theta, const std::vector<int>& axes_dim) {
-            int context_len = context->ne[1];
-            int bs = imgs[0]->ne[3];
-
-            std::vector<std::vector<float>> img_ids;
-            for (int i = 0; i < imgs.size(); i++) {
-                auto x = imgs[i];
-                if (x) {
-                    int h = x->ne[1];
-                    int w = x->ne[0];
-                    std::vector<std::vector<float>> img_ids_i = gen_ids(h, w, patch_size, i);
-                    img_ids.insert(img_ids.end(), img_ids_i.begin(), img_ids_i.end());
-                }
-            }
-
-            std::vector<std::vector<float>> txt_ids(bs * context_len, std::vector<float>(3, 0.0));
-            std::vector<std::vector<float>> ids(bs * (context_len + img_ids.size()), std::vector<float>(3));
-            for (int i = 0; i < bs; ++i) {
-                for (int j = 0; j < context_len; ++j) {
-                    ids[i * (context_len + img_ids.size()) + j] = txt_ids[j];
-                }
-                for (int j = 0; j < img_ids.size(); ++j) {
-                    ids[i * (context_len + img_ids.size()) + context_len + j] = img_ids[j];
-                }
-            }
+        std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents, int theta, const std::vector<int>& axes_dim) {
+            std::vector<std::vector<float>> ids = gen_ids(h, w, patch_size, bs, context_len, ref_latents);

             std::vector<std::vector<float>> trans_ids = transpose(ids);
             size_t pos_len = ids.size();

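A minimal standalone sketch (not from the commit) of the positional-ID layout the new gen_txt_ids/gen_img_ids/concat_ids helpers build: text tokens get all-zero IDs, the main image grid uses index 0, and each reference latent uses index 1 plus a row/column offset so its grid does not overlap the main image's grid. The sizes and offsets below are made up for readability; in the commit the offsets are accumulated from each reference latent's ne[0]/ne[1].

// Standalone illustration only (bs = 1, no ggml). Each token ID is [index, row, col].
#include <cstdio>
#include <vector>

struct Id { float index, row, col; };

static std::vector<Id> img_ids(int h, int w, int patch_size, float index,
                               int h_offset, int w_offset) {
    int h_len = (h + patch_size / 2) / patch_size;
    int w_len = (w + patch_size / 2) / patch_size;
    std::vector<Id> ids;
    for (int i = 0; i < h_len; ++i)
        for (int j = 0; j < w_len; ++j)
            ids.push_back({index, float(i + h_offset), float(j + w_offset)});
    return ids;
}

int main() {
    const int patch_size = 2;
    std::vector<Id> ids(2, Id{0, 0, 0});                 // 2 text tokens (real context_len is much larger)
    auto main_ids = img_ids(4, 4, patch_size, 0, 0, 0);  // main 4x4 latent, index 0
    auto ref_ids  = img_ids(4, 4, patch_size, 1, 0, 2);  // reference latent, index 1, shifted in the col direction
    ids.insert(ids.end(), main_ids.begin(), main_ids.end());
    ids.insert(ids.end(), ref_ids.begin(), ref_ids.end());
    for (const Id& id : ids)
        printf("[%g %g %g]\n", id.index, id.row, id.col);
    return 0;
}
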
@@ -843,7 +873,7 @@ namespace Flux {
         struct ggml_tensor* guidance,
         struct ggml_tensor* pe,
         struct ggml_tensor* arange = NULL,
-        std::vector<int> skip_layers = std::vector<int>()) {
+        std::vector<int> skip_layers = {}) {
         auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
         auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
         auto final_layer = std::dynamic_pointer_cast<LastLayer>(blocks["final_layer"]);

@@ -929,8 +959,23 @@ namespace Flux {
             return img;
         }

+        struct ggml_tensor* process_img(struct ggml_context* ctx,
+                                        struct ggml_tensor* x) {
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+            int64_t patch_size = 2;
+            int pad_h = (patch_size - H % patch_size) % patch_size;
+            int pad_w = (patch_size - W % patch_size) % patch_size;
+            x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+
+            // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+            auto img = patchify(ctx, x, patch_size);  // [N, h*w, C * patch_size * patch_size]
+            return img;
+        }
+
         struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    std::vector<struct ggml_tensor*> imgs,
+                                    struct ggml_tensor* x,
                                     struct ggml_tensor* timestep,
                                     struct ggml_tensor* context,
                                     struct ggml_tensor* c_concat,

@@ -938,8 +983,8 @@ namespace Flux {
                                     struct ggml_tensor* guidance,
                                     struct ggml_tensor* pe,
                                     struct ggml_tensor* arange = NULL,
-                                    std::vector<int> skip_layers = std::vector<int>(),
-                                    SDVersion version = VERSION_FLUX) {
+                                    std::vector<ggml_tensor*> ref_latents = {},
+                                    std::vector<int> skip_layers = {}) {
             // Forward pass of DiT.
             // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
             // timestep: (N,) tensor of diffusion timesteps

|
@ -950,47 +995,41 @@ namespace Flux {
|
||||||
// pe: (L, d_head/2, 2, 2)
|
// pe: (L, d_head/2, 2, 2)
|
||||||
// return: (N, C, H, W)
|
// return: (N, C, H, W)
|
||||||
|
|
||||||
auto x = imgs[0];
|
|
||||||
GGML_ASSERT(x->ne[3] == 1);
|
GGML_ASSERT(x->ne[3] == 1);
|
||||||
|
|
||||||
int64_t W = x->ne[0];
|
int64_t W = x->ne[0];
|
||||||
int64_t H = x->ne[1];
|
int64_t H = x->ne[1];
|
||||||
int64_t C = x->ne[2];
|
int64_t C = x->ne[2];
|
||||||
int64_t patch_size = 2;
|
int64_t patch_size = 2;
|
||||||
int pad_h = (patch_size - x->ne[0] % patch_size) % patch_size;
|
int pad_h = (patch_size - H % patch_size) % patch_size;
|
||||||
int pad_w = (patch_size - x->ne[1] % patch_size) % patch_size;
|
int pad_w = (patch_size - W % patch_size) % patch_size;
|
||||||
|
|
||||||
// img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
|
auto img = process_img(ctx, x);
|
||||||
ggml_tensor* img = NULL; // [N, h*w, C * patch_size * patch_size]
|
uint64_t img_tokens = img->ne[1];
|
||||||
int64_t patchified_img_size;
|
|
||||||
for (auto& x : imgs) {
|
|
||||||
int pad_h = (patch_size - x->ne[0] % patch_size) % patch_size;
|
|
||||||
int pad_w = (patch_size - x->ne[1] % patch_size) % patch_size;
|
|
||||||
ggml_tensor* pad_x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);
|
|
||||||
pad_x = patchify(ctx, pad_x, patch_size);
|
|
||||||
if (img) {
|
|
||||||
img = ggml_concat(ctx, img, pad_x, 1);
|
|
||||||
} else {
|
|
||||||
img = pad_x;
|
|
||||||
patchified_img_size = img->ne[1];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (c_concat != NULL) {
|
if (c_concat != NULL) {
|
||||||
ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
|
ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
|
||||||
ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
|
ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
|
||||||
|
|
||||||
masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0);
|
masked = process_img(ctx, masked);
|
||||||
mask = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0);
|
mask = process_img(ctx, mask);
|
||||||
|
|
||||||
masked = patchify(ctx, masked, patch_size);
|
|
||||||
mask = patchify(ctx, mask, patch_size);
|
|
||||||
|
|
||||||
img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
|
img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, arange, skip_layers); // [N, h*w, C * patch_size * patch_size]
|
if (ref_latents.size() > 0) {
|
||||||
out = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], patchified_img_size, out->nb[1], 0));
|
for (ggml_tensor* ref : ref_latents) {
|
||||||
|
ref = process_img(ctx, ref);
|
||||||
|
img = ggml_concat(ctx, img, ref, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
|
||||||
|
if (out->ne[1] > img_tokens) {
|
||||||
|
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
|
||||||
|
out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
|
||||||
|
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
|
||||||
|
}
|
||||||
|
|
||||||
// rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
|
// rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
|
||||||
out = unpatchify(ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size, patch_size); // [N, C, H + pad_h, W + pad_w]
|
out = unpatchify(ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size, patch_size); // [N, C, H + pad_h, W + pad_w]
|
||||||
|
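The token bookkeeping in the new forward() is easier to see with concrete numbers. A standalone sketch (values invented for illustration; patchify/unpatchify and ggml are left out): the main latent is padded to a multiple of patch_size, every reference latent contributes its own patch tokens, and after forward_orig() only the first img_tokens rows are kept for unpatchify.

// Token-count arithmetic behind the ref_latents handling above.
#include <cstdio>

static long patch_tokens(long H, long W, long patch_size) {
    long pad_h = (patch_size - H % patch_size) % patch_size;
    long pad_w = (patch_size - W % patch_size) % patch_size;
    return ((H + pad_h) / patch_size) * ((W + pad_w) / patch_size);
}

int main() {
    const long patch_size = 2;
    long img_tokens = patch_tokens(128, 96, patch_size);   // main latent (illustrative size)
    long ref_tokens = patch_tokens(128, 128, patch_size);  // one reference latent
    long total      = img_tokens + ref_tokens;             // token count fed to forward_orig
    printf("img_tokens=%ld ref_tokens=%ld total=%ld\n", img_tokens, ref_tokens, total);
    // forward() slices the output back down to img_tokens before unpatchify,
    // since only the main image is decoded.
    return 0;
}
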
@@ -1076,8 +1115,8 @@ namespace Flux {
                                         struct ggml_tensor* c_concat,
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
-                                        std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                                        std::vector<ggml_tensor*> ref_latents = {},
                                         std::vector<int> skip_layers = std::vector<int>()) {
             GGML_ASSERT(x->ne[3] == 1);
             struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);

|
||||||
if (c_concat != NULL) {
|
if (c_concat != NULL) {
|
||||||
c_concat = to_backend(c_concat);
|
c_concat = to_backend(c_concat);
|
||||||
}
|
}
|
||||||
for (auto &img : kontext_imgs){
|
|
||||||
img = to_backend(img);
|
|
||||||
}
|
|
||||||
if (flux_params.is_chroma) {
|
if (flux_params.is_chroma) {
|
||||||
const char* SD_CHROMA_ENABLE_GUIDANCE = getenv("SD_CHROMA_ENABLE_GUIDANCE");
|
const char* SD_CHROMA_ENABLE_GUIDANCE = getenv("SD_CHROMA_ENABLE_GUIDANCE");
|
||||||
bool disable_guidance = true;
|
bool disable_guidance = true;
|
||||||
|
@ -1131,10 +1168,11 @@ namespace Flux {
|
||||||
if (flux_params.guidance_embed || flux_params.is_chroma) {
|
if (flux_params.guidance_embed || flux_params.is_chroma) {
|
||||||
guidance = to_backend(guidance);
|
guidance = to_backend(guidance);
|
||||||
}
|
}
|
||||||
auto imgs = kontext_imgs;
|
for (int i = 0; i < ref_latents.size(); i++) {
|
||||||
imgs.insert(imgs.begin(), x);
|
ref_latents[i] = to_backend(ref_latents[i]);
|
||||||
|
}
|
||||||
|
|
||||||
pe_vec = flux.gen_pe(imgs, context, 2, flux_params.theta, flux_params.axes_dim);
|
pe_vec = flux.gen_pe(x->ne[1], x->ne[0], 2, x->ne[3], context->ne[1], ref_latents, flux_params.theta, flux_params.axes_dim);
|
||||||
int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
|
int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
|
||||||
// LOG_DEBUG("pos_len %d", pos_len);
|
// LOG_DEBUG("pos_len %d", pos_len);
|
||||||
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
|
auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
|
||||||
|
@@ -1144,7 +1182,7 @@ namespace Flux {
             set_backend_tensor_data(pe, pe_vec.data());

             struct ggml_tensor* out = flux.forward(compute_ctx,
-                                                   imgs,
+                                                   x,
                                                    timesteps,
                                                    context,
                                                    c_concat,

@@ -1152,6 +1190,7 @@ namespace Flux {
                                                    guidance,
                                                    pe,
                                                    precompute_arange,
+                                                   ref_latents,
                                                    skip_layers);

             ggml_build_forward_expand(gf, out);

@@ -1166,17 +1205,17 @@ namespace Flux {
                      struct ggml_tensor* c_concat,
                      struct ggml_tensor* y,
                      struct ggml_tensor* guidance,
-                     std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                     std::vector<ggml_tensor*> ref_latents = {},
                      struct ggml_tensor** output = NULL,
                      struct ggml_context* output_ctx = NULL,
                      std::vector<int> skip_layers = std::vector<int>()) {
            // x: [N, in_channels, h, w]
            // timesteps: [N, ]
            // context: [N, max_position, hidden_size]
            // y: [N, adm_in_channels] or [1, adm_in_channels]
            // guidance: [N, ]
            auto get_graph = [&]() -> struct ggml_cgraph* {
-               return build_graph(x, timesteps, context, c_concat, y, guidance, kontext_imgs, skip_layers);
+               return build_graph(x, timesteps, context, c_concat, y, guidance, ref_latents, skip_layers);
            };

            GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);

@@ -1216,7 +1255,7 @@ namespace Flux {
         struct ggml_tensor* out = NULL;

         int t0 = ggml_time_ms();
-        compute(8, x, timesteps, context, NULL, y, guidance, std::vector<struct ggml_tensor*>(), &out, work_ctx);
+        compute(8, x, timesteps, context, NULL, y, guidance, {}, &out, work_ctx);
         int t1 = ggml_time_ms();

         print_ggml_tensor(out);

@@ -54,6 +54,7 @@ const char* modes_str[] = {
     "txt2img",
     "img2img",
     "img2vid",
+    "edit",
     "convert",
 };

@@ -61,6 +62,7 @@ enum SDMode {
     TXT2IMG,
     IMG2IMG,
     IMG2VID,
+    EDIT,
     CONVERT,
     MODE_COUNT
 };

@@ -86,8 +88,7 @@ struct SDParams {
     std::string input_path;
     std::string mask_path;
     std::string control_image_path;
-    std::vector<std::string> kontext_image_paths;
+    std::vector<std::string> ref_image_paths;

     std::string prompt;
     std::string negative_prompt;

@@ -153,6 +154,10 @@ void print_params(SDParams params) {
     printf(" init_img: %s\n", params.input_path.c_str());
     printf(" mask_img: %s\n", params.mask_path.c_str());
     printf(" control_image: %s\n", params.control_image_path.c_str());
+    printf(" ref_images_paths:\n");
+    for (auto& path : params.ref_image_paths) {
+        printf(" %s\n", path.c_str());
+    };
     printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false");
     printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
     printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");

@@ -207,6 +212,7 @@ void print_usage(int argc, const char* argv[]) {
     printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
     printf(" --mask [MASK] path to the mask image, required by img2img with mask\n");
     printf(" --control-image [IMAGE] path to image condition, control net\n");
+    printf(" -r, --ref_image [PATH] reference image for Flux Kontext models (can be used multiple times) \n");
     printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
     printf(" -p, --prompt [PROMPT] the prompt to render\n");
     printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");

@@ -242,9 +248,8 @@ void print_usage(int argc, const char* argv[]) {
     printf(" This might crash if it is not supported by the backend.\n");
     printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
     printf(" --canny apply canny preprocessor (edge detection)\n");
-    printf(" --color Colors the logging tags according to level\n");
+    printf(" --color colors the logging tags according to level\n");
     printf(" -v, --verbose print extra info\n");
-    printf(" -ki, --kontext_img [PATH] Reference image for Flux Kontext models (can be used multiple times) \n");
 }

 void parse_args(int argc, const char** argv, SDParams& params) {

@@ -629,12 +634,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.skip_layer_end = std::stof(argv[i]);
-        } else if (arg == "-ki" || arg == "--kontext-img") {
+        } else if (arg == "-r" || arg == "--ref-image") {
             if (++i >= argc) {
                 invalid_arg = true;
                 break;
             }
-            params.kontext_image_paths.push_back(argv[i]);
+            params.ref_image_paths.push_back(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);

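A standalone sketch of the repeatable -r/--ref-image flag wired up above; it mirrors how parse_args appends each occurrence to params.ref_image_paths. The names here are local to the sketch, not the real SDParams struct.

// Minimal argument loop collecting repeated -r/--ref-image values.
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

int main(int argc, const char** argv) {
    std::vector<std::string> ref_image_paths;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-r") == 0 || strcmp(argv[i], "--ref-image") == 0) {
            if (++i >= argc) {
                fprintf(stderr, "error: missing value for --ref-image\n");
                return 1;
            }
            ref_image_paths.push_back(argv[i]);  // each occurrence adds one path
        }
    }
    for (const auto& p : ref_image_paths)
        printf("ref image: %s\n", p.c_str());
    return 0;
}
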
@@ -663,7 +668,13 @@ void parse_args(int argc, const char** argv, SDParams& params) {
     }

     if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
-        fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
+        fprintf(stderr, "error: when using the img2img/img2vid mode, the following arguments are required: init-img\n");
+        print_usage(argc, argv);
+        exit(1);
+    }
+
+    if (params.mode == EDIT && params.ref_image_paths.size() == 0) {
+        fprintf(stderr, "error: when using the edit mode, the following arguments are required: ref-image\n");
         print_usage(argc, argv);
         exit(1);
     }

@@ -827,43 +838,12 @@ int main(int argc, const char* argv[]) {
         fprintf(stderr, "SVD support is broken, do not use it!!!\n");
         return 1;
     }
-    bool vae_decode_only = true;
-
-    std::vector<sd_image_t> kontext_imgs;
-    for (auto& path : params.kontext_image_paths) {
-        vae_decode_only = false;
-        int c = 0;
-        int width = 0;
-        int height = 0;
-        uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3);
-        if (image_buffer == NULL) {
-            fprintf(stderr, "load image from '%s' failed\n", path.c_str());
-            return 1;
-        }
-        if (c < 3) {
-            fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
-            free(image_buffer);
-            return 1;
-        }
-        if (width <= 0) {
-            fprintf(stderr, "error: the width of image must be greater than 0\n");
-            free(image_buffer);
-            return 1;
-        }
-        if (height <= 0) {
-            fprintf(stderr, "error: the height of image must be greater than 0\n");
-            free(image_buffer);
-            return 1;
-        }
-        kontext_imgs.push_back({(uint32_t)width,
-                                (uint32_t)height,
-                                3,
-                                image_buffer});
-    }
-
+    bool vae_decode_only = true;
     uint8_t* input_image_buffer = NULL;
     uint8_t* control_image_buffer = NULL;
     uint8_t* mask_image_buffer = NULL;
+    std::vector<sd_image_t> ref_images;
+
     if (params.mode == IMG2IMG || params.mode == IMG2VID) {
         vae_decode_only = false;

@@ -915,6 +895,37 @@ int main(int argc, const char* argv[]) {
             free(input_image_buffer);
             input_image_buffer = resized_image_buffer;
         }
+    } else if (params.mode == EDIT) {
+        vae_decode_only = false;
+        for (auto& path : params.ref_image_paths) {
+            int c = 0;
+            int width = 0;
+            int height = 0;
+            uint8_t* image_buffer = stbi_load(path.c_str(), &width, &height, &c, 3);
+            if (image_buffer == NULL) {
+                fprintf(stderr, "load image from '%s' failed\n", path.c_str());
+                return 1;
+            }
+            if (c < 3) {
+                fprintf(stderr, "the number of channels for the input image must be >= 3, but got %d channels\n", c);
+                free(image_buffer);
+                return 1;
+            }
+            if (width <= 0) {
+                fprintf(stderr, "error: the width of image must be greater than 0\n");
+                free(image_buffer);
+                return 1;
+            }
+            if (height <= 0) {
+                fprintf(stderr, "error: the height of image must be greater than 0\n");
+                free(image_buffer);
+                return 1;
+            }
+            ref_images.push_back({(uint32_t)width,
+                                  (uint32_t)height,
+                                  3,
+                                  image_buffer});
+        }
     }

     sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),

|
||||||
params.style_ratio,
|
params.style_ratio,
|
||||||
params.normalize_input,
|
params.normalize_input,
|
||||||
params.input_id_images_path.c_str(),
|
params.input_id_images_path.c_str(),
|
||||||
kontext_imgs.data(), kontext_imgs.size(),
|
|
||||||
params.skip_layers.data(),
|
params.skip_layers.data(),
|
||||||
params.skip_layers.size(),
|
params.skip_layers.size(),
|
||||||
params.slg_scale,
|
params.slg_scale,
|
||||||
params.skip_layer_start,
|
params.skip_layer_start,
|
||||||
params.skip_layer_end,
|
params.skip_layer_end);
|
||||||
nullptr);
|
} else if (params.mode == IMG2IMG || params.mode == IMG2VID) {
|
||||||
} else {
|
|
||||||
sd_image_t input_image = {(uint32_t)params.width,
|
sd_image_t input_image = {(uint32_t)params.width,
|
||||||
(uint32_t)params.height,
|
(uint32_t)params.height,
|
||||||
3,
|
3,
|
||||||
|
@@ -1072,14 +1081,38 @@ int main(int argc, const char* argv[]) {
                                params.style_ratio,
                                params.normalize_input,
                                params.input_id_images_path.c_str(),
-                               kontext_imgs.data(), kontext_imgs.size(),
                                params.skip_layers.data(),
                                params.skip_layers.size(),
                                params.slg_scale,
                                params.skip_layer_start,
-                               params.skip_layer_end,
-                               nullptr);
+                               params.skip_layer_end);
         }
+    } else { // EDIT
+        results = edit(sd_ctx,
+                       ref_images.data(),
+                       ref_images.size(),
+                       params.prompt.c_str(),
+                       params.negative_prompt.c_str(),
+                       params.clip_skip,
+                       params.cfg_scale,
+                       params.guidance,
+                       params.eta,
+                       params.width,
+                       params.height,
+                       params.sample_method,
+                       params.sample_steps,
+                       params.strength,
+                       params.seed,
+                       params.batch_count,
+                       control_image,
+                       params.control_strength,
+                       params.style_ratio,
+                       params.normalize_input,
+                       params.skip_layers.data(),
+                       params.skip_layers.size(),
+                       params.slg_scale,
+                       params.skip_layer_start,
+                       params.skip_layer_end);
     }

     if (results == NULL) {

@@ -1117,11 +1150,11 @@ int main(int argc, const char* argv[]) {

     std::string dummy_name, ext, lc_ext;
     bool is_jpg;
     size_t last = params.output_path.find_last_of(".");
     size_t last_path = std::min(params.output_path.find_last_of("/"),
                                 params.output_path.find_last_of("\\"));
     if (last != std::string::npos // filename has extension
         && (last_path == std::string::npos || last > last_path)) {
         dummy_name = params.output_path.substr(0, last);
         ext = lc_ext = params.output_path.substr(last);
         std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower);

@@ -1129,7 +1162,7 @@ int main(int argc, const char* argv[]) {
     } else {
         dummy_name = params.output_path;
         ext = lc_ext = "";
         is_jpg = false;
     }
     // appending ".png" to absent or unknown extension
     if (!is_jpg && lc_ext != ".png") {

|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
|
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
|
||||||
if (is_jpg) {
|
if(is_jpg) {
|
||||||
stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
|
stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
|
||||||
results[i].data, 90);
|
results[i].data, 90);
|
||||||
printf("save result JPEG image to '%s'\n", final_image_path.c_str());
|
printf("save result JPEG image to '%s'\n", final_image_path.c_str());
|
||||||
|
|
|
@@ -587,7 +587,47 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
             extraimage_buffers.push_back(kcpp_base64_decode(extra_image_data[i]));
             input_extraimage_buffers.push_back(stbi_load_from_memory(extraimage_buffers[i].data(), extraimage_buffers[i].size(), &nx2, &ny2, &nc2, desiredchannels));
             // Resize the image
-            int resok = stbir_resize_uint8(input_extraimage_buffers[i], nx2, ny2, 0, resized_extraimage_bufs[i].data(), img2imgW, img2imgH, 0, desiredchannels);
+            int desiredWidth = nx2;
+            int desiredHeight = ny2;
+            float aspect_ratio = static_cast<float>(nx2) / ny2;
+            int maxsize = 1024; // no image can exceed this
+            int minsize = 256;
+
+            if (desiredWidth > maxsize || desiredHeight > maxsize) { // Enforce maxsize first
+                if (aspect_ratio > 1.0f) { // wider than tall
+                    desiredWidth = maxsize;
+                    desiredHeight = static_cast<int>(maxsize / aspect_ratio);
+                } else { // taller than wide or square
+                    desiredHeight = maxsize;
+                    desiredWidth = static_cast<int>(maxsize * aspect_ratio);
+                }
+            }
+
+            if (desiredWidth < minsize || desiredHeight < minsize) { // Enforce minsize only if it won't exceed maxsize
+                if (aspect_ratio > 1.0f) { // wider than tall
+                    // Try to scale width up to max of (minsize, maxsize)
+                    desiredWidth = std::min(maxsize, std::max(minsize, desiredWidth));
+                    desiredHeight = static_cast<int>(desiredWidth / aspect_ratio);
+                    if (desiredHeight > maxsize) { // If height now exceeds maxsize, clamp based on height instead
+                        desiredHeight = maxsize;
+                        desiredWidth = static_cast<int>(maxsize * aspect_ratio);
+                    }
+                } else {
+                    // Taller than wide or square
+                    desiredHeight = std::min(maxsize, std::max(minsize, desiredHeight));
+                    desiredWidth = static_cast<int>(desiredHeight * aspect_ratio);
+                    if (desiredWidth > maxsize) {
+                        desiredWidth = maxsize;
+                        desiredHeight = static_cast<int>(maxsize / aspect_ratio);
+                    }
+                }
+            }
+
+            if(!sd_is_quiet && sddebugmode==1)
+            {
+                printf("Resize Extraimg: %dx%d to %dx%d\n",nx2,ny2,desiredWidth,desiredHeight);
+            }
+            int resok = stbir_resize_uint8(input_extraimage_buffers[i], nx2, ny2, 0, resized_extraimage_bufs[i].data(), desiredWidth, desiredHeight, 0, desiredchannels);
             if (!resok) {
                 printf("\nKCPP SD: resize extra image failed!\n");
                 output.data = "";

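The new bounds logic above is easier to read outside the adapter. A standalone sketch of the same clamp: preserve aspect ratio, cap the longer side at 1024, and if either side is below 256, scale up (re-clamping if the other side would exceed 1024). The sample sizes in main() are invented, not taken from the commit.

// Self-contained recap of the extra-image resize bounds.
#include <algorithm>
#include <cstdio>

static void clamp_size(int& w, int& h, int minsize = 256, int maxsize = 1024) {
    float aspect_ratio = static_cast<float>(w) / h;
    if (w > maxsize || h > maxsize) {          // enforce maxsize first
        if (aspect_ratio > 1.0f) {             // wider than tall
            w = maxsize;
            h = static_cast<int>(maxsize / aspect_ratio);
        } else {                               // taller than wide or square
            h = maxsize;
            w = static_cast<int>(maxsize * aspect_ratio);
        }
    }
    if (w < minsize || h < minsize) {          // then enforce minsize where possible
        if (aspect_ratio > 1.0f) {
            w = std::min(maxsize, std::max(minsize, w));
            h = static_cast<int>(w / aspect_ratio);
            if (h > maxsize) {
                h = maxsize;
                w = static_cast<int>(maxsize * aspect_ratio);
            }
        } else {
            h = std::min(maxsize, std::max(minsize, h));
            w = static_cast<int>(h * aspect_ratio);
            if (w > maxsize) {
                w = maxsize;
                h = static_cast<int>(maxsize / aspect_ratio);
            }
        }
    }
}

int main() {
    int sizes[][2] = {{3000, 1500}, {200, 100}, {512, 512}};
    for (auto& s : sizes) {
        int w = s[0], h = s[1];
        clamp_size(w, h);
        printf("%dx%d -> %dx%d\n", s[0], s[1], w, h);
    }
    return 0;
}
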
@@ -595,8 +635,8 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
                 return output;
             }
             sd_image_t extraimage_reference;
-            extraimage_reference.width = img2imgW;
-            extraimage_reference.height = img2imgH;
+            extraimage_reference.width = desiredWidth;
+            extraimage_reference.height = desiredHeight;
             extraimage_reference.channel = desiredchannels;
             extraimage_reference.data = resized_extraimage_bufs[i].data();
             extraimage_references.push_back(extraimage_reference);

@@ -716,6 +756,10 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
         }

         // Resize the image
+        if(!sd_is_quiet && sddebugmode==1)
+        {
+            printf("Resize Img2Img: %dx%d to %dx%d\n",nx,ny,img2imgW,img2imgH);
+        }
         int resok = stbir_resize_uint8(input_image_buffer, nx, ny, 0, resized_image_buf.data(), img2imgW, img2imgH, 0, img2imgC);
         if (!resok) {
             printf("\nKCPP SD: resize image failed!\n");

@@ -735,6 +779,10 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
         image_mask_buffer = kcpp_base64_decode(img2img_mask);
         input_mask_buffer = stbi_load_from_memory(image_mask_buffer.data(), image_mask_buffer.size(), &nx2, &ny2, &nc2, 1);
         // Resize the image
+        if(!sd_is_quiet && sddebugmode==1)
+        {
+            printf("Resize Mask: %dx%d to %dx%d\n",nx2,ny2,img2imgW,img2imgH);
+        }
         int resok = stbir_resize_uint8(input_mask_buffer, nx2, ny2, 0, resized_mask_buf.data(), img2imgW, img2imgH, 0, 1);
         if (!resok) {
             printf("\nKCPP SD: resize image failed!\n");

@@ -678,7 +678,7 @@ public:

         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
-        diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, std::vector<struct ggml_tensor*>(), &out);
+        diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, {}, -1, {}, 0.f, &out);
         diffusion_model->free_compute_buffer();

         double result = 0.f;

@@ -892,12 +892,12 @@ public:
                 const std::vector<float>& sigmas,
                 int start_merge_step,
                 SDCondition id_cond,
-                std::vector<int> skip_layers = {},
-                float slg_scale = 0,
-                float skip_layer_start = 0.01,
-                float skip_layer_end = 0.2,
-                std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
-                ggml_tensor* noise_mask = NULL) {
+                std::vector<ggml_tensor*> ref_latents = {},
+                std::vector<int> skip_layers = {},
+                float slg_scale = 0,
+                float skip_layer_start = 0.01,
+                float skip_layer_end = 0.2,
+                ggml_tensor* noise_mask = nullptr) {
         LOG_DEBUG("Sample");
         struct ggml_init_params params;
         size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);

@@ -980,10 +980,10 @@ public:
                                      cond.c_concat,
                                      cond.c_vector,
                                      guidance_tensor,
+                                     ref_latents,
                                      -1,
                                      controls,
                                      control_strength,
-                                     kontext_imgs,
                                      &out_cond);
         } else {
             diffusion_model->compute(n_threads,

@@ -993,10 +993,10 @@ public:
                                      cond.c_concat,
                                      id_cond.c_vector,
                                      guidance_tensor,
+                                     ref_latents,
                                      -1,
                                      controls,
                                      control_strength,
-                                     kontext_imgs,
                                      &out_cond);
         }

@@ -1014,10 +1014,10 @@ public:
                                      uncond.c_concat,
                                      uncond.c_vector,
                                      guidance_tensor,
+                                     ref_latents,
                                      -1,
                                      controls,
                                      control_strength,
-                                     kontext_imgs,
                                      &out_uncond);
             negative_data = (float*)out_uncond->data;
         }

@@ -1035,10 +1035,10 @@ public:
                                      cond.c_concat,
                                      cond.c_vector,
                                      guidance_tensor,
+                                     ref_latents,
                                      -1,
                                      controls,
                                      control_strength,
-                                     kontext_imgs,
                                      &out_skip,
                                      NULL,
                                      skip_layers);

@@ -1416,12 +1416,12 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float style_ratio,
                            bool normalize_input,
                            std::string input_id_images_path,
-                           std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                           std::vector<ggml_tensor*> ref_latents,
                            std::vector<int> skip_layers = {},
                            float slg_scale = 0,
                            float skip_layer_start = 0.01,
                            float skip_layer_end = 0.2,
                            ggml_tensor* masked_image = NULL,
                            const std::vector<sd_image_t*> photomaker_references = std::vector<sd_image_t*>()) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.

@@ -1712,11 +1712,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                          sigmas,
                                          start_merge_step,
                                          id_cond,
+                                         ref_latents,
                                          skip_layers,
                                          slg_scale,
                                          skip_layer_start,
                                          skip_layer_end,
-                                         kontext_imgs,
                                          noise_mask);

     // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");