mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-12 09:59:41 +00:00
sync with sd.cpp
This commit is contained in:
parent
e5af9b5ea9
commit
186227fc26
8 changed files with 234 additions and 82 deletions
|
@ -597,7 +597,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
|
||||||
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
|
GGML_ASSERT(it != tokens.end()); // prompt must have trigger word
|
||||||
tokens.erase(it);
|
tokens.erase(it);
|
||||||
return decode(tokens);
|
return decode(tokens);
|
||||||
//return prompt; //kcpp we don't care about photomaker trigger words
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SDCondition get_learned_condition(ggml_context* work_ctx,
|
SDCondition get_learned_condition(ggml_context* work_ctx,
|
||||||
|
@ -903,6 +902,7 @@ struct SD3CLIPEmbedder : public Conditioner {
|
||||||
|
|
||||||
t5->compute(n_threads,
|
t5->compute(n_threads,
|
||||||
input_ids,
|
input_ids,
|
||||||
|
NULL,
|
||||||
&chunk_hidden_states_t5,
|
&chunk_hidden_states_t5,
|
||||||
work_ctx);
|
work_ctx);
|
||||||
{
|
{
|
||||||
|
@ -1148,6 +1148,7 @@ struct FluxCLIPEmbedder : public Conditioner {
|
||||||
|
|
||||||
t5->compute(n_threads,
|
t5->compute(n_threads,
|
||||||
input_ids,
|
input_ids,
|
||||||
|
NULL,
|
||||||
&chunk_hidden_states,
|
&chunk_hidden_states,
|
||||||
work_ctx);
|
work_ctx);
|
||||||
{
|
{
|
||||||
|
@ -1223,10 +1224,15 @@ struct PixArtCLIPEmbedder : public Conditioner {
|
||||||
T5UniGramTokenizer t5_tokenizer;
|
T5UniGramTokenizer t5_tokenizer;
|
||||||
std::shared_ptr<T5Runner> t5;
|
std::shared_ptr<T5Runner> t5;
|
||||||
size_t chunk_len = 512;
|
size_t chunk_len = 512;
|
||||||
|
bool use_mask = false;
|
||||||
|
int mask_pad = 1;
|
||||||
|
|
||||||
PixArtCLIPEmbedder(ggml_backend_t backend,
|
PixArtCLIPEmbedder(ggml_backend_t backend,
|
||||||
std::map<std::string, enum ggml_type>& tensor_types,
|
std::map<std::string, enum ggml_type>& tensor_types,
|
||||||
int clip_skip = -1) {
|
int clip_skip = -1,
|
||||||
|
bool use_mask = false,
|
||||||
|
int mask_pad = 1)
|
||||||
|
: use_mask(use_mask), mask_pad(mask_pad) {
|
||||||
t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
|
t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1323,16 +1329,6 @@ struct PixArtCLIPEmbedder : public Conditioner {
|
||||||
|
|
||||||
size_t chunk_count = t5_tokens.size() / chunk_len;
|
size_t chunk_count = t5_tokens.size() / chunk_len;
|
||||||
|
|
||||||
bool use_mask = false;
|
|
||||||
const char* SD_CHROMA_USE_T5_MASK = getenv("SD_CHROMA_USE_T5_MASK");
|
|
||||||
if (SD_CHROMA_USE_T5_MASK != nullptr) {
|
|
||||||
std::string sd_chroma_use_t5_mask_str = SD_CHROMA_USE_T5_MASK;
|
|
||||||
if (sd_chroma_use_t5_mask_str == "ON" || sd_chroma_use_t5_mask_str == "TRUE") {
|
|
||||||
use_mask = true;
|
|
||||||
} else if (sd_chroma_use_t5_mask_str != "OFF" && sd_chroma_use_t5_mask_str != "FALSE") {
|
|
||||||
LOG_WARN("SD_CHROMA_USE_T5_MASK environment variable has unexpected value. Assuming default (\"OFF\"). (Expected \"OFF\"/\"FALSE\" or\"ON\"/\"TRUE\", got \"%s\")", SD_CHROMA_USE_T5_MASK);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
|
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
|
||||||
// t5
|
// t5
|
||||||
std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
|
std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
|
||||||
|
@ -1347,9 +1343,9 @@ struct PixArtCLIPEmbedder : public Conditioner {
|
||||||
|
|
||||||
t5->compute(n_threads,
|
t5->compute(n_threads,
|
||||||
input_ids,
|
input_ids,
|
||||||
|
t5_attn_mask_chunk,
|
||||||
&chunk_hidden_states,
|
&chunk_hidden_states,
|
||||||
work_ctx,
|
work_ctx);
|
||||||
t5_attn_mask_chunk);
|
|
||||||
{
|
{
|
||||||
auto tensor = chunk_hidden_states;
|
auto tensor = chunk_hidden_states;
|
||||||
float original_mean = ggml_tensor_mean(tensor);
|
float original_mean = ggml_tensor_mean(tensor);
|
||||||
|
@ -1391,18 +1387,6 @@ struct PixArtCLIPEmbedder : public Conditioner {
|
||||||
ggml_set_f32(hidden_states, 0.f);
|
ggml_set_f32(hidden_states, 0.f);
|
||||||
}
|
}
|
||||||
|
|
||||||
int mask_pad = 1;
|
|
||||||
const char* SD_CHROMA_MASK_PAD_OVERRIDE = getenv("SD_CHROMA_MASK_PAD_OVERRIDE");
|
|
||||||
if (SD_CHROMA_MASK_PAD_OVERRIDE != nullptr) {
|
|
||||||
std::string mask_pad_str = SD_CHROMA_MASK_PAD_OVERRIDE;
|
|
||||||
try {
|
|
||||||
mask_pad = std::stoi(mask_pad_str);
|
|
||||||
} catch (const std::invalid_argument&) {
|
|
||||||
LOG_WARN("SD_CHROMA_MASK_PAD_OVERRIDE environment variable is not a valid integer (%s). Falling back to default (%d)", SD_CHROMA_MASK_PAD_OVERRIDE, mask_pad);
|
|
||||||
} catch (const std::out_of_range&) {
|
|
||||||
LOG_WARN("SD_CHROMA_MASK_PAD_OVERRIDE environment variable value is out of range for `int` type (%s). Falling back to default (%d)", SD_CHROMA_MASK_PAD_OVERRIDE, mask_pad);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad);
|
modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad);
|
||||||
|
|
||||||
return SDCondition(hidden_states, t5_attn_mask, NULL);
|
return SDCondition(hidden_states, t5_attn_mask, NULL);
|
||||||
|
|
|
@ -137,8 +137,9 @@ struct FluxModel : public DiffusionModel {
|
||||||
FluxModel(ggml_backend_t backend,
|
FluxModel(ggml_backend_t backend,
|
||||||
std::map<std::string, enum ggml_type>& tensor_types,
|
std::map<std::string, enum ggml_type>& tensor_types,
|
||||||
SDVersion version = VERSION_FLUX,
|
SDVersion version = VERSION_FLUX,
|
||||||
bool flash_attn = false)
|
bool flash_attn = false,
|
||||||
: flux(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
|
bool use_mask = false)
|
||||||
|
: flux(backend, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void alloc_params_buffer() {
|
void alloc_params_buffer() {
|
||||||
|
|
|
@ -744,10 +744,10 @@ namespace Flux {
|
||||||
return ids;
|
return ids;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Generate positional embeddings
|
// Generate positional embeddings
|
||||||
std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents, int theta, const std::vector<int>& axes_dim) {
|
std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, std::vector<ggml_tensor*> ref_latents, int theta, const std::vector<int>& axes_dim) {
|
||||||
std::vector<std::vector<float>> ids = gen_ids(h, w, patch_size, bs, context_len, ref_latents);
|
std::vector<std::vector<float>> ids = gen_ids(h, w, patch_size, bs, context_len, ref_latents);
|
||||||
|
|
||||||
std::vector<std::vector<float>> trans_ids = transpose(ids);
|
std::vector<std::vector<float>> trans_ids = transpose(ids);
|
||||||
size_t pos_len = ids.size();
|
size_t pos_len = ids.size();
|
||||||
int num_axes = axes_dim.size();
|
int num_axes = axes_dim.size();
|
||||||
|
@ -872,7 +872,7 @@ namespace Flux {
|
||||||
struct ggml_tensor* y,
|
struct ggml_tensor* y,
|
||||||
struct ggml_tensor* guidance,
|
struct ggml_tensor* guidance,
|
||||||
struct ggml_tensor* pe,
|
struct ggml_tensor* pe,
|
||||||
struct ggml_tensor* arange = NULL,
|
struct ggml_tensor* mod_index_arange = NULL,
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {}) {
|
||||||
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
|
auto img_in = std::dynamic_pointer_cast<Linear>(blocks["img_in"]);
|
||||||
auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
|
auto txt_in = std::dynamic_pointer_cast<Linear>(blocks["txt_in"]);
|
||||||
|
@ -887,9 +887,10 @@ namespace Flux {
|
||||||
auto distill_timestep = ggml_nn_timestep_embedding(ctx, timesteps, 16, 10000, 1000.f);
|
auto distill_timestep = ggml_nn_timestep_embedding(ctx, timesteps, 16, 10000, 1000.f);
|
||||||
auto distill_guidance = ggml_nn_timestep_embedding(ctx, guidance, 16, 10000, 1000.f);
|
auto distill_guidance = ggml_nn_timestep_embedding(ctx, guidance, 16, 10000, 1000.f);
|
||||||
|
|
||||||
// auto arange = ggml_arange(ctx, 0, (float)mod_index_length, 1); // Not working on a lot of backends, precomputing it on CPU instead
|
// auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1);
|
||||||
|
// ggml_arange is not working on a lot of backends, precomputing it on CPU instead
|
||||||
GGML_ASSERT(arange != NULL);
|
GGML_ASSERT(arange != NULL);
|
||||||
auto modulation_index = ggml_nn_timestep_embedding(ctx, arange, 32, 10000, 1000.f); // [1, 344, 32]
|
auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f); // [1, 344, 32]
|
||||||
|
|
||||||
// Batch broadcast (will it ever be useful)
|
// Batch broadcast (will it ever be useful)
|
||||||
modulation_index = ggml_repeat(ctx, modulation_index, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, modulation_index->ne[0], modulation_index->ne[1], img->ne[2])); // [N, 344, 32]
|
modulation_index = ggml_repeat(ctx, modulation_index, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, modulation_index->ne[0], modulation_index->ne[1], img->ne[2])); // [N, 344, 32]
|
||||||
|
@ -982,7 +983,7 @@ namespace Flux {
|
||||||
struct ggml_tensor* y,
|
struct ggml_tensor* y,
|
||||||
struct ggml_tensor* guidance,
|
struct ggml_tensor* guidance,
|
||||||
struct ggml_tensor* pe,
|
struct ggml_tensor* pe,
|
||||||
struct ggml_tensor* arange = NULL,
|
struct ggml_tensor* mod_index_arange = NULL,
|
||||||
std::vector<ggml_tensor*> ref_latents = {},
|
std::vector<ggml_tensor*> ref_latents = {},
|
||||||
std::vector<int> skip_layers = {}) {
|
std::vector<int> skip_layers = {}) {
|
||||||
// Forward pass of DiT.
|
// Forward pass of DiT.
|
||||||
|
@ -1024,7 +1025,7 @@ namespace Flux {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
|
auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers); // [N, num_tokens, C * patch_size * patch_size]
|
||||||
if (out->ne[1] > img_tokens) {
|
if (out->ne[1] > img_tokens) {
|
||||||
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
|
out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3)); // [num_tokens, N, C * patch_size * patch_size]
|
||||||
out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
|
out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
|
||||||
|
@ -1044,15 +1045,18 @@ namespace Flux {
|
||||||
public:
|
public:
|
||||||
FluxParams flux_params;
|
FluxParams flux_params;
|
||||||
Flux flux;
|
Flux flux;
|
||||||
std::vector<float> pe_vec, range; // for cache
|
std::vector<float> pe_vec;
|
||||||
|
std::vector<float> mod_index_arange_vec; // for cache
|
||||||
SDVersion version;
|
SDVersion version;
|
||||||
|
bool use_mask = false;
|
||||||
|
|
||||||
FluxRunner(ggml_backend_t backend,
|
FluxRunner(ggml_backend_t backend,
|
||||||
std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
|
std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
|
||||||
const std::string prefix = "",
|
const std::string prefix = "",
|
||||||
SDVersion version = VERSION_FLUX,
|
SDVersion version = VERSION_FLUX,
|
||||||
bool flash_attn = false)
|
bool flash_attn = false,
|
||||||
: GGMLRunner(backend) {
|
bool use_mask = false)
|
||||||
|
: GGMLRunner(backend), use_mask(use_mask) {
|
||||||
flux_params.flash_attn = flash_attn;
|
flux_params.flash_attn = flash_attn;
|
||||||
flux_params.guidance_embed = false;
|
flux_params.guidance_embed = false;
|
||||||
flux_params.depth = 0;
|
flux_params.depth = 0;
|
||||||
|
@ -1116,51 +1120,28 @@ namespace Flux {
|
||||||
struct ggml_tensor* y,
|
struct ggml_tensor* y,
|
||||||
struct ggml_tensor* guidance,
|
struct ggml_tensor* guidance,
|
||||||
std::vector<ggml_tensor*> ref_latents = {},
|
std::vector<ggml_tensor*> ref_latents = {},
|
||||||
std::vector<int> skip_layers = std::vector<int>()) {
|
std::vector<int> skip_layers = {}) {
|
||||||
GGML_ASSERT(x->ne[3] == 1);
|
GGML_ASSERT(x->ne[3] == 1);
|
||||||
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
|
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
|
||||||
|
|
||||||
struct ggml_tensor* precompute_arange = NULL;
|
struct ggml_tensor* mod_index_arange = NULL;
|
||||||
|
|
||||||
x = to_backend(x);
|
x = to_backend(x);
|
||||||
context = to_backend(context);
|
context = to_backend(context);
|
||||||
if (c_concat != NULL) {
|
if (c_concat != NULL) {
|
||||||
c_concat = to_backend(c_concat);
|
c_concat = to_backend(c_concat);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flux_params.is_chroma) {
|
if (flux_params.is_chroma) {
|
||||||
const char* SD_CHROMA_ENABLE_GUIDANCE = getenv("SD_CHROMA_ENABLE_GUIDANCE");
|
guidance = ggml_set_f32(guidance, 0);
|
||||||
bool disable_guidance = true;
|
|
||||||
if (SD_CHROMA_ENABLE_GUIDANCE != NULL) {
|
if (!use_mask) {
|
||||||
std::string enable_guidance_str = SD_CHROMA_ENABLE_GUIDANCE;
|
y = NULL;
|
||||||
if (enable_guidance_str == "ON" || enable_guidance_str == "TRUE") {
|
|
||||||
LOG_WARN("Chroma guidance has been enabled. Image might be broken. (SD_CHROMA_ENABLE_GUIDANCE env variable to \"OFF\" to disable)", SD_CHROMA_ENABLE_GUIDANCE);
|
|
||||||
disable_guidance = false;
|
|
||||||
} else if (enable_guidance_str != "OFF" && enable_guidance_str != "FALSE") {
|
|
||||||
LOG_WARN("SD_CHROMA_ENABLE_GUIDANCE environment variable has unexpected value. Assuming default (\"OFF\"). (Expected \"ON\"/\"TRUE\" or\"OFF\"/\"FALSE\", got \"%s\")", SD_CHROMA_ENABLE_GUIDANCE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (disable_guidance) {
|
|
||||||
// LOG_DEBUG("Forcing guidance to 0 for chroma model (SD_CHROMA_ENABLE_GUIDANCE env variable to \"ON\" to enable)");
|
|
||||||
guidance = ggml_set_f32(guidance, 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ggml_arange is not working on some backends, precompute it
|
||||||
const char* SD_CHROMA_USE_DIT_MASK = getenv("SD_CHROMA_USE_DIT_MASK");
|
mod_index_arange_vec = arange(0, 344);
|
||||||
if (SD_CHROMA_USE_DIT_MASK != nullptr) {
|
mod_index_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, mod_index_arange_vec.size());
|
||||||
std::string sd_chroma_use_DiT_mask_str = SD_CHROMA_USE_DIT_MASK;
|
set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data());
|
||||||
if (sd_chroma_use_DiT_mask_str == "OFF" || sd_chroma_use_DiT_mask_str == "FALSE") {
|
|
||||||
y = NULL;
|
|
||||||
} else if (sd_chroma_use_DiT_mask_str != "ON" && sd_chroma_use_DiT_mask_str != "TRUE") {
|
|
||||||
LOG_WARN("SD_CHROMA_USE_DIT_MASK environment variable has unexpected value. Assuming default (\"ON\"). (Expected \"ON\"/\"TRUE\" or\"OFF\"/\"FALSE\", got \"%s\")", SD_CHROMA_USE_DIT_MASK);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ggml_arrange is not working on some backends, and y isn't used, so let's reuse y to precompute it
|
|
||||||
range = arange(0, 344);
|
|
||||||
precompute_arange = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_F32, range.size());
|
|
||||||
set_backend_tensor_data(precompute_arange, range.data());
|
|
||||||
// y = NULL;
|
|
||||||
}
|
}
|
||||||
y = to_backend(y);
|
y = to_backend(y);
|
||||||
|
|
||||||
|
@ -1189,7 +1170,7 @@ namespace Flux {
|
||||||
y,
|
y,
|
||||||
guidance,
|
guidance,
|
||||||
pe,
|
pe,
|
||||||
precompute_arange,
|
mod_index_arange,
|
||||||
ref_latents,
|
ref_latents,
|
||||||
skip_layers);
|
skip_layers);
|
||||||
|
|
||||||
|
|
|
@ -128,6 +128,10 @@ struct SDParams {
|
||||||
float slg_scale = 0.f;
|
float slg_scale = 0.f;
|
||||||
float skip_layer_start = 0.01f;
|
float skip_layer_start = 0.01f;
|
||||||
float skip_layer_end = 0.2f;
|
float skip_layer_end = 0.2f;
|
||||||
|
|
||||||
|
bool chroma_use_dit_mask = true;
|
||||||
|
bool chroma_use_t5_mask = false;
|
||||||
|
int chroma_t5_mask_pad = 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
void print_params(SDParams params) {
|
void print_params(SDParams params) {
|
||||||
|
@ -177,6 +181,9 @@ void print_params(SDParams params) {
|
||||||
printf(" batch_count: %d\n", params.batch_count);
|
printf(" batch_count: %d\n", params.batch_count);
|
||||||
printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
|
printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
|
||||||
printf(" upscale_repeats: %d\n", params.upscale_repeats);
|
printf(" upscale_repeats: %d\n", params.upscale_repeats);
|
||||||
|
printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false");
|
||||||
|
printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false");
|
||||||
|
printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad);
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_usage(int argc, const char* argv[]) {
|
void print_usage(int argc, const char* argv[]) {
|
||||||
|
@ -243,6 +250,9 @@ void print_usage(int argc, const char* argv[]) {
|
||||||
printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
|
printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
|
||||||
printf(" --canny apply canny preprocessor (edge detection)\n");
|
printf(" --canny apply canny preprocessor (edge detection)\n");
|
||||||
printf(" --color Colors the logging tags according to level\n");
|
printf(" --color Colors the logging tags according to level\n");
|
||||||
|
printf(" --chroma-disable-dit-mask disable dit mask for chroma\n");
|
||||||
|
printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n");
|
||||||
|
printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n");
|
||||||
printf(" -v, --verbose print extra info\n");
|
printf(" -v, --verbose print extra info\n");
|
||||||
printf(" -ki, --kontext_img [PATH] Reference image for Flux Kontext models (can be used multiple times) \n");
|
printf(" -ki, --kontext_img [PATH] Reference image for Flux Kontext models (can be used multiple times) \n");
|
||||||
}
|
}
|
||||||
|
@ -938,7 +948,10 @@ int main(int argc, const char* argv[]) {
|
||||||
params.clip_on_cpu,
|
params.clip_on_cpu,
|
||||||
params.control_net_cpu,
|
params.control_net_cpu,
|
||||||
params.vae_on_cpu,
|
params.vae_on_cpu,
|
||||||
params.diffusion_flash_attn);
|
params.diffusion_flash_attn,
|
||||||
|
params.chroma_use_dit_mask,
|
||||||
|
params.chroma_use_t5_mask,
|
||||||
|
params.chroma_t5_mask_pad);
|
||||||
|
|
||||||
if (sd_ctx == NULL) {
|
if (sd_ctx == NULL) {
|
||||||
printf("new_sd_ctx_t failed\n");
|
printf("new_sd_ctx_t failed\n");
|
||||||
|
|
|
@ -104,6 +104,10 @@ struct SDParams {
|
||||||
float slg_scale = 0.f;
|
float slg_scale = 0.f;
|
||||||
float skip_layer_start = 0.01f;
|
float skip_layer_start = 0.01f;
|
||||||
float skip_layer_end = 0.2f;
|
float skip_layer_end = 0.2f;
|
||||||
|
|
||||||
|
bool chroma_use_dit_mask = true;
|
||||||
|
bool chroma_use_t5_mask = false;
|
||||||
|
int chroma_t5_mask_pad = 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
//shared
|
//shared
|
||||||
|
@ -272,7 +276,10 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
|
||||||
sd_params->clip_on_cpu,
|
sd_params->clip_on_cpu,
|
||||||
sd_params->control_net_cpu,
|
sd_params->control_net_cpu,
|
||||||
sd_params->vae_on_cpu,
|
sd_params->vae_on_cpu,
|
||||||
sd_params->diffusion_flash_attn);
|
sd_params->diffusion_flash_attn,
|
||||||
|
sd_params->chroma_use_dit_mask,
|
||||||
|
sd_params->chroma_use_t5_mask,
|
||||||
|
sd_params->chroma_t5_mask_pad);
|
||||||
|
|
||||||
if (sd_ctx == NULL) {
|
if (sd_ctx == NULL) {
|
||||||
printf("\nError: KCPP SD Failed to create context!\nIf using Flux/SD3.5, make sure you have ALL files required (e.g. VAE, T5, Clip...) or baked in!\n");
|
printf("\nError: KCPP SD Failed to create context!\nIf using Flux/SD3.5, make sure you have ALL files required (e.g. VAE, T5, Clip...) or baked in!\n");
|
||||||
|
|
|
@ -159,7 +159,10 @@ public:
|
||||||
bool clip_on_cpu,
|
bool clip_on_cpu,
|
||||||
bool control_net_cpu,
|
bool control_net_cpu,
|
||||||
bool vae_on_cpu,
|
bool vae_on_cpu,
|
||||||
bool diffusion_flash_attn) {
|
bool diffusion_flash_attn,
|
||||||
|
bool chroma_use_dit_mask,
|
||||||
|
bool chroma_use_t5_mask,
|
||||||
|
int chroma_t5_mask_pad) {
|
||||||
use_tiny_autoencoder = taesd_path.size() > 0;
|
use_tiny_autoencoder = taesd_path.size() > 0;
|
||||||
std::string taesd_path_fixed = taesd_path;
|
std::string taesd_path_fixed = taesd_path;
|
||||||
is_loaded_chroma = false;
|
is_loaded_chroma = false;
|
||||||
|
@ -391,11 +394,11 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (is_chroma) {
|
if (is_chroma) {
|
||||||
cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
|
cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, chroma_use_t5_mask, chroma_t5_mask_pad);
|
||||||
} else {
|
} else {
|
||||||
cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
|
cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
|
||||||
}
|
}
|
||||||
diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
|
diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn, chroma_use_dit_mask);
|
||||||
} else {
|
} else {
|
||||||
if (id_embeddings_path.find("v2") != std::string::npos) {
|
if (id_embeddings_path.find("v2") != std::string::npos) {
|
||||||
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
|
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
|
||||||
|
@ -1337,7 +1340,10 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
|
||||||
bool keep_clip_on_cpu,
|
bool keep_clip_on_cpu,
|
||||||
bool keep_control_net_cpu,
|
bool keep_control_net_cpu,
|
||||||
bool keep_vae_on_cpu,
|
bool keep_vae_on_cpu,
|
||||||
bool diffusion_flash_attn) {
|
bool diffusion_flash_attn,
|
||||||
|
bool chroma_use_dit_mask,
|
||||||
|
bool chroma_use_t5_mask,
|
||||||
|
int chroma_t5_mask_pad) {
|
||||||
sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
|
sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
|
||||||
if (sd_ctx == NULL) {
|
if (sd_ctx == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -1379,7 +1385,10 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
|
||||||
keep_clip_on_cpu,
|
keep_clip_on_cpu,
|
||||||
keep_control_net_cpu,
|
keep_control_net_cpu,
|
||||||
keep_vae_on_cpu,
|
keep_vae_on_cpu,
|
||||||
diffusion_flash_attn)) {
|
diffusion_flash_attn,
|
||||||
|
chroma_use_dit_mask,
|
||||||
|
chroma_use_t5_mask,
|
||||||
|
chroma_t5_mask_pad)) {
|
||||||
delete sd_ctx->sd;
|
delete sd_ctx->sd;
|
||||||
sd_ctx->sd = NULL;
|
sd_ctx->sd = NULL;
|
||||||
free(sd_ctx);
|
free(sd_ctx);
|
||||||
|
@ -2231,5 +2240,133 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
|
||||||
|
|
||||||
LOG_INFO("img2vid completed in %.2fs", (t3 - t0) * 1.0f / 1000);
|
LOG_INFO("img2vid completed in %.2fs", (t3 - t0) * 1.0f / 1000);
|
||||||
|
|
||||||
|
return result_images;
|
||||||
|
}
|
||||||
|
|
||||||
|
sd_image_t* edit(sd_ctx_t* sd_ctx,
|
||||||
|
sd_image_t* ref_images,
|
||||||
|
int ref_images_count,
|
||||||
|
const char* prompt_c_str,
|
||||||
|
const char* negative_prompt_c_str,
|
||||||
|
int clip_skip,
|
||||||
|
float cfg_scale,
|
||||||
|
float guidance,
|
||||||
|
float eta,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
sample_method_t sample_method,
|
||||||
|
int sample_steps,
|
||||||
|
float strength,
|
||||||
|
int64_t seed,
|
||||||
|
int batch_count,
|
||||||
|
const sd_image_t* control_cond,
|
||||||
|
float control_strength,
|
||||||
|
float style_ratio,
|
||||||
|
bool normalize_input,
|
||||||
|
int* skip_layers = NULL,
|
||||||
|
size_t skip_layers_count = 0,
|
||||||
|
float slg_scale = 0,
|
||||||
|
float skip_layer_start = 0.01,
|
||||||
|
float skip_layer_end = 0.2) {
|
||||||
|
std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
|
||||||
|
LOG_DEBUG("edit %dx%d", width, height);
|
||||||
|
if (sd_ctx == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (ref_images_count <= 0) {
|
||||||
|
LOG_ERROR("ref images count should > 0");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_init_params params;
|
||||||
|
params.mem_size = static_cast<size_t>(30 * 1024 * 1024); // 30 MB
|
||||||
|
params.mem_size += width * height * 3 * sizeof(float) * 3 * ref_images_count;
|
||||||
|
params.mem_size *= batch_count;
|
||||||
|
params.mem_buffer = NULL;
|
||||||
|
params.no_alloc = false;
|
||||||
|
// LOG_DEBUG("mem_size %u ", params.mem_size);
|
||||||
|
|
||||||
|
struct ggml_context* work_ctx = ggml_init(params);
|
||||||
|
if (!work_ctx) {
|
||||||
|
LOG_ERROR("ggml_init() failed");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (seed < 0) {
|
||||||
|
srand((int)time(NULL));
|
||||||
|
seed = rand();
|
||||||
|
}
|
||||||
|
sd_ctx->sd->rng->manual_seed(seed);
|
||||||
|
|
||||||
|
int C = 4;
|
||||||
|
if (sd_version_is_sd3(sd_ctx->sd->version)) {
|
||||||
|
C = 16;
|
||||||
|
} else if (sd_version_is_flux(sd_ctx->sd->version)) {
|
||||||
|
C = 16;
|
||||||
|
}
|
||||||
|
int W = width / 8;
|
||||||
|
int H = height / 8;
|
||||||
|
ggml_tensor* init_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
|
||||||
|
if (sd_version_is_sd3(sd_ctx->sd->version)) {
|
||||||
|
ggml_set_f32(init_latent, 0.0609f);
|
||||||
|
} else if (sd_version_is_flux(sd_ctx->sd->version)) {
|
||||||
|
ggml_set_f32(init_latent, 0.1159f);
|
||||||
|
} else {
|
||||||
|
ggml_set_f32(init_latent, 0.f);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t t0 = ggml_time_ms();
|
||||||
|
|
||||||
|
std::vector<struct ggml_tensor*> ref_latents;
|
||||||
|
for (int i = 0; i < ref_images_count; i++) {
|
||||||
|
ggml_tensor* img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, ref_images[i].width, ref_images[i].height, 3, 1);
|
||||||
|
sd_image_to_tensor(ref_images[i].data, img);
|
||||||
|
|
||||||
|
ggml_tensor* latent = NULL;
|
||||||
|
if (!sd_ctx->sd->use_tiny_autoencoder) {
|
||||||
|
ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, img);
|
||||||
|
latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
|
||||||
|
} else {
|
||||||
|
latent = sd_ctx->sd->encode_first_stage(work_ctx, img);
|
||||||
|
}
|
||||||
|
ref_latents.push_back(latent);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t t1 = ggml_time_ms();
|
||||||
|
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
|
||||||
|
|
||||||
|
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
|
||||||
|
|
||||||
|
sd_image_t* result_images = generate_image(sd_ctx,
|
||||||
|
work_ctx,
|
||||||
|
init_latent,
|
||||||
|
prompt_c_str,
|
||||||
|
negative_prompt_c_str,
|
||||||
|
clip_skip,
|
||||||
|
cfg_scale,
|
||||||
|
guidance,
|
||||||
|
eta,
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
sample_method,
|
||||||
|
sigmas,
|
||||||
|
seed,
|
||||||
|
batch_count,
|
||||||
|
control_cond,
|
||||||
|
control_strength,
|
||||||
|
style_ratio,
|
||||||
|
normalize_input,
|
||||||
|
"",
|
||||||
|
ref_latents,
|
||||||
|
skip_layers_vec,
|
||||||
|
slg_scale,
|
||||||
|
skip_layer_start,
|
||||||
|
skip_layer_end,
|
||||||
|
NULL);
|
||||||
|
|
||||||
|
size_t t2 = ggml_time_ms();
|
||||||
|
|
||||||
|
LOG_INFO("edit completed in %.2fs", (t2 - t0) * 1.0f / 1000);
|
||||||
|
|
||||||
return result_images;
|
return result_images;
|
||||||
}
|
}
|
|
@ -154,7 +154,10 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
|
||||||
bool keep_clip_on_cpu,
|
bool keep_clip_on_cpu,
|
||||||
bool keep_control_net_cpu,
|
bool keep_control_net_cpu,
|
||||||
bool keep_vae_on_cpu,
|
bool keep_vae_on_cpu,
|
||||||
bool diffusion_flash_attn);
|
bool diffusion_flash_attn,
|
||||||
|
bool chroma_use_dit_mask,
|
||||||
|
bool chroma_use_t5_mask,
|
||||||
|
int chroma_t5_mask_pad);
|
||||||
|
|
||||||
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
|
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
|
||||||
|
|
||||||
|
@ -230,6 +233,32 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
|
||||||
float strength,
|
float strength,
|
||||||
int64_t seed);
|
int64_t seed);
|
||||||
|
|
||||||
|
SD_API sd_image_t* edit(sd_ctx_t* sd_ctx,
|
||||||
|
sd_image_t* ref_images,
|
||||||
|
int ref_images_count,
|
||||||
|
const char* prompt,
|
||||||
|
const char* negative_prompt,
|
||||||
|
int clip_skip,
|
||||||
|
float cfg_scale,
|
||||||
|
float guidance,
|
||||||
|
float eta,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
enum sample_method_t sample_method,
|
||||||
|
int sample_steps,
|
||||||
|
float strength,
|
||||||
|
int64_t seed,
|
||||||
|
int batch_count,
|
||||||
|
const sd_image_t* control_cond,
|
||||||
|
float control_strength,
|
||||||
|
float style_strength,
|
||||||
|
bool normalize_input,
|
||||||
|
int* skip_layers,
|
||||||
|
size_t skip_layers_count,
|
||||||
|
float slg_scale,
|
||||||
|
float skip_layer_start,
|
||||||
|
float skip_layer_end);
|
||||||
|
|
||||||
typedef struct upscaler_ctx_t upscaler_ctx_t;
|
typedef struct upscaler_ctx_t upscaler_ctx_t;
|
||||||
|
|
||||||
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
|
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
|
||||||
|
|
|
@ -795,9 +795,9 @@ struct T5Runner : public GGMLRunner {
|
||||||
|
|
||||||
void compute(const int n_threads,
|
void compute(const int n_threads,
|
||||||
struct ggml_tensor* input_ids,
|
struct ggml_tensor* input_ids,
|
||||||
|
struct ggml_tensor* attention_mask,
|
||||||
ggml_tensor** output,
|
ggml_tensor** output,
|
||||||
ggml_context* output_ctx = NULL,
|
ggml_context* output_ctx = NULL) {
|
||||||
struct ggml_tensor* attention_mask = NULL) {
|
|
||||||
auto get_graph = [&]() -> struct ggml_cgraph* {
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
return build_graph(input_ids, attention_mask);
|
return build_graph(input_ids, attention_mask);
|
||||||
};
|
};
|
||||||
|
@ -966,7 +966,7 @@ struct T5Embedder {
|
||||||
struct ggml_tensor* out = NULL;
|
struct ggml_tensor* out = NULL;
|
||||||
|
|
||||||
int t0 = ggml_time_ms();
|
int t0 = ggml_time_ms();
|
||||||
model.compute(8, input_ids, &out, work_ctx);
|
model.compute(8, input_ids, NULL, &out, work_ctx);
|
||||||
int t1 = ggml_time_ms();
|
int t1 = ggml_time_ms();
|
||||||
|
|
||||||
print_ggml_tensor(out);
|
print_ggml_tensor(out);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue