mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-07 17:22:04 +00:00
sd: sync to master-520-d950627 (#2006)
* sd: sync to master-509-4cdfff5
* sd: Anima support
* sd: sync to master-514-5792c66
* sd: additional workaround for Anima .safetensors model
* sd: sync to master-517-ba35dd7
* sd: sync to master-520-d950627
This commit is contained in:
parent
ebe44e7819
commit
9158bd8b4d
21 changed files with 1786 additions and 309 deletions
686
otherarch/sdcpp/anima.hpp
Normal file
686
otherarch/sdcpp/anima.hpp
Normal file
|
|
@ -0,0 +1,686 @@
|
|||
#ifndef __ANIMA_HPP__
|
||||
#define __ANIMA_HPP__
|
||||
|
||||
#include <cmath>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "common_block.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "rope.hpp"
|
||||
|
||||
namespace Anima {
|
||||
constexpr int ANIMA_GRAPH_SIZE = 65536;
|
||||
|
||||
// Element-wise multiplies `x` by `gate`.
// `gate` is first reshaped from (ne0, ne1) to (ne0, 1, ne1), inserting a
// singleton middle dimension before the multiply.
__STATIC_INLINE__ struct ggml_tensor* apply_gate(struct ggml_context* ctx,
                                                 struct ggml_tensor* x,
                                                 struct ggml_tensor* gate) {
    struct ggml_tensor* gate_3d = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]);  // [N, 1, C]
    return ggml_mul(ctx, x, gate_3d);
}
|
||||
|
||||
struct XEmbedder : public GGMLBlock {
|
||||
public:
|
||||
XEmbedder(int64_t in_dim, int64_t out_dim) {
|
||||
blocks["proj.1"] = std::make_shared<Linear>(in_dim, out_dim, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
|
||||
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj.1"]);
|
||||
return proj->forward(ctx, x);
|
||||
}
|
||||
};
|
||||
|
||||
struct TimestepEmbedder : public GGMLBlock {
|
||||
public:
|
||||
TimestepEmbedder(int64_t in_dim, int64_t out_dim) {
|
||||
blocks["1.linear_1"] = std::make_shared<Linear>(in_dim, in_dim, false);
|
||||
blocks["1.linear_2"] = std::make_shared<Linear>(in_dim, out_dim, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
|
||||
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1.linear_1"]);
|
||||
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["1.linear_2"]);
|
||||
|
||||
x = linear_1->forward(ctx, x);
|
||||
x = ggml_silu_inplace(ctx->ggml_ctx, x);
|
||||
x = linear_2->forward(ctx, x);
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct AdaLayerNormZero : public GGMLBlock {
|
||||
protected:
|
||||
int64_t in_features;
|
||||
|
||||
public:
|
||||
AdaLayerNormZero(int64_t in_features, int64_t hidden_features = 256)
|
||||
: in_features(in_features) {
|
||||
blocks["norm"] = std::make_shared<LayerNorm>(in_features, 1e-6f, false, false);
|
||||
blocks["1"] = std::make_shared<Linear>(in_features, hidden_features, false);
|
||||
blocks["2"] = std::make_shared<Linear>(hidden_features, 3 * in_features, false);
|
||||
}
|
||||
|
||||
std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* hidden_states,
|
||||
struct ggml_tensor* embedded_timestep,
|
||||
struct ggml_tensor* temb = nullptr) {
|
||||
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
|
||||
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
|
||||
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
|
||||
|
||||
auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
|
||||
emb = linear_1->forward(ctx, emb);
|
||||
emb = linear_2->forward(ctx, emb); // [N, 3*C]
|
||||
|
||||
if (temb != nullptr) {
|
||||
emb = ggml_add(ctx->ggml_ctx, emb, temb);
|
||||
}
|
||||
|
||||
auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 3, 0);
|
||||
auto shift = emb_chunks[0];
|
||||
auto scale = emb_chunks[1];
|
||||
auto gate = emb_chunks[2];
|
||||
|
||||
auto x = norm->forward(ctx, hidden_states);
|
||||
x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
|
||||
|
||||
return {x, gate};
|
||||
}
|
||||
};
|
||||
|
||||
struct AdaLayerNorm : public GGMLBlock {
|
||||
protected:
|
||||
int64_t embedding_dim;
|
||||
|
||||
public:
|
||||
AdaLayerNorm(int64_t in_features, int64_t hidden_features = 256)
|
||||
: embedding_dim(in_features) {
|
||||
blocks["norm"] = std::make_shared<LayerNorm>(in_features, 1e-6f, false, false);
|
||||
blocks["1"] = std::make_shared<Linear>(in_features, hidden_features, false);
|
||||
blocks["2"] = std::make_shared<Linear>(hidden_features, 2 * in_features, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* hidden_states,
|
||||
struct ggml_tensor* embedded_timestep,
|
||||
struct ggml_tensor* temb = nullptr) {
|
||||
auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
|
||||
auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
|
||||
auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
|
||||
|
||||
auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
|
||||
emb = linear_1->forward(ctx, emb);
|
||||
emb = linear_2->forward(ctx, emb); // [N, 2*C]
|
||||
|
||||
if (temb != nullptr) {
|
||||
auto temb_2c = ggml_view_2d(ctx->ggml_ctx, temb, 2 * embedding_dim, temb->ne[1], temb->nb[1], 0);
|
||||
emb = ggml_add(ctx->ggml_ctx, emb, temb_2c);
|
||||
}
|
||||
|
||||
auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
|
||||
auto shift = emb_chunks[0];
|
||||
auto scale = emb_chunks[1];
|
||||
|
||||
auto x = norm->forward(ctx, hidden_states);
|
||||
x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct AnimaAttention : public GGMLBlock {
|
||||
protected:
|
||||
int64_t num_heads;
|
||||
int64_t head_dim;
|
||||
std::string out_proj_name;
|
||||
|
||||
public:
|
||||
AnimaAttention(int64_t query_dim,
|
||||
int64_t context_dim,
|
||||
int64_t num_heads,
|
||||
int64_t head_dim,
|
||||
const std::string& out_proj_name = "output_proj")
|
||||
: num_heads(num_heads), head_dim(head_dim), out_proj_name(out_proj_name) {
|
||||
int64_t inner_dim = num_heads * head_dim;
|
||||
|
||||
blocks["q_proj"] = std::make_shared<Linear>(query_dim, inner_dim, false);
|
||||
blocks["k_proj"] = std::make_shared<Linear>(context_dim, inner_dim, false);
|
||||
blocks["v_proj"] = std::make_shared<Linear>(context_dim, inner_dim, false);
|
||||
blocks["q_norm"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
|
||||
blocks["k_norm"] = std::make_shared<RMSNorm>(head_dim, 1e-6f);
|
||||
blocks[this->out_proj_name] = std::make_shared<Linear>(inner_dim, query_dim, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* hidden_states,
|
||||
struct ggml_tensor* encoder_hidden_states = nullptr,
|
||||
struct ggml_tensor* pe_q = nullptr,
|
||||
struct ggml_tensor* pe_k = nullptr) {
|
||||
if (encoder_hidden_states == nullptr) {
|
||||
encoder_hidden_states = hidden_states;
|
||||
}
|
||||
|
||||
auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q_proj"]);
|
||||
auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k_proj"]);
|
||||
auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v_proj"]);
|
||||
auto q_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
|
||||
auto k_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
|
||||
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks[out_proj_name]);
|
||||
|
||||
auto q = q_proj->forward(ctx, hidden_states);
|
||||
auto k = k_proj->forward(ctx, encoder_hidden_states);
|
||||
auto v = v_proj->forward(ctx, encoder_hidden_states);
|
||||
|
||||
int64_t N = q->ne[2];
|
||||
int64_t L_q = q->ne[1];
|
||||
int64_t L_k = k->ne[1];
|
||||
|
||||
auto q4 = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, L_q, N); // [N, L_q, H, D]
|
||||
auto k4 = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
|
||||
auto v4 = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, L_k, N); // [N, L_k, H, D]
|
||||
|
||||
q4 = q_norm->forward(ctx, q4);
|
||||
k4 = k_norm->forward(ctx, k4);
|
||||
|
||||
struct ggml_tensor* attn_out = nullptr;
|
||||
if (pe_q != nullptr || pe_k != nullptr) {
|
||||
if (pe_q == nullptr) {
|
||||
pe_q = pe_k;
|
||||
}
|
||||
if (pe_k == nullptr) {
|
||||
pe_k = pe_q;
|
||||
}
|
||||
auto q_rope = Rope::apply_rope(ctx->ggml_ctx, q4, pe_q, false);
|
||||
auto k_rope = Rope::apply_rope(ctx->ggml_ctx, k4, pe_k, false);
|
||||
attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
|
||||
ctx->backend,
|
||||
q_rope,
|
||||
k_rope,
|
||||
v4,
|
||||
num_heads,
|
||||
nullptr,
|
||||
true,
|
||||
ctx->flash_attn_enabled);
|
||||
} else {
|
||||
auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
|
||||
auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
|
||||
attn_out = ggml_ext_attention_ext(ctx->ggml_ctx,
|
||||
ctx->backend,
|
||||
q_flat,
|
||||
k_flat,
|
||||
v,
|
||||
num_heads,
|
||||
nullptr,
|
||||
false,
|
||||
ctx->flash_attn_enabled);
|
||||
}
|
||||
|
||||
return out_proj->forward(ctx, attn_out);
|
||||
}
|
||||
};
|
||||
|
||||
struct AnimaMLP : public GGMLBlock {
|
||||
public:
|
||||
AnimaMLP(int64_t dim, int64_t hidden_dim) {
|
||||
blocks["layer1"] = std::make_shared<Linear>(dim, hidden_dim, false);
|
||||
blocks["layer2"] = std::make_shared<Linear>(hidden_dim, dim, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
|
||||
auto layer1 = std::dynamic_pointer_cast<Linear>(blocks["layer1"]);
|
||||
auto layer2 = std::dynamic_pointer_cast<Linear>(blocks["layer2"]);
|
||||
|
||||
x = layer1->forward(ctx, x);
|
||||
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
|
||||
x = layer2->forward(ctx, x);
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct AdapterMLP : public GGMLBlock {
|
||||
public:
|
||||
AdapterMLP(int64_t dim, int64_t hidden_dim) {
|
||||
blocks["0"] = std::make_shared<Linear>(dim, hidden_dim, true);
|
||||
blocks["2"] = std::make_shared<Linear>(hidden_dim, dim, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
|
||||
auto layer0 = std::dynamic_pointer_cast<Linear>(blocks["0"]);
|
||||
auto layer2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
|
||||
|
||||
x = layer0->forward(ctx, x);
|
||||
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
|
||||
x = layer2->forward(ctx, x);
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct LLMAdapterBlock : public GGMLBlock {
|
||||
public:
|
||||
LLMAdapterBlock(int64_t model_dim = 1024, int64_t source_dim = 1024, int64_t num_heads = 16, int64_t head_dim = 64) {
|
||||
blocks["norm_self_attn"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
|
||||
blocks["self_attn"] = std::make_shared<AnimaAttention>(model_dim, model_dim, num_heads, head_dim, "o_proj");
|
||||
blocks["norm_cross_attn"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
|
||||
blocks["cross_attn"] = std::make_shared<AnimaAttention>(model_dim, source_dim, num_heads, head_dim, "o_proj");
|
||||
blocks["norm_mlp"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
|
||||
blocks["mlp"] = std::make_shared<AdapterMLP>(model_dim, model_dim * 4);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* context,
|
||||
struct ggml_tensor* target_pe,
|
||||
struct ggml_tensor* context_pe) {
|
||||
auto norm_self_attn = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_self_attn"]);
|
||||
auto self_attn = std::dynamic_pointer_cast<AnimaAttention>(blocks["self_attn"]);
|
||||
auto norm_cross_attn = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_cross_attn"]);
|
||||
auto cross_attn = std::dynamic_pointer_cast<AnimaAttention>(blocks["cross_attn"]);
|
||||
auto norm_mlp = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_mlp"]);
|
||||
auto mlp = std::dynamic_pointer_cast<AdapterMLP>(blocks["mlp"]);
|
||||
|
||||
auto h = norm_self_attn->forward(ctx, x);
|
||||
h = self_attn->forward(ctx, h, nullptr, target_pe, target_pe);
|
||||
x = ggml_add(ctx->ggml_ctx, x, h);
|
||||
|
||||
h = norm_cross_attn->forward(ctx, x);
|
||||
h = cross_attn->forward(ctx, h, context, target_pe, context_pe);
|
||||
x = ggml_add(ctx->ggml_ctx, x, h);
|
||||
|
||||
h = norm_mlp->forward(ctx, x);
|
||||
h = mlp->forward(ctx, h);
|
||||
x = ggml_add(ctx->ggml_ctx, x, h);
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct LLMAdapter : public GGMLBlock {
|
||||
protected:
|
||||
int num_layers;
|
||||
|
||||
public:
|
||||
LLMAdapter(int64_t source_dim = 1024,
|
||||
int64_t target_dim = 1024,
|
||||
int64_t model_dim = 1024,
|
||||
int num_layers = 6,
|
||||
int num_heads = 16)
|
||||
: num_layers(num_layers) {
|
||||
int64_t head_dim = model_dim / num_heads;
|
||||
|
||||
blocks["embed"] = std::make_shared<Embedding>(32128, target_dim);
|
||||
for (int i = 0; i < num_layers; i++) {
|
||||
blocks["blocks." + std::to_string(i)] =
|
||||
std::make_shared<LLMAdapterBlock>(model_dim, source_dim, num_heads, head_dim);
|
||||
}
|
||||
blocks["out_proj"] = std::make_shared<Linear>(model_dim, target_dim, true);
|
||||
blocks["norm"] = std::make_shared<RMSNorm>(target_dim, 1e-6f);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* source_hidden_states,
|
||||
struct ggml_tensor* target_input_ids,
|
||||
struct ggml_tensor* target_pe,
|
||||
struct ggml_tensor* source_pe) {
|
||||
GGML_ASSERT(target_input_ids != nullptr);
|
||||
if (ggml_n_dims(target_input_ids) == 1) {
|
||||
target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1);
|
||||
}
|
||||
|
||||
auto embed = std::dynamic_pointer_cast<Embedding>(blocks["embed"]);
|
||||
auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out_proj"]);
|
||||
auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
|
||||
|
||||
auto x = embed->forward(ctx, target_input_ids); // [N, target_len, target_dim]
|
||||
|
||||
for (int i = 0; i < num_layers; i++) {
|
||||
auto block = std::dynamic_pointer_cast<LLMAdapterBlock>(blocks["blocks." + std::to_string(i)]);
|
||||
x = block->forward(ctx, x, source_hidden_states, target_pe, source_pe);
|
||||
}
|
||||
|
||||
x = out_proj->forward(ctx, x);
|
||||
x = norm->forward(ctx, x);
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct TransformerBlock : public GGMLBlock {
|
||||
public:
|
||||
TransformerBlock(int64_t hidden_size,
|
||||
int64_t text_embed_dim,
|
||||
int64_t num_heads,
|
||||
int64_t head_dim,
|
||||
int64_t mlp_ratio = 4,
|
||||
int64_t adaln_lora_dim = 256) {
|
||||
blocks["adaln_modulation_self_attn"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
|
||||
blocks["self_attn"] = std::make_shared<AnimaAttention>(hidden_size, hidden_size, num_heads, head_dim);
|
||||
blocks["adaln_modulation_cross_attn"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
|
||||
blocks["cross_attn"] = std::make_shared<AnimaAttention>(hidden_size, text_embed_dim, num_heads, head_dim);
|
||||
blocks["adaln_modulation_mlp"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
|
||||
blocks["mlp"] = std::make_shared<AnimaMLP>(hidden_size, hidden_size * mlp_ratio);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* hidden_states,
|
||||
struct ggml_tensor* encoder_hidden_states,
|
||||
struct ggml_tensor* embedded_timestep,
|
||||
struct ggml_tensor* temb,
|
||||
struct ggml_tensor* image_pe) {
|
||||
auto norm1 = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_self_attn"]);
|
||||
auto attn1 = std::dynamic_pointer_cast<AnimaAttention>(blocks["self_attn"]);
|
||||
auto norm2 = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_cross_attn"]);
|
||||
auto attn2 = std::dynamic_pointer_cast<AnimaAttention>(blocks["cross_attn"]);
|
||||
auto norm3 = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_mlp"]);
|
||||
auto mlp = std::dynamic_pointer_cast<AnimaMLP>(blocks["mlp"]);
|
||||
|
||||
auto [normed1, gate1] = norm1->forward(ctx, hidden_states, embedded_timestep, temb);
|
||||
auto h = attn1->forward(ctx, normed1, nullptr, image_pe, image_pe);
|
||||
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate1));
|
||||
|
||||
auto [normed2, gate2] = norm2->forward(ctx, hidden_states, embedded_timestep, temb);
|
||||
h = attn2->forward(ctx, normed2, encoder_hidden_states, nullptr, nullptr);
|
||||
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate2));
|
||||
|
||||
auto [normed3, gate3] = norm3->forward(ctx, hidden_states, embedded_timestep, temb);
|
||||
h = mlp->forward(ctx, normed3);
|
||||
hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate3));
|
||||
|
||||
return hidden_states;
|
||||
}
|
||||
};
|
||||
|
||||
struct FinalLayer : public GGMLBlock {
|
||||
protected:
|
||||
int64_t hidden_size;
|
||||
int64_t patch_size;
|
||||
int64_t out_channels;
|
||||
|
||||
public:
|
||||
FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels)
|
||||
: hidden_size(hidden_size), patch_size(patch_size), out_channels(out_channels) {
|
||||
blocks["adaln_modulation"] = std::make_shared<AdaLayerNorm>(hidden_size, 256);
|
||||
blocks["linear"] = std::make_shared<Linear>(hidden_size, patch_size * patch_size * out_channels, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* hidden_states,
|
||||
struct ggml_tensor* embedded_timestep,
|
||||
struct ggml_tensor* temb) {
|
||||
auto adaln = std::dynamic_pointer_cast<AdaLayerNorm>(blocks["adaln_modulation"]);
|
||||
auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
|
||||
|
||||
hidden_states = adaln->forward(ctx, hidden_states, embedded_timestep, temb);
|
||||
hidden_states = linear->forward(ctx, hidden_states);
|
||||
return hidden_states;
|
||||
}
|
||||
};
|
||||
|
||||
struct AnimaNet : public GGMLBlock {
|
||||
public:
|
||||
int64_t in_channels = 16;
|
||||
int64_t out_channels = 16;
|
||||
int64_t hidden_size = 2048;
|
||||
int64_t text_embed_dim = 1024;
|
||||
int64_t num_heads = 16;
|
||||
int64_t head_dim = 128;
|
||||
int patch_size = 2;
|
||||
int64_t num_layers = 28;
|
||||
std::vector<int> axes_dim = {44, 42, 42};
|
||||
int theta = 10000;
|
||||
|
||||
public:
|
||||
AnimaNet() = default;
|
||||
explicit AnimaNet(int64_t num_layers)
|
||||
: num_layers(num_layers) {
|
||||
blocks["x_embedder"] = std::make_shared<XEmbedder>((in_channels + 1) * patch_size * patch_size, hidden_size);
|
||||
blocks["t_embedder"] = std::make_shared<TimestepEmbedder>(hidden_size, hidden_size * 3);
|
||||
blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
|
||||
for (int i = 0; i < num_layers; i++) {
|
||||
blocks["blocks." + std::to_string(i)] = std::make_shared<TransformerBlock>(hidden_size,
|
||||
text_embed_dim,
|
||||
num_heads,
|
||||
head_dim);
|
||||
}
|
||||
blocks["final_layer"] = std::make_shared<FinalLayer>(hidden_size, patch_size, out_channels);
|
||||
blocks["llm_adapter"] = std::make_shared<LLMAdapter>(1024, 1024, 1024, 6, 16);
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* timestep,
|
||||
struct ggml_tensor* encoder_hidden_states,
|
||||
struct ggml_tensor* image_pe,
|
||||
struct ggml_tensor* t5_ids = nullptr,
|
||||
struct ggml_tensor* t5_weights = nullptr,
|
||||
struct ggml_tensor* adapter_q_pe = nullptr,
|
||||
struct ggml_tensor* adapter_k_pe = nullptr) {
|
||||
GGML_ASSERT(x->ne[3] == 1);
|
||||
|
||||
auto x_embedder = std::dynamic_pointer_cast<XEmbedder>(blocks["x_embedder"]);
|
||||
auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
|
||||
auto t_embedding_norm = std::dynamic_pointer_cast<RMSNorm>(blocks["t_embedding_norm"]);
|
||||
auto final_layer = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
|
||||
auto llm_adapter = std::dynamic_pointer_cast<LLMAdapter>(blocks["llm_adapter"]);
|
||||
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
|
||||
auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
|
||||
x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W]
|
||||
|
||||
x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw]
|
||||
|
||||
x = x_embedder->forward(ctx, x);
|
||||
|
||||
auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(hidden_size));
|
||||
auto temb = t_embedder->forward(ctx, timestep_proj);
|
||||
auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj);
|
||||
|
||||
if (t5_ids != nullptr) {
|
||||
auto adapted_context = llm_adapter->forward(ctx, encoder_hidden_states, t5_ids, adapter_q_pe, adapter_k_pe);
|
||||
if (t5_weights != nullptr) {
|
||||
auto w = t5_weights;
|
||||
if (ggml_n_dims(w) == 1) {
|
||||
w = ggml_reshape_3d(ctx->ggml_ctx, w, 1, w->ne[0], 1);
|
||||
}
|
||||
w = ggml_repeat_4d(ctx->ggml_ctx, w, adapted_context->ne[0], adapted_context->ne[1], adapted_context->ne[2], 1);
|
||||
adapted_context = ggml_mul(ctx->ggml_ctx, adapted_context, w);
|
||||
}
|
||||
if (adapted_context->ne[1] < 512) {
|
||||
auto pad_ctx = ggml_ext_zeros(ctx->ggml_ctx,
|
||||
adapted_context->ne[0],
|
||||
512 - adapted_context->ne[1],
|
||||
adapted_context->ne[2],
|
||||
1);
|
||||
adapted_context = ggml_concat(ctx->ggml_ctx, adapted_context, pad_ctx, 1);
|
||||
} else if (adapted_context->ne[1] > 512) {
|
||||
adapted_context = ggml_ext_slice(ctx->ggml_ctx, adapted_context, 1, 0, 512);
|
||||
}
|
||||
encoder_hidden_states = adapted_context;
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_layers; i++) {
|
||||
auto block = std::dynamic_pointer_cast<TransformerBlock>(blocks["blocks." + std::to_string(i)]);
|
||||
x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
|
||||
}
|
||||
|
||||
x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C]
|
||||
|
||||
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false); // [N, C, H, W]
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
struct AnimaRunner : public GGMLRunner {
|
||||
public:
|
||||
std::vector<float> image_pe_vec;
|
||||
std::vector<float> adapter_q_pe_vec;
|
||||
std::vector<float> adapter_k_pe_vec;
|
||||
AnimaNet net;
|
||||
|
||||
AnimaRunner(ggml_backend_t backend,
|
||||
bool offload_params_to_cpu,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: GGMLRunner(backend, offload_params_to_cpu) {
|
||||
int64_t num_layers = 0;
|
||||
std::string layer_tag = prefix + ".net.blocks.";
|
||||
for (const auto& kv : tensor_storage_map) {
|
||||
const std::string& tensor_name = kv.first;
|
||||
size_t pos = tensor_name.find(layer_tag);
|
||||
if (pos == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
size_t start = pos + layer_tag.size();
|
||||
size_t end = tensor_name.find('.', start);
|
||||
if (end == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
|
||||
num_layers = std::max(num_layers, layer_id + 1);
|
||||
}
|
||||
if (num_layers <= 0) {
|
||||
num_layers = 28;
|
||||
}
|
||||
LOG_INFO("anima net layers: %" PRId64, num_layers);
|
||||
|
||||
net = AnimaNet(num_layers);
|
||||
net.init(params_ctx, tensor_storage_map, prefix + ".net");
|
||||
}
|
||||
|
||||
// Short identifier of this runner.
std::string get_desc() override {
    return std::string("anima");
}
|
||||
|
||||
// Collects the network's parameter tensors into `tensors`, using
// "<prefix>.net" as the name prefix.
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
    const std::string net_prefix = prefix + ".net";
    net.get_param_tensors(tensors, net_prefix);
}
|
||||
|
||||
static std::vector<float> gen_1d_rope_pe_vec(int64_t seq_len, int dim, float theta = 10000.f) {
|
||||
std::vector<float> pos(seq_len);
|
||||
for (int64_t i = 0; i < seq_len; i++) {
|
||||
pos[i] = static_cast<float>(i);
|
||||
}
|
||||
auto rope_emb = Rope::rope(pos, dim, theta);
|
||||
return Rope::flatten(rope_emb);
|
||||
}
|
||||
|
||||
// Returns the multiplier applied to the rope base theta for one axis:
// extrapolation_ratio ^ (axis_dim / (axis_dim - 2)).
// Returns 1 when the ratio is exactly 1 or when axis_dim <= 2 (which also
// avoids a zero or negative denominator in the exponent).
static float calc_ntk_factor(float extrapolation_ratio, int axis_dim) {
    const bool needs_scaling = (extrapolation_ratio != 1.0f) && (axis_dim > 2);
    if (!needs_scaling) {
        return 1.0f;
    }
    const float exponent = static_cast<float>(axis_dim) / static_cast<float>(axis_dim - 2);
    return std::pow(extrapolation_ratio, exponent);
}
|
||||
|
||||
static std::vector<float> gen_anima_image_pe_vec(int bs,
|
||||
int h,
|
||||
int w,
|
||||
int patch_size,
|
||||
int theta,
|
||||
const std::vector<int>& axes_dim,
|
||||
float h_extrapolation_ratio,
|
||||
float w_extrapolation_ratio,
|
||||
float t_extrapolation_ratio) {
|
||||
static const std::vector<ggml_tensor*> empty_ref_latents;
|
||||
auto ids = Rope::gen_flux_ids(h,
|
||||
w,
|
||||
patch_size,
|
||||
bs,
|
||||
static_cast<int>(axes_dim.size()),
|
||||
0,
|
||||
{},
|
||||
empty_ref_latents,
|
||||
false,
|
||||
1.0f);
|
||||
|
||||
std::vector<float> axis_thetas = {
|
||||
static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
|
||||
static_cast<float>(theta) * calc_ntk_factor(h_extrapolation_ratio, axes_dim[1]),
|
||||
static_cast<float>(theta) * calc_ntk_factor(w_extrapolation_ratio, axes_dim[2]),
|
||||
};
|
||||
return Rope::embed_nd(ids, bs, axis_thetas, axes_dim);
|
||||
}
|
||||
|
||||
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
|
||||
struct ggml_tensor* timesteps,
|
||||
struct ggml_tensor* context,
|
||||
struct ggml_tensor* t5_ids = nullptr,
|
||||
struct ggml_tensor* t5_weights = nullptr) {
|
||||
GGML_ASSERT(x->ne[3] == 1);
|
||||
struct ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
|
||||
|
||||
x = to_backend(x);
|
||||
timesteps = to_backend(timesteps);
|
||||
context = to_backend(context);
|
||||
t5_ids = to_backend(t5_ids);
|
||||
t5_weights = to_backend(t5_weights);
|
||||
|
||||
int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
|
||||
int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
|
||||
int64_t h_pad = x->ne[1] + pad_h;
|
||||
int64_t w_pad = x->ne[0] + pad_w;
|
||||
|
||||
image_pe_vec = gen_anima_image_pe_vec(1,
|
||||
static_cast<int>(h_pad),
|
||||
static_cast<int>(w_pad),
|
||||
static_cast<int>(net.patch_size),
|
||||
net.theta,
|
||||
net.axes_dim,
|
||||
4.0f,
|
||||
4.0f,
|
||||
1.0f);
|
||||
int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2));
|
||||
auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len);
|
||||
set_backend_tensor_data(image_pe, image_pe_vec.data());
|
||||
|
||||
ggml_tensor* adapter_q_pe = nullptr;
|
||||
ggml_tensor* adapter_k_pe = nullptr;
|
||||
if (t5_ids != nullptr) {
|
||||
int64_t target_len = t5_ids->ne[0];
|
||||
int64_t source_len = context->ne[1];
|
||||
|
||||
adapter_q_pe_vec = gen_1d_rope_pe_vec(target_len, 64, 10000.f);
|
||||
adapter_k_pe_vec = gen_1d_rope_pe_vec(source_len, 64, 10000.f);
|
||||
|
||||
int64_t target_pos_len = static_cast<int64_t>(adapter_q_pe_vec.size()) / (2 * 2 * 32);
|
||||
int64_t source_pos_len = static_cast<int64_t>(adapter_k_pe_vec.size()) / (2 * 2 * 32);
|
||||
|
||||
adapter_q_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, target_pos_len);
|
||||
adapter_k_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, source_pos_len);
|
||||
set_backend_tensor_data(adapter_q_pe, adapter_q_pe_vec.data());
|
||||
set_backend_tensor_data(adapter_k_pe, adapter_k_pe_vec.data());
|
||||
}
|
||||
|
||||
auto runner_ctx = get_context();
|
||||
auto out = net.forward(&runner_ctx,
|
||||
x,
|
||||
timesteps,
|
||||
context,
|
||||
image_pe,
|
||||
t5_ids,
|
||||
t5_weights,
|
||||
adapter_q_pe,
|
||||
adapter_k_pe);
|
||||
|
||||
ggml_build_forward_expand(gf, out);
|
||||
return gf;
|
||||
}
|
||||
|
||||
bool compute(int n_threads,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* timesteps,
|
||||
struct ggml_tensor* context,
|
||||
struct ggml_tensor* t5_ids = nullptr,
|
||||
struct ggml_tensor* t5_weights = nullptr,
|
||||
struct ggml_tensor** output = nullptr,
|
||||
struct ggml_context* output_ctx = nullptr) {
|
||||
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||
return build_graph(x, timesteps, context, t5_ids, t5_weights);
|
||||
};
|
||||
return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
|
||||
}
|
||||
};
|
||||
} // namespace Anima
|
||||
|
||||
#endif // __ANIMA_HPP__
|
||||
593
otherarch/sdcpp/common_block.hpp
Normal file
593
otherarch/sdcpp/common_block.hpp
Normal file
|
|
@ -0,0 +1,593 @@
|
|||
#ifndef __COMMON_BLOCK_HPP__
|
||||
#define __COMMON_BLOCK_HPP__
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
|
||||
class DownSampleBlock : public GGMLBlock {
|
||||
protected:
|
||||
int channels;
|
||||
int out_channels;
|
||||
bool vae_downsample;
|
||||
|
||||
public:
|
||||
DownSampleBlock(int channels,
|
||||
int out_channels,
|
||||
bool vae_downsample = false)
|
||||
: channels(channels),
|
||||
out_channels(out_channels),
|
||||
vae_downsample(vae_downsample) {
|
||||
if (vae_downsample) {
|
||||
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
|
||||
} else {
|
||||
blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
|
||||
// x: [N, channels, h, w]
|
||||
if (vae_downsample) {
|
||||
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
||||
|
||||
x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
|
||||
x = conv->forward(ctx, x);
|
||||
} else {
|
||||
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
|
||||
|
||||
x = conv->forward(ctx, x);
|
||||
}
|
||||
return x; // [N, out_channels, h/2, w/2]
|
||||
}
|
||||
};
|
||||
|
||||
class UpSampleBlock : public GGMLBlock {
|
||||
protected:
|
||||
int channels;
|
||||
int out_channels;
|
||||
|
||||
public:
|
||||
UpSampleBlock(int channels,
|
||||
int out_channels)
|
||||
: channels(channels),
|
||||
out_channels(out_channels) {
|
||||
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
|
||||
// x: [N, channels, h, w]
|
||||
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
||||
|
||||
x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
|
||||
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
class ResBlock : public GGMLBlock {
|
||||
protected:
|
||||
// network hparams
|
||||
int64_t channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
|
||||
int64_t emb_channels; // time_embed_dim
|
||||
int64_t out_channels; // mult * model_channels
|
||||
std::pair<int, int> kernel_size;
|
||||
int dims;
|
||||
bool skip_t_emb;
|
||||
bool exchange_temb_dims;
|
||||
|
||||
std::shared_ptr<GGMLBlock> conv_nd(int dims,
|
||||
int64_t in_channels,
|
||||
int64_t out_channels,
|
||||
std::pair<int, int> kernel_size,
|
||||
std::pair<int, int> padding) {
|
||||
GGML_ASSERT(dims == 2 || dims == 3);
|
||||
if (dims == 3) {
|
||||
return std::shared_ptr<GGMLBlock>(new Conv3d(in_channels, out_channels, {kernel_size.first, 1, 1}, {1, 1, 1}, {padding.first, 0, 0}));
|
||||
} else {
|
||||
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
ResBlock(int64_t channels,
|
||||
int64_t emb_channels,
|
||||
int64_t out_channels,
|
||||
std::pair<int, int> kernel_size = {3, 3},
|
||||
int dims = 2,
|
||||
bool exchange_temb_dims = false,
|
||||
bool skip_t_emb = false)
|
||||
: channels(channels),
|
||||
emb_channels(emb_channels),
|
||||
out_channels(out_channels),
|
||||
kernel_size(kernel_size),
|
||||
dims(dims),
|
||||
skip_t_emb(skip_t_emb),
|
||||
exchange_temb_dims(exchange_temb_dims) {
|
||||
std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
|
||||
blocks["in_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
|
||||
// in_layer_1 is nn.SILU()
|
||||
blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
|
||||
|
||||
if (!skip_t_emb) {
|
||||
// emb_layer_0 is nn.SILU()
|
||||
blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
|
||||
}
|
||||
|
||||
blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
|
||||
// out_layer_1 is nn.SILU()
|
||||
// out_layer_2 is nn.Dropout(), skip for inference
|
||||
blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
|
||||
|
||||
if (out_channels != channels) {
|
||||
blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
|
||||
}
|
||||
}
|
||||
|
||||
virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
|
||||
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
|
||||
// [N, c, t, h, w] => [N, c, t, h * w]
|
||||
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
|
||||
// emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
|
||||
auto in_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
|
||||
auto in_layers_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
|
||||
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
|
||||
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
|
||||
|
||||
if (emb == nullptr) {
|
||||
GGML_ASSERT(skip_t_emb);
|
||||
}
|
||||
|
||||
// in_layers
|
||||
auto h = in_layers_0->forward(ctx, x);
|
||||
h = ggml_silu_inplace(ctx->ggml_ctx, h);
|
||||
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||
|
||||
// emb_layers
|
||||
if (!skip_t_emb) {
|
||||
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
|
||||
|
||||
auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
|
||||
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
|
||||
|
||||
if (dims == 2) {
|
||||
emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
|
||||
} else {
|
||||
emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
|
||||
if (exchange_temb_dims) {
|
||||
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
|
||||
emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
|
||||
}
|
||||
}
|
||||
|
||||
h = ggml_add(ctx->ggml_ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||
}
|
||||
|
||||
// out_layers
|
||||
h = out_layers_0->forward(ctx, h);
|
||||
h = ggml_silu_inplace(ctx->ggml_ctx, h);
|
||||
// dropout, skip for inference
|
||||
h = out_layers_3->forward(ctx, h);
|
||||
|
||||
// skip connection
|
||||
if (out_channels != channels) {
|
||||
auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
|
||||
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||
}
|
||||
|
||||
h = ggml_add(ctx->ggml_ctx, h, x);
|
||||
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||
}
|
||||
};
|
||||
|
||||
class GEGLU : public UnaryBlock {
|
||||
protected:
|
||||
int64_t dim_in;
|
||||
int64_t dim_out;
|
||||
|
||||
public:
|
||||
GEGLU(int64_t dim_in, int64_t dim_out)
|
||||
: dim_in(dim_in), dim_out(dim_out) {
|
||||
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
|
||||
// x: [ne3, ne2, ne1, dim_in]
|
||||
// return: [ne3, ne2, ne1, dim_out]
|
||||
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
|
||||
|
||||
x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2]
|
||||
auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false);
|
||||
x = x_vec[0]; // [ne3, ne2, ne1, dim_out]
|
||||
auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out]
|
||||
|
||||
gate = ggml_cont(ctx->ggml_ctx, gate);
|
||||
|
||||
gate = ggml_ext_gelu(ctx->ggml_ctx, gate, true);
|
||||
|
||||
x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out]
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
class GELU : public UnaryBlock {
|
||||
public:
|
||||
GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
|
||||
blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
|
||||
// x: [ne3, ne2, ne1, dim_in]
|
||||
// return: [ne3, ne2, ne1, dim_out]
|
||||
auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
|
||||
|
||||
x = proj->forward(ctx, x);
|
||||
x = ggml_ext_gelu(ctx->ggml_ctx, x, true);
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
class FeedForward : public GGMLBlock {
|
||||
public:
|
||||
enum class Activation {
|
||||
GEGLU,
|
||||
GELU
|
||||
};
|
||||
FeedForward(int64_t dim,
|
||||
int64_t dim_out,
|
||||
int64_t mult = 4,
|
||||
Activation activation = Activation::GEGLU,
|
||||
bool precision_fix = false) {
|
||||
int64_t inner_dim = dim * mult;
|
||||
if (activation == Activation::GELU) {
|
||||
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
|
||||
} else {
|
||||
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
|
||||
}
|
||||
|
||||
// net_1 is nn.Dropout(), skip for inference
|
||||
bool force_prec_f32 = false;
|
||||
float scale = 1.f;
|
||||
if (precision_fix) {
|
||||
scale = 1.f / 128.f;
|
||||
#ifdef SD_USE_VULKAN
|
||||
force_prec_f32 = true;
|
||||
#endif
|
||||
}
|
||||
// The purpose of the scale here is to prevent NaN issues in certain situations.
|
||||
// For example, when using Vulkan without enabling force_prec_f32,
|
||||
// or when using CUDA but the weights are k-quants.
|
||||
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
|
||||
// x: [ne3, ne2, ne1, dim]
|
||||
// return: [ne3, ne2, ne1, dim_out]
|
||||
|
||||
auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
|
||||
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
|
||||
|
||||
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
|
||||
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
class CrossAttention : public GGMLBlock {
|
||||
protected:
|
||||
int64_t query_dim;
|
||||
int64_t context_dim;
|
||||
int64_t n_head;
|
||||
int64_t d_head;
|
||||
|
||||
public:
|
||||
CrossAttention(int64_t query_dim,
|
||||
int64_t context_dim,
|
||||
int64_t n_head,
|
||||
int64_t d_head)
|
||||
: n_head(n_head),
|
||||
d_head(d_head),
|
||||
query_dim(query_dim),
|
||||
context_dim(context_dim) {
|
||||
int64_t inner_dim = d_head * n_head;
|
||||
|
||||
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
|
||||
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||
|
||||
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
|
||||
// to_out_1 is nn.Dropout(), skip for inference
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* context) {
|
||||
// x: [N, n_token, query_dim]
|
||||
// context: [N, n_context, context_dim]
|
||||
// return: [N, n_token, query_dim]
|
||||
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
|
||||
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
|
||||
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
|
||||
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
|
||||
|
||||
int64_t n = x->ne[2];
|
||||
int64_t n_token = x->ne[1];
|
||||
int64_t n_context = context->ne[1];
|
||||
int64_t inner_dim = d_head * n_head;
|
||||
|
||||
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
|
||||
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
|
||||
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
|
||||
|
||||
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
|
||||
|
||||
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
class BasicTransformerBlock : public GGMLBlock {
|
||||
protected:
|
||||
int64_t n_head;
|
||||
int64_t d_head;
|
||||
bool ff_in;
|
||||
|
||||
public:
|
||||
BasicTransformerBlock(int64_t dim,
|
||||
int64_t n_head,
|
||||
int64_t d_head,
|
||||
int64_t context_dim,
|
||||
bool ff_in = false)
|
||||
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
|
||||
// disable_self_attn is always False
|
||||
// disable_temporal_crossattention is always False
|
||||
// switch_temporal_ca_to_sa is always False
|
||||
// inner_dim is always None or equal to dim
|
||||
// gated_ff is always True
|
||||
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
|
||||
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
|
||||
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
|
||||
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||
blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||
|
||||
if (ff_in) {
|
||||
blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||
blocks["ff_in"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* context) {
|
||||
// x: [N, n_token, query_dim]
|
||||
// context: [N, n_context, context_dim]
|
||||
// return: [N, n_token, query_dim]
|
||||
|
||||
auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
|
||||
auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
|
||||
auto ff = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
|
||||
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
|
||||
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
|
||||
auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
|
||||
|
||||
if (ff_in) {
|
||||
auto norm_in = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
|
||||
auto ff_in = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);
|
||||
|
||||
auto x_skip = x;
|
||||
x = norm_in->forward(ctx, x);
|
||||
x = ff_in->forward(ctx, x);
|
||||
// self.is_res is always True
|
||||
x = ggml_add(ctx->ggml_ctx, x, x_skip);
|
||||
}
|
||||
|
||||
auto r = x;
|
||||
x = norm1->forward(ctx, x);
|
||||
x = attn1->forward(ctx, x, x); // self-attention
|
||||
x = ggml_add(ctx->ggml_ctx, x, r);
|
||||
r = x;
|
||||
x = norm2->forward(ctx, x);
|
||||
x = attn2->forward(ctx, x, context); // cross-attention
|
||||
x = ggml_add(ctx->ggml_ctx, x, r);
|
||||
r = x;
|
||||
x = norm3->forward(ctx, x);
|
||||
x = ff->forward(ctx, x);
|
||||
x = ggml_add(ctx->ggml_ctx, x, r);
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
class SpatialTransformer : public GGMLBlock {
|
||||
protected:
|
||||
int64_t in_channels; // mult * model_channels
|
||||
int64_t n_head;
|
||||
int64_t d_head;
|
||||
int64_t depth = 1; // 1
|
||||
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
|
||||
bool use_linear = false;
|
||||
|
||||
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
|
||||
auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
|
||||
if (iter != tensor_storage_map.end()) {
|
||||
int64_t inner_dim = n_head * d_head;
|
||||
if (iter->second.n_dims == 4 && use_linear) {
|
||||
use_linear = false;
|
||||
blocks["proj_in"] = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
|
||||
blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
|
||||
} else if (iter->second.n_dims == 2 && !use_linear) {
|
||||
use_linear = true;
|
||||
blocks["proj_in"] = std::make_shared<Linear>(in_channels, inner_dim);
|
||||
blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
SpatialTransformer(int64_t in_channels,
|
||||
int64_t n_head,
|
||||
int64_t d_head,
|
||||
int64_t depth,
|
||||
int64_t context_dim,
|
||||
bool use_linear)
|
||||
: in_channels(in_channels),
|
||||
n_head(n_head),
|
||||
d_head(d_head),
|
||||
depth(depth),
|
||||
context_dim(context_dim),
|
||||
use_linear(use_linear) {
|
||||
// disable_self_attn is always False
|
||||
int64_t inner_dim = n_head * d_head; // in_channels
|
||||
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
|
||||
if (use_linear) {
|
||||
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
|
||||
} else {
|
||||
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
|
||||
}
|
||||
|
||||
for (int i = 0; i < depth; i++) {
|
||||
std::string name = "transformer_blocks." + std::to_string(i);
|
||||
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
|
||||
}
|
||||
|
||||
if (use_linear) {
|
||||
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
|
||||
} else {
|
||||
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
|
||||
}
|
||||
}
|
||||
|
||||
virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* context) {
|
||||
// x: [N, in_channels, h, w]
|
||||
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
|
||||
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
|
||||
auto proj_in = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
|
||||
auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
|
||||
|
||||
auto x_in = x;
|
||||
int64_t n = x->ne[3];
|
||||
int64_t h = x->ne[1];
|
||||
int64_t w = x->ne[0];
|
||||
int64_t inner_dim = n_head * d_head;
|
||||
|
||||
x = norm->forward(ctx, x);
|
||||
if (use_linear) {
|
||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
|
||||
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
|
||||
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
|
||||
} else {
|
||||
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
|
||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
|
||||
x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
|
||||
}
|
||||
|
||||
for (int i = 0; i < depth; i++) {
|
||||
std::string name = "transformer_blocks." + std::to_string(i);
|
||||
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
|
||||
|
||||
x = transformer_block->forward(ctx, x, context);
|
||||
}
|
||||
|
||||
if (use_linear) {
|
||||
// proj_out
|
||||
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
|
||||
|
||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
|
||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
|
||||
} else {
|
||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
|
||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
|
||||
|
||||
// proj_out
|
||||
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
|
||||
}
|
||||
|
||||
x = ggml_add(ctx->ggml_ctx, x, x_in);
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
class AlphaBlender : public GGMLBlock {
|
||||
protected:
|
||||
void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
|
||||
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
|
||||
enum ggml_type wtype = GGML_TYPE_F32;
|
||||
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
|
||||
}
|
||||
|
||||
float get_alpha() {
|
||||
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
|
||||
// so learned_with_images is same as learned
|
||||
float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
|
||||
return sigmoid(alpha);
|
||||
}
|
||||
|
||||
public:
|
||||
AlphaBlender() {
|
||||
// merge_strategy is always learned_with_images
|
||||
// for inference, we don't need to set alpha
|
||||
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x_spatial,
|
||||
struct ggml_tensor* x_temporal) {
|
||||
// image_only_indicator is always tensor([0.])
|
||||
float alpha = get_alpha();
|
||||
auto x = ggml_add(ctx->ggml_ctx,
|
||||
ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha),
|
||||
ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
class VideoResBlock : public ResBlock {
|
||||
public:
|
||||
VideoResBlock(int64_t channels,
|
||||
int64_t emb_channels,
|
||||
int64_t out_channels,
|
||||
std::pair<int, int> kernel_size = {3, 3},
|
||||
int64_t video_kernel_size = 3,
|
||||
int dims = 2) // always 2
|
||||
: ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
|
||||
blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));
|
||||
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* emb,
|
||||
int num_video_frames) {
|
||||
// x: [N, channels, h, w] aka [b*t, channels, h, w]
|
||||
// emb: [N, emb_channels] aka [b*t, emb_channels]
|
||||
// image_only_indicator is always tensor([0.])
|
||||
auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
|
||||
auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
|
||||
|
||||
x = ResBlock::forward(ctx, x, emb);
|
||||
|
||||
int64_t T = num_video_frames;
|
||||
int64_t B = x->ne[3] / T;
|
||||
int64_t C = x->ne[2];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t W = x->ne[0];
|
||||
|
||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
|
||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
|
||||
auto x_mix = x;
|
||||
|
||||
emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
|
||||
|
||||
x = time_stack->forward(ctx, x, emb); // b t c (h w)
|
||||
|
||||
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
|
||||
|
||||
x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
|
||||
x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
|
||||
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
||||
#endif // __COMMON_BLOCK_HPP__
|
||||
108
otherarch/sdcpp/common_dit.hpp
Normal file
108
otherarch/sdcpp/common_dit.hpp
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
#ifndef __COMMON_DIT_HPP__
|
||||
#define __COMMON_DIT_HPP__
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
|
||||
namespace DiT {
|
||||
// Splits an image tensor into non-overlapping pw x ph patches and flattens
// each patch into one token.
//
// x:          [N, C, H, W]; H must be a multiple of ph and W a multiple of pw
// pw, ph:     patch width / height (note the order: width first)
// patch_last: selects the ordering of the flattened patch features
// return:     [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C],
//             with h = H / ph and w = W / pw
//
// Declared inline because this header defines the function; without it,
// including the header from more than one translation unit breaks the ODR.
inline ggml_tensor* patchify(ggml_context* ctx,
                             ggml_tensor* x,
                             int pw,
                             int ph,
                             bool patch_last = true) {
    int64_t N = x->ne[3];
    int64_t C = x->ne[2];
    int64_t H = x->ne[1];
    int64_t W = x->ne[0];
    int64_t h = H / ph;
    int64_t w = W / pw;

    // the input must already be padded to a whole number of patches
    GGML_ASSERT(h * ph == H && w * pw == W);

    x = ggml_reshape_4d(ctx, x, pw, w, ph, h * C * N);     // [N*C*h, ph, w, pw]
    x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, w, ph, pw]
    x = ggml_reshape_4d(ctx, x, pw * ph, w * h, C, N);     // [N, C, h*w, ph*pw]
    if (patch_last) {
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, h*w, C, ph*pw]
        x = ggml_reshape_3d(ctx, x, pw * ph * C, w * h, N);    // [N, h*w, C*ph*pw]
    } else {
        x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3));  // [N, h*w, ph*pw, C]
        x = ggml_reshape_3d(ctx, x, C * pw * ph, w * h, N);              // [N, h*w, ph*pw*C]
    }
    return x;
}
|
||||
|
||||
// Inverse of patchify: reassembles a token sequence into an image tensor.
//
// x:          [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
// h, w:       number of patches along the height / width
// ph, pw:     patch height / width (note the order: height first)
// patch_last: must match the ordering used when the tokens were produced
// return:     [N, C, H, W] with H = h * ph and W = w * pw
//
// Declared inline because this header defines the function; without it,
// including the header from more than one translation unit breaks the ODR.
inline ggml_tensor* unpatchify(ggml_context* ctx,
                               ggml_tensor* x,
                               int64_t h,
                               int64_t w,
                               int ph,
                               int pw,
                               bool patch_last = true) {
    int64_t N = x->ne[2];
    int64_t C = x->ne[0] / ph / pw;
    int64_t H = h * ph;
    int64_t W = w * pw;

    // the feature dimension must be an exact multiple of the patch area
    GGML_ASSERT(C * ph * pw == x->ne[0]);

    if (patch_last) {
        x = ggml_reshape_4d(ctx, x, pw * ph, C, w * h, N);     // [N, h*w, C, ph*pw]
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, C, h*w, ph*pw]
    } else {
        x = ggml_reshape_4d(ctx, x, C, pw * ph, w * h, N);     // [N, h*w, ph*pw, C]
        x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3));  // [N, C, h*w, ph*pw]
    }

    x = ggml_reshape_4d(ctx, x, pw, ph, w, h * C * N);     // [N*C*h, w, ph, pw]
    x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, ph, w, pw]
    x = ggml_reshape_4d(ctx, x, W, H, C, N);               // [N, C, h*ph, w*pw]

    return x;
}
|
||||
|
||||
// Pads the two innermost dimensions of x (ne[0] = W, ne[1] = H) up to the
// next multiple of pw / ph. No padding is added along an axis that is already
// aligned. The circular_x / circular_y flags of the runner context are
// forwarded to ggml_ext_pad.
//
// Declared inline because this header defines the function; without it,
// including the header from more than one translation unit breaks the ODR.
inline ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
                                      ggml_tensor* x,
                                      int ph,
                                      int pw) {
    int64_t W = x->ne[0];
    int64_t H = x->ne[1];

    // (p - X % p) % p is 0 when X is already a multiple of p; the result is
    // always smaller than the patch size, so it fits in an int
    int pad_h = static_cast<int>((ph - H % ph) % ph);
    int pad_w = static_cast<int>((pw - W % pw) % pw);
    x         = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
    return x;
}
|
||||
|
||||
// Pads x to a whole number of patches and then splits it into patch tokens.
//
// x:          [N, C, H, W]
// ph, pw:     patch height / width
// patch_last: forwarded to patchify
// return:     see patchify
//
// Declared inline because this header defines the function; without it,
// including the header from more than one translation unit breaks the ODR.
inline ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,
                                     ggml_tensor* x,
                                     int ph,
                                     int pw,
                                     bool patch_last = true) {
    x = pad_to_patch_size(ctx, x, ph, pw);
    // patchify takes the patch width before the patch height
    x = patchify(ctx->ggml_ctx, x, pw, ph, patch_last);
    return x;
}
|
||||
|
||||
// Reassembles patch tokens into an image and removes the padding that was
// added to reach a whole number of patches.
//
// x:          patch tokens, layout as accepted by unpatchify
// H, W:       height / width of the original (unpadded) image
// ph, pw:     patch height / width
// patch_last: forwarded to unpatchify
// return:     [N, C, H, W]
//
// Declared inline because this header defines the function; without it,
// including the header from more than one translation unit breaks the ODR.
inline ggml_tensor* unpatchify_and_crop(ggml_context* ctx,
                                        ggml_tensor* x,
                                        int64_t H,
                                        int64_t W,
                                        int ph,
                                        int pw,
                                        bool patch_last = true) {
    // same padding amounts that pad_to_patch_size computes; always smaller
    // than the patch size, so they fit in an int
    int pad_h = static_cast<int>((ph - H % ph) % ph);
    int pad_w = static_cast<int>((pw - W % pw) % pw);
    int64_t h = ((H + pad_h) / ph);
    int64_t w = ((W + pad_w) / pw);
    x         = unpatchify(ctx, x, h, w, ph, pw, patch_last);  // [N, C, H + pad_h, W + pad_w]
    x         = ggml_ext_slice(ctx, x, 1, 0, H);               // [N, C, H, W + pad_w]
    x         = ggml_ext_slice(ctx, x, 0, 0, W);               // [N, C, H, W]
    return x;
}
|
||||
} // namespace DiT
|
||||
|
||||
#endif // __COMMON_DIT_HPP__
|
||||
|
|
@ -1641,6 +1641,142 @@ struct T5CLIPEmbedder : public Conditioner {
|
|||
}
|
||||
};
|
||||
|
||||
struct AnimaConditioner : public Conditioner {
|
||||
std::shared_ptr<LLM::BPETokenizer> qwen_tokenizer;
|
||||
T5UniGramTokenizer t5_tokenizer;
|
||||
std::shared_ptr<LLM::LLMRunner> llm;
|
||||
|
||||
AnimaConditioner(ggml_backend_t backend,
|
||||
bool offload_params_to_cpu,
|
||||
const String2TensorStorage& tensor_storage_map = {}) {
|
||||
qwen_tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
|
||||
llm = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::QWEN3,
|
||||
backend,
|
||||
offload_params_to_cpu,
|
||||
tensor_storage_map,
|
||||
"text_encoders.llm",
|
||||
false);
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
|
||||
llm->get_param_tensors(tensors, "text_encoders.llm");
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
llm->alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
llm->free_params_buffer();
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return llm->get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) override {
|
||||
llm->set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
llm->set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
std::tuple<std::vector<int>, std::vector<float>, std::vector<int>, std::vector<float>> tokenize(std::string text) {
|
||||
auto parsed_attention = parse_prompt_attention(text);
|
||||
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (const auto& item : parsed_attention) {
|
||||
ss << "['" << item.first << "', " << item.second << "], ";
|
||||
}
|
||||
ss << "]";
|
||||
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
|
||||
}
|
||||
|
||||
std::vector<int> qwen_tokens;
|
||||
std::vector<float> qwen_weights;
|
||||
std::vector<int> t5_tokens;
|
||||
std::vector<float> t5_weights;
|
||||
|
||||
for (const auto& item : parsed_attention) {
|
||||
const std::string& curr_text = item.first;
|
||||
std::vector<int> curr_tokens = qwen_tokenizer->tokenize(curr_text, nullptr);
|
||||
qwen_tokens.insert(qwen_tokens.end(), curr_tokens.begin(), curr_tokens.end());
|
||||
// Anima uses uniform Qwen token weights.
|
||||
qwen_weights.insert(qwen_weights.end(), curr_tokens.size(), 1.f);
|
||||
}
|
||||
if (qwen_tokens.empty()) {
|
||||
qwen_tokens.push_back(151643); // qwen3 pad token
|
||||
qwen_weights.push_back(1.f);
|
||||
}
|
||||
|
||||
for (const auto& item : parsed_attention) {
|
||||
const std::string& curr_text = item.first;
|
||||
float curr_weight = item.second;
|
||||
std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
|
||||
t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
|
||||
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
|
||||
}
|
||||
|
||||
return {qwen_tokens, qwen_weights, t5_tokens, t5_weights};
|
||||
}
|
||||
|
||||
SDCondition get_learned_condition(ggml_context* work_ctx,
|
||||
int n_threads,
|
||||
const ConditionerParams& conditioner_params) override {
|
||||
int64_t t0 = ggml_time_ms();
|
||||
|
||||
auto tokenized = tokenize(conditioner_params.text);
|
||||
auto& qwen_tokens = std::get<0>(tokenized);
|
||||
auto& qwen_weights = std::get<1>(tokenized);
|
||||
auto& t5_tokens = std::get<2>(tokenized);
|
||||
auto& t5_weights = std::get<3>(tokenized);
|
||||
|
||||
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens);
|
||||
|
||||
struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 1024]
|
||||
llm->compute(n_threads,
|
||||
input_ids,
|
||||
nullptr,
|
||||
{},
|
||||
{},
|
||||
&hidden_states,
|
||||
work_ctx);
|
||||
|
||||
{
|
||||
auto tensor = hidden_states;
|
||||
float original_mean = ggml_ext_tensor_mean(tensor);
|
||||
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
|
||||
float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
|
||||
value *= qwen_weights[i1];
|
||||
ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
|
||||
}
|
||||
}
|
||||
}
|
||||
float new_mean = ggml_ext_tensor_mean(tensor);
|
||||
if (new_mean != 0.f) {
|
||||
ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor* t5_ids_tensor = nullptr;
|
||||
struct ggml_tensor* t5_weight_tensor = nullptr;
|
||||
if (!t5_tokens.empty()) {
|
||||
t5_ids_tensor = vector_to_ggml_tensor_i32(work_ctx, t5_tokens);
|
||||
t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights);
|
||||
}
|
||||
|
||||
int64_t t1 = ggml_time_ms();
|
||||
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
|
||||
|
||||
return {hidden_states, t5_weight_tensor, t5_ids_tensor};
|
||||
}
|
||||
};
|
||||
|
||||
struct LLMEmbedder : public Conditioner {
|
||||
SDVersion version;
|
||||
std::shared_ptr<LLM::BPETokenizer> tokenizer;
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
#ifndef __CONTROL_HPP__
|
||||
#define __CONTROL_HPP__
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "common_block.hpp"
|
||||
#include "model.h"
|
||||
|
||||
#define CONTROL_NET_GRAPH_SIZE 1536
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#ifndef __DIFFUSION_MODEL_H__
|
||||
#define __DIFFUSION_MODEL_H__
|
||||
|
||||
#include "anima.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "mmdit.hpp"
|
||||
#include "qwen_image.hpp"
|
||||
|
|
@ -242,6 +243,72 @@ struct FluxModel : public DiffusionModel {
|
|||
}
|
||||
};
|
||||
|
||||
struct AnimaModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
Anima::AnimaRunner anima;
|
||||
|
||||
AnimaModel(ggml_backend_t backend,
|
||||
bool offload_params_to_cpu,
|
||||
const String2TensorStorage& tensor_storage_map = {},
|
||||
const std::string prefix = "model.diffusion_model")
|
||||
: prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
|
||||
}
|
||||
|
||||
std::string get_desc() override {
|
||||
return anima.get_desc();
|
||||
}
|
||||
|
||||
void alloc_params_buffer() override {
|
||||
anima.alloc_params_buffer();
|
||||
}
|
||||
|
||||
void free_params_buffer() override {
|
||||
anima.free_params_buffer();
|
||||
}
|
||||
|
||||
void free_compute_buffer() override {
|
||||
anima.free_compute_buffer();
|
||||
}
|
||||
|
||||
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
|
||||
anima.get_param_tensors(tensors, prefix);
|
||||
}
|
||||
|
||||
size_t get_params_buffer_size() override {
|
||||
return anima.get_params_buffer_size();
|
||||
}
|
||||
|
||||
void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
|
||||
anima.set_weight_adapter(adapter);
|
||||
}
|
||||
|
||||
int64_t get_adm_in_channels() override {
|
||||
return 768;
|
||||
}
|
||||
|
||||
void set_flash_attention_enabled(bool enabled) {
|
||||
anima.set_flash_attention_enabled(enabled);
|
||||
}
|
||||
|
||||
void set_circular_axes(bool circular_x, bool circular_y) override {
|
||||
anima.set_circular_axes(circular_x, circular_y);
|
||||
}
|
||||
|
||||
bool compute(int n_threads,
|
||||
DiffusionParams diffusion_params,
|
||||
struct ggml_tensor** output = nullptr,
|
||||
struct ggml_context* output_ctx = nullptr) override {
|
||||
return anima.compute(n_threads,
|
||||
diffusion_params.x,
|
||||
diffusion_params.timesteps,
|
||||
diffusion_params.context,
|
||||
diffusion_params.c_concat,
|
||||
diffusion_params.y,
|
||||
output,
|
||||
output_ctx);
|
||||
}
|
||||
};
|
||||
|
||||
struct WanModel : public DiffusionModel {
|
||||
std::string prefix;
|
||||
WAN::WanRunner wan;
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml_extend.hpp"
|
||||
#include "common_dit.hpp"
|
||||
#include "model.h"
|
||||
#include "rope.hpp"
|
||||
|
||||
|
|
@ -103,11 +103,13 @@ namespace Flux {
|
|||
auto norm = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);
|
||||
|
||||
auto qkv = qkv_proj->forward(ctx, x);
|
||||
auto qkv_vec = ggml_ext_chunk(ctx->ggml_ctx, qkv, 3, 0, true);
|
||||
int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
|
||||
auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);
|
||||
auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);
|
||||
auto v = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);
|
||||
int64_t head_dim = qkv->ne[0] / 3 / num_heads;
|
||||
auto q = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
|
||||
qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0);
|
||||
auto k = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
|
||||
qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * qkv->ne[0] / 3);
|
||||
auto v = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
|
||||
qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * 2 * qkv->ne[0] / 3);
|
||||
q = norm->query_norm(ctx, q);
|
||||
k = norm->key_norm(ctx, k);
|
||||
return {q, k, v};
|
||||
|
|
@ -491,15 +493,14 @@ namespace Flux {
|
|||
auto x_mod = Flux::modulate(ctx->ggml_ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
|
||||
auto qkv_mlp = linear1->forward(ctx, x_mod); // [N, n_token, hidden_size * 3 + mlp_hidden_dim*mlp_mult_factor]
|
||||
|
||||
auto q = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], 0);
|
||||
auto k = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * qkv_mlp->nb[0]);
|
||||
auto v = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * 2 * qkv_mlp->nb[0]);
|
||||
|
||||
int64_t head_dim = hidden_size / num_heads;
|
||||
|
||||
q = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, q), head_dim, num_heads, q->ne[1], q->ne[2]); // [N, n_token, n_head, d_head]
|
||||
k = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, k), head_dim, num_heads, k->ne[1], k->ne[2]); // [N, n_token, n_head, d_head]
|
||||
v = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, v), head_dim, num_heads, v->ne[1], v->ne[2]); // [N, n_token, n_head, d_head]
|
||||
auto q = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
|
||||
qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], 0);
|
||||
auto k = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
|
||||
qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * hidden_size);
|
||||
auto v = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
|
||||
qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * 2 * hidden_size);
|
||||
|
||||
q = norm->query_norm(ctx, q);
|
||||
k = norm->key_norm(ctx, k);
|
||||
|
|
@ -846,70 +847,6 @@ namespace Flux {
|
|||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
|
||||
int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
|
||||
int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
|
||||
x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* patchify(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
// x: [N, C, H, W]
|
||||
// return: [N, h*w, C * patch_size * patch_size]
|
||||
int64_t N = x->ne[3];
|
||||
int64_t C = x->ne[2];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t W = x->ne[0];
|
||||
int64_t p = params.patch_size;
|
||||
int64_t h = H / params.patch_size;
|
||||
int64_t w = W / params.patch_size;
|
||||
|
||||
GGML_ASSERT(h * p == H && w * p == W);
|
||||
|
||||
x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, p, p]
|
||||
x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, p*p]
|
||||
x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N); // [N, h*w, C*p*p]
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
// img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
|
||||
x = pad_to_patch_size(ctx, x);
|
||||
x = patchify(ctx->ggml_ctx, x);
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x,
|
||||
int64_t h,
|
||||
int64_t w) {
|
||||
// x: [N, h*w, C*patch_size*patch_size]
|
||||
// return: [N, C, H, W]
|
||||
int64_t N = x->ne[2];
|
||||
int64_t C = x->ne[0] / params.patch_size / params.patch_size;
|
||||
int64_t H = h * params.patch_size;
|
||||
int64_t W = w * params.patch_size;
|
||||
int64_t p = params.patch_size;
|
||||
|
||||
GGML_ASSERT(C * p * p == x->ne[0]);
|
||||
|
||||
x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N); // [N, h*w, C, p*p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, p*p]
|
||||
x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p]
|
||||
x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p]
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* img,
|
||||
struct ggml_tensor* txt,
|
||||
|
|
@ -1060,7 +997,7 @@ namespace Flux {
|
|||
int pad_h = (patch_size - H % patch_size) % patch_size;
|
||||
int pad_w = (patch_size - W % patch_size) % patch_size;
|
||||
|
||||
auto img = pad_to_patch_size(ctx, x);
|
||||
auto img = DiT::pad_to_patch_size(ctx, x, params.patch_size, params.patch_size);
|
||||
auto orig_img = img;
|
||||
|
||||
if (params.chroma_radiance_params.fake_patch_size_x2) {
|
||||
|
|
@ -1082,7 +1019,7 @@ namespace Flux {
|
|||
auto nerf_image_embedder = std::dynamic_pointer_cast<NerfEmbedder>(blocks["nerf_image_embedder"]);
|
||||
auto nerf_final_layer_conv = std::dynamic_pointer_cast<NerfFinalLayerConv>(blocks["nerf_final_layer_conv"]);
|
||||
|
||||
auto nerf_pixels = patchify(ctx->ggml_ctx, orig_img); // [N, num_patches, C * patch_size * patch_size]
|
||||
auto nerf_pixels = DiT::patchify(ctx->ggml_ctx, orig_img, patch_size, patch_size); // [N, num_patches, C * patch_size * patch_size]
|
||||
int64_t num_patches = nerf_pixels->ne[1];
|
||||
nerf_pixels = ggml_reshape_3d(ctx->ggml_ctx,
|
||||
nerf_pixels,
|
||||
|
|
@ -1102,7 +1039,7 @@ namespace Flux {
|
|||
|
||||
img_dct = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img_dct, 1, 0, 2, 3)); // [N*num_patches, nerf_hidden_size, patch_size*patch_size]
|
||||
img_dct = ggml_reshape_3d(ctx->ggml_ctx, img_dct, img_dct->ne[0] * img_dct->ne[1], num_patches, img_dct->ne[2] / num_patches); // [N, num_patches, nerf_hidden_size*patch_size*patch_size]
|
||||
img_dct = unpatchify(ctx->ggml_ctx, img_dct, (H + pad_h) / patch_size, (W + pad_w) / patch_size); // [N, nerf_hidden_size, H, W]
|
||||
img_dct = DiT::unpatchify(ctx->ggml_ctx, img_dct, (H + pad_h) / patch_size, (W + pad_w) / patch_size, patch_size, patch_size); // [N, nerf_hidden_size, H, W]
|
||||
|
||||
out = nerf_final_layer_conv->forward(ctx, img_dct); // [N, C, H, W]
|
||||
|
||||
|
|
@ -1134,7 +1071,7 @@ namespace Flux {
|
|||
int pad_h = (patch_size - H % patch_size) % patch_size;
|
||||
int pad_w = (patch_size - W % patch_size) % patch_size;
|
||||
|
||||
auto img = process_img(ctx, x);
|
||||
auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size);
|
||||
int64_t img_tokens = img->ne[1];
|
||||
|
||||
if (params.version == VERSION_FLUX_FILL) {
|
||||
|
|
@ -1142,8 +1079,8 @@ namespace Flux {
|
|||
ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
|
||||
ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
|
||||
|
||||
masked = process_img(ctx, masked);
|
||||
mask = process_img(ctx, mask);
|
||||
masked = DiT::pad_and_patchify(ctx, masked, patch_size, patch_size);
|
||||
mask = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size);
|
||||
|
||||
img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0);
|
||||
} else if (params.version == VERSION_FLEX_2) {
|
||||
|
|
@ -1152,21 +1089,21 @@ namespace Flux {
|
|||
ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
|
||||
ggml_tensor* control = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1));
|
||||
|
||||
masked = process_img(ctx, masked);
|
||||
mask = process_img(ctx, mask);
|
||||
control = process_img(ctx, control);
|
||||
masked = DiT::pad_and_patchify(ctx, masked, patch_size, patch_size);
|
||||
mask = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size);
|
||||
control = DiT::pad_and_patchify(ctx, control, patch_size, patch_size);
|
||||
|
||||
img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0);
|
||||
} else if (params.version == VERSION_FLUX_CONTROLS) {
|
||||
GGML_ASSERT(c_concat != nullptr);
|
||||
|
||||
auto control = process_img(ctx, c_concat);
|
||||
auto control = DiT::pad_and_patchify(ctx, c_concat, patch_size, patch_size);
|
||||
img = ggml_concat(ctx->ggml_ctx, img, control, 0);
|
||||
}
|
||||
|
||||
if (ref_latents.size() > 0) {
|
||||
for (ggml_tensor* ref : ref_latents) {
|
||||
ref = process_img(ctx, ref);
|
||||
ref = DiT::pad_and_patchify(ctx, ref, patch_size, patch_size);
|
||||
img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
|
||||
}
|
||||
}
|
||||
|
|
@ -1178,8 +1115,7 @@ namespace Flux {
|
|||
out = ggml_cont(ctx->ggml_ctx, out);
|
||||
}
|
||||
|
||||
// rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
|
||||
out = unpatchify(ctx->ggml_ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size); // [N, C, H + pad_h, W + pad_w]
|
||||
out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, patch_size, patch_size); // [N, C, H, W]
|
||||
return out;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1219,6 +1219,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros(struct ggml_context* ctx,
|
|||
return ggml_ext_full(ctx, 0.f, ne0, ne1, ne2, ne3);
|
||||
}
|
||||
|
||||
// Calls ggml_ext_zeros() with the four extents read from `x->ne`, i.e. the
// result has the same ne[0..3] as `x`.
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros_like(struct ggml_context* ctx,
                                                          struct ggml_tensor* x) {
    const auto& extents = x->ne;
    return ggml_ext_zeros(ctx, extents[0], extents[1], extents[2], extents[3]);
}
|
||||
|
||||
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
|
|
@ -1227,6 +1232,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
|
|||
return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3);
|
||||
}
|
||||
|
||||
// Calls ggml_ext_ones() with the four extents read from `x->ne`, i.e. the
// result has the same ne[0..3] as `x`.
__STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones_like(struct ggml_context* ctx,
                                                         struct ggml_tensor* x) {
    const auto& extents = x->ne;
    return ggml_ext_ones(ctx, extents[0], extents[1], extents[2], extents[3]);
}
|
||||
|
||||
__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) {
|
||||
#ifdef SD_USE_VULKAN
|
||||
auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
#ifndef __LTXV_HPP__
|
||||
#define __LTXV_HPP__
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "common_block.hpp"
|
||||
|
||||
namespace LTXV {
|
||||
|
||||
|
|
|
|||
|
|
@ -745,28 +745,6 @@ public:
|
|||
return spatial_pos_embed;
|
||||
}
|
||||
|
||||
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x,
|
||||
int64_t h,
|
||||
int64_t w) {
|
||||
// x: [N, H*W, patch_size * patch_size * C]
|
||||
// return: [N, C, H, W]
|
||||
int64_t n = x->ne[2];
|
||||
int64_t c = out_channels;
|
||||
int64_t p = patch_size;
|
||||
h = (h + 1) / p;
|
||||
w = (w + 1) / p;
|
||||
|
||||
GGML_ASSERT(h * w == x->ne[1]);
|
||||
|
||||
x = ggml_reshape_4d(ctx, x, c, p * p, w * h, n); // [N, H*W, P*P, C]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, H*W, P*P]
|
||||
x = ggml_reshape_4d(ctx, x, p, p, w, h * c * n); // [N*C*H, W, P, P]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*H, P, W, P]
|
||||
x = ggml_reshape_4d(ctx, x, p * w, p * h, c, n); // [N, C, H*P, W*P]
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* c_mod,
|
||||
|
|
@ -811,11 +789,11 @@ public:
|
|||
auto x_embedder = std::dynamic_pointer_cast<PatchEmbed>(blocks["x_embedder"]);
|
||||
auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
|
||||
|
||||
int64_t w = x->ne[0];
|
||||
int64_t h = x->ne[1];
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
|
||||
auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size]
|
||||
auto pos_embed = cropped_pos_embed(ctx->ggml_ctx, h, w); // [1, H*W, hidden_size]
|
||||
auto pos_embed = cropped_pos_embed(ctx->ggml_ctx, H, W); // [1, H*W, hidden_size]
|
||||
x = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed); // [N, H*W, hidden_size]
|
||||
|
||||
auto c = t_embedder->forward(ctx, t); // [N, hidden_size]
|
||||
|
|
@ -834,7 +812,7 @@ public:
|
|||
|
||||
x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels)
|
||||
|
||||
x = unpatchify(ctx->ggml_ctx, x, h, w); // [N, C, H, W]
|
||||
x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, /*patch_last*/ false); // [N, C, H, W]
|
||||
|
||||
return x;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1083,6 +1083,9 @@ SDVersion ModelLoader::get_sd_version() {
|
|||
if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
|
||||
return VERSION_QWEN_IMAGE;
|
||||
}
|
||||
if (tensor_storage.name.find("llm_adapter.blocks.0.cross_attn.q_proj.weight") != std::string::npos) {
|
||||
return VERSION_ANIMA;
|
||||
}
|
||||
if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
|
||||
is_flux2 = true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ enum SDVersion {
|
|||
VERSION_WAN2_2_I2V,
|
||||
VERSION_WAN2_2_TI2V,
|
||||
VERSION_QWEN_IMAGE,
|
||||
VERSION_ANIMA,
|
||||
VERSION_FLUX2,
|
||||
VERSION_FLUX2_KLEIN,
|
||||
VERSION_Z_IMAGE,
|
||||
|
|
@ -122,6 +123,13 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
|
|||
return false;
|
||||
}
|
||||
|
||||
// Returns true exactly when `version` equals VERSION_ANIMA.
static inline bool sd_version_is_anima(SDVersion version) {
    return version == VERSION_ANIMA;
}
|
||||
|
||||
static inline bool sd_version_is_z_image(SDVersion version) {
|
||||
if (version == VERSION_Z_IMAGE) {
|
||||
return true;
|
||||
|
|
@ -146,6 +154,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
|
|||
sd_version_is_sd3(version) ||
|
||||
sd_version_is_wan(version) ||
|
||||
sd_version_is_qwen_image(version) ||
|
||||
sd_version_is_anima(version) ||
|
||||
sd_version_is_z_image(version)) {
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -653,6 +653,14 @@ std::string convert_diffusers_dit_to_original_lumina2(std::string name) {
|
|||
return name;
|
||||
}
|
||||
|
||||
std::string convert_other_dit_to_original_anima(std::string name) {
|
||||
static const std::string anima_net_prefix = "net.";
|
||||
if (!starts_with(name, anima_net_prefix)) {
|
||||
name = anima_net_prefix + name;
|
||||
}
|
||||
return name;
|
||||
}
|
||||
|
||||
std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) {
|
||||
if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
|
||||
name = convert_diffusers_unet_to_original_sd1(name);
|
||||
|
|
@ -664,6 +672,8 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
|
|||
name = convert_diffusers_dit_to_original_flux(name);
|
||||
} else if (sd_version_is_z_image(version)) {
|
||||
name = convert_diffusers_dit_to_original_lumina2(name);
|
||||
} else if (sd_version_is_anima(version)) {
|
||||
name = convert_other_dit_to_original_anima(name);
|
||||
}
|
||||
return name;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,9 +3,8 @@
|
|||
|
||||
#include <memory>
|
||||
|
||||
#include "common.hpp"
|
||||
#include "common_block.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
|
||||
namespace Qwen {
|
||||
constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
|
||||
|
|
@ -390,69 +389,6 @@ namespace Qwen {
|
|||
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
|
||||
}
|
||||
|
||||
struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
|
||||
int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
|
||||
int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
|
||||
x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* patchify(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
// x: [N, C, H, W]
|
||||
// return: [N, h*w, C * patch_size * patch_size]
|
||||
int64_t N = x->ne[3];
|
||||
int64_t C = x->ne[2];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t W = x->ne[0];
|
||||
int64_t p = params.patch_size;
|
||||
int64_t h = H / params.patch_size;
|
||||
int64_t w = W / params.patch_size;
|
||||
|
||||
GGML_ASSERT(h * p == H && w * p == W);
|
||||
|
||||
x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, p, p]
|
||||
x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, p*p]
|
||||
x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N); // [N, h*w, C*p*p]
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
x = pad_to_patch_size(ctx, x);
|
||||
x = patchify(ctx->ggml_ctx, x);
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x,
|
||||
int64_t h,
|
||||
int64_t w) {
|
||||
// x: [N, h*w, C*patch_size*patch_size]
|
||||
// return: [N, C, H, W]
|
||||
int64_t N = x->ne[2];
|
||||
int64_t C = x->ne[0] / params.patch_size / params.patch_size;
|
||||
int64_t H = h * params.patch_size;
|
||||
int64_t W = w * params.patch_size;
|
||||
int64_t p = params.patch_size;
|
||||
|
||||
GGML_ASSERT(C * p * p == x->ne[0]);
|
||||
|
||||
x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N); // [N, h*w, C, p*p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, p*p]
|
||||
x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p]
|
||||
x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p]
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* timestep,
|
||||
|
|
@ -468,7 +404,7 @@ namespace Qwen {
|
|||
|
||||
auto t_emb = time_text_embed->forward(ctx, timestep);
|
||||
if (params.zero_cond_t) {
|
||||
auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros(ctx->ggml_ctx, timestep->ne[0], timestep->ne[1], timestep->ne[2], timestep->ne[3]));
|
||||
auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep));
|
||||
t_emb = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
|
||||
}
|
||||
auto img = img_in->forward(ctx, x);
|
||||
|
|
@ -512,19 +448,16 @@ namespace Qwen {
|
|||
int64_t C = x->ne[2];
|
||||
int64_t N = x->ne[3];
|
||||
|
||||
auto img = process_img(ctx, x);
|
||||
auto img = DiT::pad_and_patchify(ctx, x, params.patch_size, params.patch_size);
|
||||
int64_t img_tokens = img->ne[1];
|
||||
|
||||
if (ref_latents.size() > 0) {
|
||||
for (ggml_tensor* ref : ref_latents) {
|
||||
ref = process_img(ctx, ref);
|
||||
ref = DiT::pad_and_patchify(ctx, ref, params.patch_size, params.patch_size);
|
||||
img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size);
|
||||
int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);
|
||||
|
||||
auto out = forward_orig(ctx, img, timestep, context, pe, modulate_index); // [N, h_len*w_len, ph*pw*C]
|
||||
|
||||
if (out->ne[1] > img_tokens) {
|
||||
|
|
@ -533,11 +466,7 @@ namespace Qwen {
|
|||
out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size]
|
||||
}
|
||||
|
||||
out = unpatchify(ctx->ggml_ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w]
|
||||
|
||||
// slice
|
||||
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H); // [N, C, H, W + pad_w]
|
||||
out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W); // [N, C, H, W]
|
||||
out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, params.patch_size, params.patch_size); // [N, C, H, W]
|
||||
|
||||
return out;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ namespace Rope {
|
|||
|
||||
__STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos,
|
||||
int dim,
|
||||
int theta,
|
||||
float theta,
|
||||
const std::vector<int>& axis_wrap_dims = {}) {
|
||||
assert(dim % 2 == 0);
|
||||
int half_dim = dim / 2;
|
||||
|
|
@ -167,7 +167,7 @@ namespace Rope {
|
|||
|
||||
__STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
|
||||
int bs,
|
||||
int theta,
|
||||
const std::vector<float>& axis_thetas,
|
||||
const std::vector<int>& axes_dim,
|
||||
const std::vector<std::vector<int>>& wrap_dims = {}) {
|
||||
std::vector<std::vector<float>> trans_ids = transpose(ids);
|
||||
|
|
@ -188,8 +188,12 @@ namespace Rope {
|
|||
if (!wrap_dims.empty() && i < (int)wrap_dims.size()) {
|
||||
axis_wrap_dims = wrap_dims[i];
|
||||
}
|
||||
float axis_theta = 10000.0f;
|
||||
if (!axis_thetas.empty()) {
|
||||
axis_theta = axis_thetas[std::min(i, axis_thetas.size() - 1)];
|
||||
}
|
||||
std::vector<std::vector<float>> rope_emb =
|
||||
rope(trans_ids[i], axes_dim[i], theta, axis_wrap_dims); // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
|
||||
rope(trans_ids[i], axes_dim[i], axis_theta, axis_wrap_dims); // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
|
||||
for (int b = 0; b < bs; ++b) {
|
||||
for (int j = 0; j < pos_len; ++j) {
|
||||
for (int k = 0; k < rope_emb[0].size(); ++k) {
|
||||
|
|
@ -203,6 +207,15 @@ namespace Rope {
|
|||
return flatten(emb);
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
|
||||
int bs,
|
||||
float theta,
|
||||
const std::vector<int>& axes_dim,
|
||||
const std::vector<std::vector<int>>& wrap_dims = {}) {
|
||||
std::vector<float> axis_thetas(axes_dim.size(), theta);
|
||||
return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims);
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
|
||||
int bs,
|
||||
int axes_dim_num,
|
||||
|
|
@ -332,7 +345,7 @@ namespace Rope {
|
|||
}
|
||||
}
|
||||
}
|
||||
return embed_nd(ids, bs, theta, axes_dim, wrap_dims);
|
||||
return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h,
|
||||
|
|
@ -421,7 +434,7 @@ namespace Rope {
|
|||
}
|
||||
}
|
||||
}
|
||||
return embed_nd(ids, bs, theta, axes_dim, wrap_dims);
|
||||
return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t,
|
||||
|
|
@ -475,7 +488,7 @@ namespace Rope {
|
|||
int theta,
|
||||
const std::vector<int>& axes_dim) {
|
||||
std::vector<std::vector<float>> ids = gen_vid_ids(t, h, w, pt, ph, pw, bs);
|
||||
return embed_nd(ids, bs, theta, axes_dim);
|
||||
return embed_nd(ids, bs, static_cast<float>(theta), axes_dim);
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen2vl_ids(int grid_h,
|
||||
|
|
@ -511,7 +524,7 @@ namespace Rope {
|
|||
int theta,
|
||||
const std::vector<int>& axes_dim) {
|
||||
std::vector<std::vector<float>> ids = gen_qwen2vl_ids(grid_h, grid_w, merge_size, window_index);
|
||||
return embed_nd(ids, 1, theta, axes_dim);
|
||||
return embed_nd(ids, 1, static_cast<float>(theta), axes_dim);
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ int bound_mod(int a, int m) {
|
||||
|
|
@ -584,7 +597,7 @@ namespace Rope {
|
|||
}
|
||||
}
|
||||
|
||||
return embed_nd(ids, bs, theta, axes_dim, wrap_dims);
|
||||
return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
|
||||
}
|
||||
|
||||
__STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx,
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ const char* model_version_to_str[] = {
|
|||
"Wan 2.2 I2V",
|
||||
"Wan 2.2 TI2V",
|
||||
"Qwen Image",
|
||||
"Anima",
|
||||
"Flux.2",
|
||||
"Flux.2 klein",
|
||||
"Z-Image",
|
||||
|
|
@ -310,15 +311,30 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
if (tempver == VERSION_ANIMA &&
|
||||
strlen(SAFE_STR(sd_ctx_params->model_path)) > 0 &&
|
||||
strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) == 0 &&
|
||||
!model_loader.has_diffusion_model_tensors()
|
||||
)
|
||||
{
|
||||
LOG_INFO("Anima: SD Diffusion Model tensors missing! Fallback trying alternative tensor names...\n");
|
||||
if (!model_loader.init_from_file(sd_ctx_params->model_path, "model.diffusion_model.")) {
|
||||
LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->model_path);
|
||||
}
|
||||
tempver = model_loader.get_sd_version();
|
||||
}
|
||||
|
||||
bool iswan = (tempver==VERSION_WAN2 || tempver==VERSION_WAN2_2_I2V || tempver==VERSION_WAN2_2_TI2V);
|
||||
bool isqwenimg = (tempver==VERSION_QWEN_IMAGE);
|
||||
bool iszimg = (tempver==VERSION_Z_IMAGE);
|
||||
bool isflux2 = (tempver==VERSION_FLUX2);
|
||||
bool isflux2k = (tempver==VERSION_FLUX2_KLEIN);
|
||||
bool is_ovis = (tempver==VERSION_OVIS_IMAGE);
|
||||
bool is_anima = (tempver==VERSION_ANIMA);
|
||||
bool conditioner_is_llm = (isqwenimg||iszimg||isflux2||isflux2k||is_ovis||is_anima);
|
||||
|
||||
//kcpp qol fallback: if qwen image, and they loaded the qwen2vl llm as t5 by mistake
|
||||
if((isqwenimg||iszimg||isflux2||isflux2k||is_ovis) && t5_path_fixed!="")
|
||||
if(conditioner_is_llm && t5_path_fixed!="")
|
||||
{
|
||||
if(clipl_path_fixed=="" && clipg_path_fixed=="")
|
||||
{
|
||||
|
|
@ -350,7 +366,7 @@ public:
|
|||
prefix = "cond_stage_model.transformer.";
|
||||
LOG_INFO("swap clip_vision from '%s'", clipl_path_fixed.c_str());
|
||||
}
|
||||
if(isqwenimg||iszimg||isflux2||isflux2k||is_ovis)
|
||||
if(conditioner_is_llm)
|
||||
{
|
||||
prefix = "text_encoders.llm.";
|
||||
LOG_INFO("swap llm from '%s'", clipl_path_fixed.c_str());
|
||||
|
|
@ -452,7 +468,7 @@ public:
|
|||
{
|
||||
to_replace = "taesd_f2.embd";
|
||||
}
|
||||
else if((sd_version_is_wan(version) && version != VERSION_WAN2_2_TI2V)||sd_version_is_qwen_image(version))
|
||||
else if((sd_version_is_wan(version) && version != VERSION_WAN2_2_TI2V)||sd_version_is_qwen_image(version)||sd_version_is_anima(version))
|
||||
{
|
||||
to_replace = "taesd_w21.embd";
|
||||
}
|
||||
|
|
@ -545,6 +561,7 @@ public:
|
|||
shift_factor = 0.1159f;
|
||||
} else if (sd_version_is_wan(version) ||
|
||||
sd_version_is_qwen_image(version) ||
|
||||
sd_version_is_anima(version) ||
|
||||
sd_version_is_flux2(version)) {
|
||||
scale_factor = 1.0f;
|
||||
shift_factor = 0.f;
|
||||
|
|
@ -675,6 +692,14 @@ public:
|
|||
"model.diffusion_model",
|
||||
version,
|
||||
sd_ctx_params->qwen_image_zero_cond_t);
|
||||
} else if (sd_version_is_anima(version)) {
|
||||
cond_stage_model = std::make_shared<AnimaConditioner>(clip_backend,
|
||||
offload_params_to_cpu,
|
||||
tensor_storage_map);
|
||||
diffusion_model = std::make_shared<AnimaModel>(backend,
|
||||
offload_params_to_cpu,
|
||||
tensor_storage_map,
|
||||
"model.diffusion_model");
|
||||
} else if (sd_version_is_z_image(version)) {
|
||||
cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
|
||||
offload_params_to_cpu,
|
||||
|
|
@ -737,7 +762,7 @@ public:
|
|||
}
|
||||
|
||||
if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
|
||||
first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
|
||||
offload_params_to_cpu,
|
||||
tensor_storage_map,
|
||||
|
|
@ -775,7 +800,7 @@ public:
|
|||
}
|
||||
}
|
||||
if (use_tiny_autoencoder || version == VERSION_SDXS) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
|
||||
tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend,
|
||||
offload_params_to_cpu,
|
||||
tensor_storage_map,
|
||||
|
|
@ -1051,6 +1076,7 @@ public:
|
|||
} else if (sd_version_is_sd3(version) ||
|
||||
sd_version_is_wan(version) ||
|
||||
sd_version_is_qwen_image(version) ||
|
||||
sd_version_is_anima(version) ||
|
||||
sd_version_is_z_image(version)) {
|
||||
pred_type = FLOW_PRED;
|
||||
if (sd_version_is_wan(version)) {
|
||||
|
|
@ -1234,6 +1260,18 @@ public:
|
|||
cond_stage_lora_models.clear();
|
||||
diffusion_lora_models.clear();
|
||||
first_stage_lora_models.clear();
|
||||
if (cond_stage_model) {
|
||||
cond_stage_model->set_weight_adapter(nullptr);
|
||||
}
|
||||
if (diffusion_model) {
|
||||
diffusion_model->set_weight_adapter(nullptr);
|
||||
}
|
||||
if (high_noise_diffusion_model) {
|
||||
high_noise_diffusion_model->set_weight_adapter(nullptr);
|
||||
}
|
||||
if (first_stage_model) {
|
||||
first_stage_model->set_weight_adapter(nullptr);
|
||||
}
|
||||
if (lora_state.empty()) {
|
||||
return;
|
||||
}
|
||||
|
|
@ -1650,7 +1688,7 @@ public:
|
|||
} else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
|
||||
latent_rgb_proj = flux_latent_rgb_proj;
|
||||
latent_rgb_bias = flux_latent_rgb_bias;
|
||||
} else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
|
||||
} else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
|
||||
latent_rgb_proj = wan_21_latent_rgb_proj;
|
||||
latent_rgb_bias = wan_21_latent_rgb_bias;
|
||||
} else {
|
||||
|
|
@ -2131,6 +2169,9 @@ public:
|
|||
shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t));
|
||||
LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma);
|
||||
timesteps_vec.assign(1, (float)shifted_t);
|
||||
} else if (sd_version_is_anima(version)) {
|
||||
// Anima uses normalized flow timesteps.
|
||||
timesteps_vec.assign(1, t / static_cast<float>(TIMESTEPS));
|
||||
} else if (sd_version_is_z_image(version)) {
|
||||
timesteps_vec.assign(1, 1000.f - t);
|
||||
} else {
|
||||
|
|
@ -2542,7 +2583,7 @@ public:
|
|||
}
|
||||
|
||||
void process_latent_in(ggml_tensor* latent) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) {
|
||||
int channel_dim = sd_version_is_flux2(version) ? 2 : 3;
|
||||
std::vector<float> latents_mean_vec;
|
||||
std::vector<float> latents_std_vec;
|
||||
|
|
@ -2581,7 +2622,7 @@ public:
|
|||
}
|
||||
|
||||
void process_latent_out(ggml_tensor* latent) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) {
|
||||
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) {
|
||||
int channel_dim = sd_version_is_flux2(version) ? 2 : 3;
|
||||
std::vector<float> latents_mean_vec;
|
||||
std::vector<float> latents_std_vec;
|
||||
|
|
@ -2659,7 +2700,7 @@ public:
|
|||
// TODO wan2.2 vae support?
|
||||
int64_t ne2;
|
||||
int64_t ne3;
|
||||
if (sd_version_is_qwen_image(version)) {
|
||||
if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
|
||||
ne2 = 1;
|
||||
ne3 = C * x->ne[3];
|
||||
} else {
|
||||
|
|
@ -2677,7 +2718,7 @@ public:
|
|||
result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
|
||||
}
|
||||
|
||||
if (sd_version_is_qwen_image(version)) {
|
||||
if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
|
||||
x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
|
||||
}
|
||||
|
||||
|
|
@ -2750,6 +2791,7 @@ public:
|
|||
ggml_tensor* latent;
|
||||
if (use_tiny_autoencoder ||
|
||||
sd_version_is_qwen_image(version) ||
|
||||
sd_version_is_anima(version) ||
|
||||
sd_version_is_wan(version) ||
|
||||
sd_version_is_flux2(version) ||
|
||||
version == VERSION_CHROMA_RADIANCE) {
|
||||
|
|
@ -2769,7 +2811,7 @@ public:
|
|||
if (!use_tiny_autoencoder) {
|
||||
process_latent_in(latent);
|
||||
}
|
||||
if (sd_version_is_qwen_image(version)) {
|
||||
if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
|
||||
latent = ggml_reshape_4d(work_ctx, latent, latent->ne[0], latent->ne[1], latent->ne[3], 1);
|
||||
}
|
||||
return latent;
|
||||
|
|
@ -2807,7 +2849,7 @@ public:
|
|||
}
|
||||
int64_t t0 = ggml_time_ms();
|
||||
if (!use_tiny_autoencoder) {
|
||||
if (sd_version_is_qwen_image(version)) {
|
||||
if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
|
||||
x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
|
||||
}
|
||||
process_latent_out(x);
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ struct UCacheConfig {
|
|||
bool adaptive_threshold = true;
|
||||
float early_step_multiplier = 0.5f;
|
||||
float late_step_multiplier = 1.5f;
|
||||
float relative_norm_gain = 1.6f;
|
||||
bool reset_error_on_compute = true;
|
||||
};
|
||||
|
||||
|
|
@ -45,14 +46,16 @@ struct UCacheState {
|
|||
bool has_output_prev_norm = false;
|
||||
bool has_relative_transformation_rate = false;
|
||||
float relative_transformation_rate = 0.0f;
|
||||
float cumulative_change_rate = 0.0f;
|
||||
float last_input_change = 0.0f;
|
||||
bool has_last_input_change = false;
|
||||
float output_change_ema = 0.0f;
|
||||
bool has_output_change_ema = false;
|
||||
int total_steps_skipped = 0;
|
||||
int current_step_index = -1;
|
||||
int steps_computed_since_active = 0;
|
||||
int expected_total_steps = 0;
|
||||
int consecutive_skipped_steps = 0;
|
||||
float accumulated_error = 0.0f;
|
||||
float reference_output_norm = 0.0f;
|
||||
|
||||
struct BlockMetrics {
|
||||
float sum_transformation_rate = 0.0f;
|
||||
|
|
@ -106,14 +109,16 @@ struct UCacheState {
|
|||
has_output_prev_norm = false;
|
||||
has_relative_transformation_rate = false;
|
||||
relative_transformation_rate = 0.0f;
|
||||
cumulative_change_rate = 0.0f;
|
||||
last_input_change = 0.0f;
|
||||
has_last_input_change = false;
|
||||
output_change_ema = 0.0f;
|
||||
has_output_change_ema = false;
|
||||
total_steps_skipped = 0;
|
||||
current_step_index = -1;
|
||||
steps_computed_since_active = 0;
|
||||
expected_total_steps = 0;
|
||||
consecutive_skipped_steps = 0;
|
||||
accumulated_error = 0.0f;
|
||||
reference_output_norm = 0.0f;
|
||||
block_metrics.reset();
|
||||
total_active_steps = 0;
|
||||
}
|
||||
|
|
@ -133,7 +138,8 @@ struct UCacheState {
|
|||
if (!initialized || sigmas.size() < 2) {
|
||||
return;
|
||||
}
|
||||
size_t n_steps = sigmas.size() - 1;
|
||||
size_t n_steps = sigmas.size() - 1;
|
||||
expected_total_steps = static_cast<int>(n_steps);
|
||||
|
||||
size_t start_step = static_cast<size_t>(config.start_percent * n_steps);
|
||||
size_t end_step = static_cast<size_t>(config.end_percent * n_steps);
|
||||
|
|
@ -207,11 +213,15 @@ struct UCacheState {
|
|||
}
|
||||
|
||||
int effective_total = estimated_total_steps;
|
||||
if (effective_total <= 0) {
|
||||
effective_total = expected_total_steps;
|
||||
}
|
||||
if (effective_total <= 0) {
|
||||
effective_total = std::max(20, steps_computed_since_active * 2);
|
||||
}
|
||||
|
||||
float progress = (effective_total > 0) ? (static_cast<float>(steps_computed_since_active) / effective_total) : 0.0f;
|
||||
progress = std::max(0.0f, std::min(1.0f, progress));
|
||||
|
||||
float multiplier = 1.0f;
|
||||
if (progress < 0.2f) {
|
||||
|
|
@ -309,17 +319,31 @@ struct UCacheState {
|
|||
|
||||
if (has_output_prev_norm && has_relative_transformation_rate &&
|
||||
last_input_change > 0.0f && output_prev_norm > 0.0f) {
|
||||
float approx_output_change_rate = (relative_transformation_rate * last_input_change) / output_prev_norm;
|
||||
accumulated_error = accumulated_error * config.error_decay_rate + approx_output_change_rate;
|
||||
float approx_output_change = relative_transformation_rate * last_input_change;
|
||||
float approx_output_change_rate;
|
||||
if (config.use_relative_threshold) {
|
||||
float base_scale = std::max(output_prev_norm, 1e-6f);
|
||||
float dyn_scale = has_output_change_ema
|
||||
? std::max(output_change_ema * std::max(1.0f, config.relative_norm_gain), 1e-6f)
|
||||
: base_scale;
|
||||
float scale = std::sqrt(base_scale * dyn_scale);
|
||||
approx_output_change_rate = approx_output_change / scale;
|
||||
} else {
|
||||
approx_output_change_rate = approx_output_change;
|
||||
}
|
||||
// Increase estimated error with skip horizon to avoid long extrapolation streaks
|
||||
approx_output_change_rate *= (1.0f + 0.50f * consecutive_skipped_steps);
|
||||
accumulated_error = accumulated_error * config.error_decay_rate + approx_output_change_rate;
|
||||
|
||||
float effective_threshold = get_adaptive_threshold();
|
||||
if (config.use_relative_threshold && reference_output_norm > 0.0f) {
|
||||
effective_threshold = effective_threshold * reference_output_norm;
|
||||
if (!config.use_relative_threshold && output_prev_norm > 0.0f) {
|
||||
effective_threshold = effective_threshold * output_prev_norm;
|
||||
}
|
||||
|
||||
if (accumulated_error < effective_threshold) {
|
||||
skip_current_step = true;
|
||||
total_steps_skipped++;
|
||||
consecutive_skipped_steps++;
|
||||
apply_cache(cond, input, output);
|
||||
return true;
|
||||
} else if (config.reset_error_on_compute) {
|
||||
|
|
@ -340,6 +364,8 @@ struct UCacheState {
|
|||
if (cond != anchor_condition) {
|
||||
return;
|
||||
}
|
||||
steps_computed_since_active++;
|
||||
consecutive_skipped_steps = 0;
|
||||
|
||||
size_t ne = static_cast<size_t>(ggml_nelements(input));
|
||||
float* in_data = (float*)input->data;
|
||||
|
|
@ -359,6 +385,14 @@ struct UCacheState {
|
|||
output_change /= static_cast<float>(ne);
|
||||
}
|
||||
}
|
||||
if (std::isfinite(output_change) && output_change > 0.0f) {
|
||||
if (!has_output_change_ema) {
|
||||
output_change_ema = output_change;
|
||||
has_output_change_ema = true;
|
||||
} else {
|
||||
output_change_ema = 0.8f * output_change_ema + 0.2f * output_change;
|
||||
}
|
||||
}
|
||||
|
||||
prev_output.resize(ne);
|
||||
for (size_t i = 0; i < ne; ++i) {
|
||||
|
|
@ -373,10 +407,6 @@ struct UCacheState {
|
|||
output_prev_norm = (ne > 0) ? (mean_abs / static_cast<float>(ne)) : 0.0f;
|
||||
has_output_prev_norm = output_prev_norm > 0.0f;
|
||||
|
||||
if (reference_output_norm == 0.0f) {
|
||||
reference_output_norm = output_prev_norm;
|
||||
}
|
||||
|
||||
if (has_last_input_change && last_input_change > 0.0f && output_change > 0.0f) {
|
||||
float rate = output_change / last_input_change;
|
||||
if (std::isfinite(rate)) {
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
#ifndef __UNET_HPP__
|
||||
#define __UNET_HPP__
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "common_block.hpp"
|
||||
#include "model.h"
|
||||
|
||||
/*==================================================== UnetModel =====================================================*/
|
||||
|
|
|
|||
|
|
@ -1,8 +1,7 @@
|
|||
#ifndef __VAE_HPP__
|
||||
#define __VAE_HPP__
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "common_block.hpp"
|
||||
|
||||
/*================================================== AutoEncoderKL ===================================================*/
|
||||
|
||||
|
|
|
|||
|
|
@ -5,9 +5,8 @@
|
|||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "common.hpp"
|
||||
#include "common_block.hpp"
|
||||
#include "flux.hpp"
|
||||
#include "ggml_extend.hpp"
|
||||
#include "rope.hpp"
|
||||
#include "vae.hpp"
|
||||
|
||||
|
|
|
|||
|
|
@ -346,69 +346,6 @@ namespace ZImage {
|
|||
blocks["final_layer"] = std::make_shared<FinalLayer>(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels);
|
||||
}
|
||||
|
||||
struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
int64_t W = x->ne[0];
|
||||
int64_t H = x->ne[1];
|
||||
|
||||
int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
|
||||
int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
|
||||
x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* patchify(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
// x: [N, C, H, W]
|
||||
// return: [N, h*w, patch_size*patch_size*C]
|
||||
int64_t N = x->ne[3];
|
||||
int64_t C = x->ne[2];
|
||||
int64_t H = x->ne[1];
|
||||
int64_t W = x->ne[0];
|
||||
int64_t p = z_image_params.patch_size;
|
||||
int64_t h = H / z_image_params.patch_size;
|
||||
int64_t w = W / z_image_params.patch_size;
|
||||
|
||||
GGML_ASSERT(h * p == H && w * p == W);
|
||||
|
||||
x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, p, p]
|
||||
x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p]
|
||||
x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3)); // [N, h*w, C, p*p]
|
||||
x = ggml_reshape_3d(ctx, x, C * p * p, w * h, N); // [N, h*w, p*p*C]
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x) {
|
||||
x = pad_to_patch_size(ctx, x);
|
||||
x = patchify(ctx->ggml_ctx, x);
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* unpatchify(struct ggml_context* ctx,
|
||||
struct ggml_tensor* x,
|
||||
int64_t h,
|
||||
int64_t w) {
|
||||
// x: [N, h*w, patch_size*patch_size*C]
|
||||
// return: [N, C, H, W]
|
||||
int64_t N = x->ne[2];
|
||||
int64_t C = x->ne[0] / z_image_params.patch_size / z_image_params.patch_size;
|
||||
int64_t H = h * z_image_params.patch_size;
|
||||
int64_t W = w * z_image_params.patch_size;
|
||||
int64_t p = z_image_params.patch_size;
|
||||
|
||||
GGML_ASSERT(C * p * p == x->ne[0]);
|
||||
|
||||
x = ggml_reshape_4d(ctx, x, C, p * p, w * h, N); // [N, h*w, p*p, C]
|
||||
x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3)); // [N, C, h*w, p*p]
|
||||
x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p]
|
||||
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p]
|
||||
x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p]
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward_core(GGMLRunnerContext* ctx,
|
||||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* timestep,
|
||||
|
|
@ -495,27 +432,22 @@ namespace ZImage {
|
|||
int64_t C = x->ne[2];
|
||||
int64_t N = x->ne[3];
|
||||
|
||||
auto img = process_img(ctx, x);
|
||||
int patch_size = z_image_params.patch_size;
|
||||
|
||||
auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size, false);
|
||||
uint64_t n_img_token = img->ne[1];
|
||||
|
||||
if (ref_latents.size() > 0) {
|
||||
for (ggml_tensor* ref : ref_latents) {
|
||||
ref = process_img(ctx, ref);
|
||||
ref = DiT::pad_and_patchify(ctx, ref, patch_size, patch_size, false);
|
||||
img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t h_len = ((H + (z_image_params.patch_size / 2)) / z_image_params.patch_size);
|
||||
int64_t w_len = ((W + (z_image_params.patch_size / 2)) / z_image_params.patch_size);
|
||||
|
||||
auto out = forward_core(ctx, img, timestep, context, pe);
|
||||
|
||||
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token); // [N, n_img_token, ph*pw*C]
|
||||
out = unpatchify(ctx->ggml_ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w]
|
||||
|
||||
// slice
|
||||
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H); // [N, C, H, W + pad_w]
|
||||
out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W); // [N, C, H, W]
|
||||
out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token); // [N, n_img_token, ph*pw*C]
|
||||
out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, patch_size, patch_size, false); // [N, C, H, W]
|
||||
|
||||
out = ggml_ext_scale(ctx->ggml_ctx, out, -1.f);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue