added support for flux kontext

2025-09-12 01:54:37 +00:00 · 2025-06-28 11:37:19 +08:00 · 2025-06-28 11:37:19 +08:00 · ed289227e5
commit ed289227e5
parent 0bd648ffa4
6 changed files with 202 additions and 70 deletions
--- a/otherarch/sdcpp/flux.hpp
+++ b/otherarch/sdcpp/flux.hpp
@ -672,11 +672,11 @@ namespace Flux {
        }

        // Generate IDs for image patches and text
-        std::vector<std::vector<float>> gen_ids(int h, int w, int patch_size, int bs, int context_len) {
+        std::vector<std::vector<float>> gen_ids(int h, int w, int patch_size, int index = 0) {
            int h_len = (h + (patch_size / 2)) / patch_size;
            int w_len = (w + (patch_size / 2)) / patch_size;

-            std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, 0.0));
+            std::vector<std::vector<float>> img_ids(h_len * w_len, std::vector<float>(3, (float)index));

            std::vector<float> row_ids = linspace(0, h_len - 1, h_len);
            std::vector<float> col_ids = linspace(0, w_len - 1, w_len);
@ -688,10 +688,22 @@ namespace Flux {
                }
            }

-            std::vector<std::vector<float>> img_ids_repeated(bs * img_ids.size(), std::vector<float>(3));
-            for (int i = 0; i < bs; ++i) {
-                for (int j = 0; j < img_ids.size(); ++j) {
-                    img_ids_repeated[i * img_ids.size() + j] = img_ids[j];
+            return img_ids;
+        }
+
+        // Generate positional embeddings
+        std::vector<float> gen_pe(std::vector<struct ggml_tensor*> imgs, struct ggml_tensor* context, int patch_size, int theta, const std::vector<int>& axes_dim) {
+            int context_len = context->ne[1];
+            int bs          = imgs[0]->ne[3];
+
+            std::vector<std::vector<float>> img_ids;
+            for (int i = 0; i < imgs.size(); i++) {
+                auto x = imgs[i];
+                if (x) {
+                    int h                                     = x->ne[1];
+                    int w                                     = x->ne[0];
+                    std::vector<std::vector<float>> img_ids_i = gen_ids(h, w, patch_size, i);
+                    img_ids.insert(img_ids.end(), img_ids_i.begin(), img_ids_i.end());
                }
            }

@ -702,17 +714,10 @@ namespace Flux {
                    ids[i * (context_len + img_ids.size()) + j] = txt_ids[j];
                }
                for (int j = 0; j < img_ids.size(); ++j) {
-                    ids[i * (context_len + img_ids.size()) + context_len + j] = img_ids_repeated[i * img_ids.size() + j];
+                    ids[i * (context_len + img_ids.size()) + context_len + j] = img_ids[j];
                }
            }

-            return ids;
-        }
-
-
-        // Generate positional embeddings
-        std::vector<float> gen_pe(int h, int w, int patch_size, int bs, int context_len, int theta, const std::vector<int>& axes_dim) {
-            std::vector<std::vector<float>> ids       = gen_ids(h, w, patch_size, bs, context_len);
            std::vector<std::vector<float>> trans_ids = transpose(ids);
            size_t pos_len                            = ids.size();
            int num_axes                              = axes_dim.size();
@ -925,7 +930,7 @@ namespace Flux {
        }

        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    struct ggml_tensor* x,
+                                    std::vector<struct ggml_tensor*> imgs,
                                    struct ggml_tensor* timestep,
                                    struct ggml_tensor* context,
                                    struct ggml_tensor* c_concat,
@ -933,7 +938,8 @@ namespace Flux {
                                    struct ggml_tensor* guidance,
                                    struct ggml_tensor* pe,
                                    struct ggml_tensor* arange   = NULL,
-                                    std::vector<int> skip_layers = std::vector<int>()) {
+                                    std::vector<int> skip_layers = std::vector<int>(),
+                                    SDVersion version            = VERSION_FLUX) {
            // Forward pass of DiT.
            // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
            // timestep: (N,) tensor of diffusion timesteps
@ -944,18 +950,31 @@ namespace Flux {
            // pe: (L, d_head/2, 2, 2)
            // return: (N, C, H, W)

+            auto x = imgs[0];
            GGML_ASSERT(x->ne[3] == 1);

            int64_t W          = x->ne[0];
            int64_t H          = x->ne[1];
            int64_t C          = x->ne[2];
            int64_t patch_size = 2;
-            int pad_h          = (patch_size - H % patch_size) % patch_size;
-            int pad_w          = (patch_size - W % patch_size) % patch_size;
-            x                  = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            int pad_h          = (patch_size - x->ne[0] % patch_size) % patch_size;
+            int pad_w          = (patch_size - x->ne[1] % patch_size) % patch_size;

            // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
-            auto img = patchify(ctx, x, patch_size);  // [N, h*w, C * patch_size * patch_size]
+            ggml_tensor* img = NULL;  // [N, h*w, C * patch_size * patch_size]
+            int64_t patchified_img_size;
+            for (auto& x : imgs) {
+                int pad_h          = (patch_size - x->ne[0] % patch_size) % patch_size;
+                int pad_w          = (patch_size - x->ne[1] % patch_size) % patch_size;
+                ggml_tensor* pad_x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);
+                pad_x              = patchify(ctx, pad_x, patch_size);
+                if (img) {
+                    img = ggml_concat(ctx, img, pad_x, 1);
+                } else {
+                    img                 = pad_x;
+                    patchified_img_size = img->ne[1];
+                }
+            }

            if (c_concat != NULL) {
                ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
@ -971,6 +990,7 @@ namespace Flux {
            }

            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, arange, skip_layers);  // [N, h*w, C * patch_size * patch_size]
+            out = ggml_cont(ctx, ggml_view_2d(ctx, out, out->ne[0], patchified_img_size, out->nb[1], 0));

            // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
            out = unpatchify(ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size, patch_size);  // [N, C, H + pad_h, W + pad_w]
@ -1056,7 +1076,8 @@ namespace Flux {
                                        struct ggml_tensor* c_concat,
                                        struct ggml_tensor* y,
                                        struct ggml_tensor* guidance,
-                                        std::vector<int> skip_layers = std::vector<int>()) {
+                                        std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                                        std::vector<int> skip_layers                  = std::vector<int>()) {
            GGML_ASSERT(x->ne[3] == 1);
            struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);

@ -1067,6 +1088,9 @@ namespace Flux {
            if (c_concat != NULL) {
                c_concat = to_backend(c_concat);
            }
+            for (auto &img : kontext_imgs){
+                img = to_backend(img);
+            }
            if (flux_params.is_chroma) {
                const char* SD_CHROMA_ENABLE_GUIDANCE = getenv("SD_CHROMA_ENABLE_GUIDANCE");
                bool disable_guidance                 = true;
@ -1107,8 +1131,10 @@ namespace Flux {
            if (flux_params.guidance_embed || flux_params.is_chroma) {
                guidance = to_backend(guidance);
            }
+            auto imgs = kontext_imgs;
+            imgs.insert(imgs.begin(), x);

-            pe_vec      = flux.gen_pe(x->ne[1], x->ne[0], 2, x->ne[3], context->ne[1], flux_params.theta, flux_params.axes_dim);
+            pe_vec      = flux.gen_pe(imgs, context, 2, flux_params.theta, flux_params.axes_dim);
            int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2;
            // LOG_DEBUG("pos_len %d", pos_len);
            auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len);
@ -1118,7 +1144,7 @@ namespace Flux {
            set_backend_tensor_data(pe, pe_vec.data());

            struct ggml_tensor* out = flux.forward(compute_ctx,
-                                                   x,
+                                                   imgs,
                                                   timesteps,
                                                   context,
                                                   c_concat,
@ -1140,16 +1166,17 @@ namespace Flux {
                     struct ggml_tensor* c_concat,
                     struct ggml_tensor* y,
                     struct ggml_tensor* guidance,
-                     struct ggml_tensor** output     = NULL,
-                     struct ggml_context* output_ctx = NULL,
-                     std::vector<int> skip_layers    = std::vector<int>()) {
+                     std::vector<struct ggml_tensor*> kontext_imgs = std::vector<struct ggml_tensor*>(),
+                     struct ggml_tensor** output                   = NULL,
+                     struct ggml_context* output_ctx               = NULL,
+                     std::vector<int> skip_layers                  = std::vector<int>()) {
            // x: [N, in_channels, h, w]
            // timesteps: [N, ]
            // context: [N, max_position, hidden_size]
            // y: [N, adm_in_channels] or [1, adm_in_channels]
            // guidance: [N, ]
            auto get_graph = [&]() -> struct ggml_cgraph* {
-                return build_graph(x, timesteps, context, c_concat, y, guidance, skip_layers);
+                return build_graph(x, timesteps, context, c_concat, y, guidance, kontext_imgs, skip_layers);
            };

            GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
@ -1189,7 +1216,7 @@ namespace Flux {
                struct ggml_tensor* out = NULL;

                int t0 = ggml_time_ms();
-                compute(8, x, timesteps, context, NULL, y, guidance, &out, work_ctx);
+                compute(8, x, timesteps, context, NULL, y, guidance, std::vector<struct ggml_tensor*>(), &out, work_ctx);
                int t1 = ggml_time_ms();

                print_ggml_tensor(out);