sd: sync to master-355-694f0d9

2026-05-18 23:49:46 +00:00 · 2025-11-10 20:14:50 -03:00 · 2025-11-10 20:14:50 -03:00 · 3318b73c94
commit 3318b73c94
parent 1cc4403cba
34 changed files with 3860 additions and 3216 deletions
--- a/8
+++ b/8
@@ -719,11 +719,11 @@ expose.o: expose.cpp expose.h model_adapter.cpp
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 # sd.cpp objects
-sdcpp_default.o: otherarch/sdcpp/sdtype_adapter.cpp otherarch/sdcpp/stable-diffusion.h otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/util.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c
+sdcpp_default.o: otherarch/sdcpp/sdtype_adapter.cpp otherarch/sdcpp/stable-diffusion.h otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/util.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c
 	$(CXX) $(CXXFLAGS) -c $< -o $@
-sdcpp_cublas.o: otherarch/sdcpp/sdtype_adapter.cpp otherarch/sdcpp/stable-diffusion.h otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/util.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c
+sdcpp_cublas.o: otherarch/sdcpp/sdtype_adapter.cpp otherarch/sdcpp/stable-diffusion.h otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/util.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c
 	$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
-sdcpp_vulkan.o: otherarch/sdcpp/sdtype_adapter.cpp otherarch/sdcpp/stable-diffusion.h otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/util.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c
+sdcpp_vulkan.o: otherarch/sdcpp/sdtype_adapter.cpp otherarch/sdcpp/stable-diffusion.h otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/util.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c
 	$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@


@@ -770,7 +770,7 @@ main: tools/main/main.cpp common/arg.cpp common/download.cpp build-info.h ggml.o
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 mainvk: tools/main/main.cpp common/arg.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 whispermain: otherarch/whispercpp/main.cpp otherarch/whispercpp/whisper.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/otherarch/sdcpp/clip.hpp
+++ b/otherarch/sdcpp/clip.hpp
@@ -451,16 +451,16 @@ public:
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, n_token, d_model]
        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);

        x = fc1->forward(ctx, x);
        if (use_gelu) {
-            x = ggml_gelu_inplace(ctx, x);
+            x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        } else {
-            x = ggml_gelu_quick_inplace(ctx, x);
+            x = ggml_gelu_quick_inplace(ctx->ggml_ctx, x);
        }
        x = fc2->forward(ctx, x);
        return x;
@@ -476,11 +476,12 @@ protected:
 public:
    CLIPLayer(int64_t d_model,
              int64_t n_head,
-              int64_t intermediate_size)
+              int64_t intermediate_size,
+              bool proj_in = false)
        : d_model(d_model),
          n_head(n_head),
          intermediate_size(intermediate_size) {
-        blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true));
+        blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true, proj_in));

        blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
        blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
@@ -488,15 +489,15 @@ public:
        blocks["mlp"] = std::shared_ptr<GGMLBlock>(new CLIPMLP(d_model, intermediate_size));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, bool mask = true) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, bool mask = true) {
        // x: [N, n_token, d_model]
        auto self_attn   = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
        auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
        auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
        auto mlp         = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);

-        x = ggml_add(ctx, x, self_attn->forward(ctx, backend, layer_norm1->forward(ctx, x), mask));
-        x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
+        x = ggml_add(ctx->ggml_ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
+        x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
        return x;
    }
 };
@@ -509,16 +510,16 @@ public:
    CLIPEncoder(int64_t n_layer,
                int64_t d_model,
                int64_t n_head,
-                int64_t intermediate_size)
+                int64_t intermediate_size,
+                bool proj_in = false)
        : n_layer(n_layer) {
        for (int i = 0; i < n_layer; i++) {
            std::string name = "layers." + std::to_string(i);
-            blocks[name]     = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size));
+            blocks[name]     = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size, proj_in));
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                int clip_skip = -1,
                                bool mask     = true) {
@@ -536,7 +537,7 @@ public:
            }
            std::string name = "layers." + std::to_string(i);
            auto layer       = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
-            x                = layer->forward(ctx, backend, x, mask);  // [N, n_token, d_model]
+            x                = layer->forward(ctx, x, mask);  // [N, n_token, d_model]
            // LOG_DEBUG("layer %d", i);
        }
        return x;
@@ -550,10 +551,10 @@ protected:
    int64_t num_positions;
    bool force_clip_f32;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        enum ggml_type token_wtype = GGML_TYPE_F32;
        if (!force_clip_f32) {
-            token_wtype = get_type(prefix + "token_embedding.weight", tensor_types, GGML_TYPE_F32);
+            token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
            if (!support_get_rows(token_wtype)) {
                token_wtype = GGML_TYPE_F32;
            }
@@ -578,7 +579,7 @@ public:
        return params["token_embedding.weight"];
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* custom_embed_weight) {
        // input_ids: [N, n_token]
@@ -586,12 +587,12 @@ public:
        auto position_embed_weight = params["position_embedding.weight"];

        GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
-        input_ids            = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
-        auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
-        token_embedding      = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
+        input_ids            = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
+        auto token_embedding = ggml_get_rows(ctx->ggml_ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
+        token_embedding      = ggml_reshape_3d(ctx->ggml_ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);

        // token_embedding + position_embedding
-        auto x = ggml_add(ctx,
+        auto x = ggml_add(ctx->ggml_ctx,
                          token_embedding,
                          position_embed_weight);  // [N, n_token, embed_dim]
        return x;
@@ -606,7 +607,8 @@ protected:
    int64_t image_size;
    int64_t num_patches;
    int64_t num_positions;
-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
+
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        enum ggml_type patch_wtype    = GGML_TYPE_F16;
        enum ggml_type class_wtype    = GGML_TYPE_F32;
        enum ggml_type position_wtype = GGML_TYPE_F32;
@@ -629,7 +631,7 @@ public:
        num_positions = num_patches + 1;
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) {
        // pixel_values: [N, num_channels, image_size, image_size]
        // return: [N, num_positions, embed_dim]
        GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@@ -641,18 +643,18 @@ public:
        // concat(patch_embedding, class_embedding) + position_embedding
        struct ggml_tensor* patch_embedding;
        int64_t N       = pixel_values->ne[3];
-        patch_embedding = ggml_ext_conv_2d(ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
-        patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N);                          // [N, embed_dim, num_patches]
-        patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3));                            // [N, num_patches, embed_dim]
-        patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N);                       // [N, num_patches, embed_dim, 1]
+        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
+        patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N);                          // [N, embed_dim, num_patches]
+        patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3));                  // [N, num_patches, embed_dim]
+        patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N);                       // [N, num_patches, embed_dim, 1]

-        struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N);
-        class_embedding                     = ggml_repeat(ctx, class_embed_weight, class_embedding);      // [N, embed_dim]
-        class_embedding                     = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]
+        struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
+        class_embedding                     = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding);      // [N, embed_dim]
+        class_embedding                     = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]

-        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
-        x                     = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);   // [N, num_positions, embed_dim]
-        x                     = ggml_add(ctx, x, position_embed_weight);
+        struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
+        x                     = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N);   // [N, num_positions, embed_dim]
+        x                     = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
        return x;  // [N, num_positions, embed_dim]
    }
 };
@@ -669,7 +671,7 @@ enum CLIPVersion {

 class CLIPTextModel : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        if (version == OPEN_CLIP_VIT_BIGG_14) {
            enum ggml_type wtype      = GGML_TYPE_F32;
            params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
@@ -690,7 +692,8 @@ public:

    CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                  bool with_final_ln  = true,
-                  bool force_clip_f32 = false)
+                  bool force_clip_f32 = false,
+                  bool proj_in        = false)
        : version(version), with_final_ln(with_final_ln) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size       = 1024;
@@ -705,7 +708,7 @@ public:
        }

        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
-        blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+        blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
        blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
    }

@@ -714,8 +717,7 @@ public:
        return embeddings->get_token_embed_weight();
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* tkn_embeddings,
                                size_t max_token_idx = 0,
@@ -727,16 +729,16 @@ public:
        auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);

        auto x = embeddings->forward(ctx, input_ids, tkn_embeddings);  // [N, n_token, hidden_size]
-        x      = encoder->forward(ctx, backend, x, return_pooled ? -1 : clip_skip, true);
+        x      = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
        if (return_pooled || with_final_ln) {
            x = final_layer_norm->forward(ctx, x);
        }

        if (return_pooled) {
            auto text_projection = params["text_projection"];
-            ggml_tensor* pooled  = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
+            ggml_tensor* pooled  = ggml_view_1d(ctx->ggml_ctx, x, hidden_size, x->nb[1] * max_token_idx);
            if (text_projection != nullptr) {
-                pooled = ggml_ext_linear(ctx, pooled, text_projection, nullptr);
+                pooled = ggml_ext_linear(ctx->ggml_ctx, pooled, text_projection, nullptr);
            } else {
                LOG_DEBUG("identity projection");
            }
@@ -760,7 +762,7 @@ public:
    int32_t n_layer           = 24;

 public:
-    CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
+    CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size       = 1280;
            intermediate_size = 5120;
@@ -775,12 +777,11 @@ public:

        blocks["embeddings"]     = std::shared_ptr<GGMLBlock>(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
        blocks["pre_layernorm"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
-        blocks["encoder"]        = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+        blocks["encoder"]        = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
        blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* pixel_values,
                                bool return_pooled = true,
                                int clip_skip      = -1) {
@@ -792,14 +793,14 @@ public:

        auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
        x      = pre_layernorm->forward(ctx, x);
-        x      = encoder->forward(ctx, backend, x, clip_skip, false);
+        x      = encoder->forward(ctx, x, clip_skip, false);
        // print_ggml_tensor(x, true, "ClipVisionModel x: ");
        auto last_hidden_state = x;
        x                      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]

        GGML_ASSERT(x->ne[3] == 1);
        if (return_pooled) {
-            ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
+            ggml_tensor* pooled = ggml_cont(ctx->ggml_ctx, ggml_view_2d(ctx->ggml_ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
            return pooled;  // [N, hidden_size]
        } else {
            // return x;  // [N, n_token, hidden_size]
@@ -814,8 +815,8 @@ protected:
    int64_t out_features;
    bool transpose_weight;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
-        enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
        if (transpose_weight) {
            params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
        } else {
@@ -831,12 +832,12 @@ public:
          out_features(out_features),
          transpose_weight(transpose_weight) {}

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        struct ggml_tensor* w = params["weight"];
        if (transpose_weight) {
-            w = ggml_cont(ctx, ggml_transpose(ctx, w));
+            w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
        }
-        return ggml_ext_linear(ctx, x, w, nullptr);
+        return ggml_ext_linear(ctx->ggml_ctx, x, w, nullptr);
    }
 };

@@ -848,7 +849,8 @@ public:

 public:
    CLIPVisionModelProjection(CLIPVersion version   = OPENAI_CLIP_VIT_L_14,
-                              bool transpose_proj_w = false) {
+                              bool transpose_proj_w = false,
+                              bool proj_in          = false) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size    = 1280;
            projection_dim = 1024;
@@ -856,12 +858,11 @@ public:
            hidden_size = 1664;
        }

-        blocks["vision_model"]      = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version));
+        blocks["vision_model"]      = std::shared_ptr<GGMLBlock>(new CLIPVisionModel(version, proj_in));
        blocks["visual_projection"] = std::shared_ptr<GGMLBlock>(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* pixel_values,
                                bool return_pooled = true,
                                int clip_skip      = -1) {
@@ -870,7 +871,7 @@ public:
        auto vision_model      = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
        auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);

-        auto x = vision_model->forward(ctx, backend, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]
+        auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip);  // [N, hidden_size] or [N, n_token, hidden_size]

        if (return_pooled) {
            x = visual_projection->forward(ctx, x);  // [N, projection_dim]
@@ -885,13 +886,24 @@ struct CLIPTextModelRunner : public GGMLRunner {

    CLIPTextModelRunner(ggml_backend_t backend,
                        bool offload_params_to_cpu,
-                        const String2GGMLType& tensor_types,
+                        const String2TensorStorage& tensor_storage_map,
                        const std::string prefix,
                        CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                        bool with_final_ln  = true,
                        bool force_clip_f32 = false)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
-        model.init(params_ctx, tensor_types, prefix);
+        : GGMLRunner(backend, offload_params_to_cpu) {
+        bool proj_in = false;
+        for (const auto& [name, tensor_storage] : tensor_storage_map) {
+            if (!starts_with(name, prefix)) {
+                continue;
+            }
+            if (contains(name, "self_attn.in_proj")) {
+                proj_in = true;
+                break;
+            }
+        }
+        model = CLIPTextModel(version, with_final_ln, force_clip_f32, proj_in);
+        model.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
@@ -902,8 +914,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
        model.get_param_tensors(tensors, prefix);
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* embeddings,
                                size_t max_token_idx = 0,
@@ -913,10 +924,10 @@ struct CLIPTextModelRunner : public GGMLRunner {
        size_t n_token = input_ids->ne[0];
        if (input_ids->ne[0] > model.n_token) {
            GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
-            input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
+            input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
        }

-        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
+        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
    }

    struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
@@ -943,7 +954,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
        }

-        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
+        auto runner_ctx = get_context();
+
+        struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);

        ggml_build_forward_expand(gf, hidden_states);

--- a/otherarch/sdcpp/common.hpp
+++ b/otherarch/sdcpp/common.hpp
@@ -23,12 +23,12 @@ public:
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        if (vae_downsample) {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

-            x = ggml_pad(ctx, x, 1, 1, 0, 0);
+            x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
            x = conv->forward(ctx, x);
        } else {
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
@@ -52,12 +52,12 @@ public:
        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

-        x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
-        x = conv->forward(ctx, x);                             // [N, out_channels, h*2, w*2]
+        x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
+        x = conv->forward(ctx, x);                                       // [N, out_channels, h*2, w*2]
        return x;
    }
 };
@@ -121,7 +121,7 @@ public:
        }
    }

-    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
        // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
        // [N, c, t, h, w] => [N, c, t, h * w]
        // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@@ -137,32 +137,32 @@ public:

        // in_layers
        auto h = in_layers_0->forward(ctx, x);
-        h      = ggml_silu_inplace(ctx, h);
+        h      = ggml_silu_inplace(ctx->ggml_ctx, h);
        h      = in_layers_2->forward(ctx, h);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]

        // emb_layers
        if (!skip_t_emb) {
            auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);

-            auto emb_out = ggml_silu(ctx, emb);
+            auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
            emb_out      = emb_layer_1->forward(ctx, emb_out);  // [N, out_channels] if dims == 2 else [N, t, out_channels]

            if (dims == 2) {
-                emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]);  // [N, out_channels, 1, 1]
+                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]);  // [N, out_channels, 1, 1]
            } else {
-                emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]);  // [N, t, out_channels, 1]
+                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]);  // [N, t, out_channels, 1]
                if (exchange_temb_dims) {
                    // emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
-                    emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3));  // [N, out_channels, t, 1]
+                    emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3));  // [N, out_channels, t, 1]
                }
            }

-            h = ggml_add(ctx, h, emb_out);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+            h = ggml_add(ctx->ggml_ctx, h, emb_out);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
        }

        // out_layers
        h = out_layers_0->forward(ctx, h);
-        h = ggml_silu_inplace(ctx, h);
+        h = ggml_silu_inplace(ctx->ggml_ctx, h);
        // dropout, skip for inference
        h = out_layers_3->forward(ctx, h);

@@ -172,7 +172,7 @@ public:
            x                    = skip_connection->forward(ctx, x);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
        }

-        h = ggml_add(ctx, h, x);
+        h = ggml_add(ctx->ggml_ctx, h, x);
        return h;  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
    }
 };
@@ -182,8 +182,8 @@ protected:
    int64_t dim_in;
    int64_t dim_out;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override {
-        enum ggml_type wtype      = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
+        enum ggml_type wtype      = get_type(prefix + "proj.weight", tensor_storage_map, GGML_TYPE_F32);
        enum ggml_type bias_wtype = GGML_TYPE_F32;
        params["proj.weight"]     = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
        params["proj.bias"]       = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
@ -193,24 +193,24 @@ public:
    GEGLU(int64_t dim_in, int64_t dim_out)
        : dim_in(dim_in), dim_out(dim_out) {}

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [ne3, ne2, ne1, dim_in]
        // return: [ne3, ne2, ne1, dim_out]
        struct ggml_tensor* w = params["proj.weight"];
        struct ggml_tensor* b = params["proj.bias"];

-        auto x_w    = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0);                        // [dim_out, dim_in]
-        auto x_b    = ggml_view_1d(ctx, b, b->ne[0] / 2, 0);                                            // [dim_out, dim_in]
-        auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2);  // [dim_out, ]
-        auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2);                      // [dim_out, ]
+        auto x_w    = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0);                        // [dim_out, dim_in]
+        auto x_b    = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, 0);                                            // [dim_out, ]
+        auto gate_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2);  // [dim_out, dim_in]
+        auto gate_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2);                      // [dim_out, ]

        auto x_in = x;
-        x         = ggml_ext_linear(ctx, x_in, x_w, x_b);        // [ne3, ne2, ne1, dim_out]
-        auto gate = ggml_ext_linear(ctx, x_in, gate_w, gate_b);  // [ne3, ne2, ne1, dim_out]
+        x         = ggml_ext_linear(ctx->ggml_ctx, x_in, x_w, x_b);        // [ne3, ne2, ne1, dim_out]
+        auto gate = ggml_ext_linear(ctx->ggml_ctx, x_in, gate_w, gate_b);  // [ne3, ne2, ne1, dim_out]

-        gate = ggml_gelu_inplace(ctx, gate);
+        gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);

-        x = ggml_mul(ctx, x, gate);  // [ne3, ne2, ne1, dim_out]
+        x = ggml_mul(ctx->ggml_ctx, x, gate);  // [ne3, ne2, ne1, dim_out]

        return x;
    }
@ -222,13 +222,13 @@ public:
        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [ne3, ne2, ne1, dim_in]
        // return: [ne3, ne2, ne1, dim_out]
        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);

        x = proj->forward(ctx, x);
-        x = ggml_gelu_inplace(ctx, x);
+        x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        return x;
    }
 };
@ -262,7 +262,7 @@ public:
        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [ne3, ne2, ne1, dim]
        // return: [ne3, ne2, ne1, dim_out]

@ -281,19 +281,16 @@ protected:
    int64_t context_dim;
    int64_t n_head;
    int64_t d_head;
-    bool flash_attn;

 public:
    CrossAttention(int64_t query_dim,
                   int64_t context_dim,
                   int64_t n_head,
-                   int64_t d_head,
-                   bool flash_attn = false)
+                   int64_t d_head)
        : n_head(n_head),
          d_head(d_head),
          query_dim(query_dim),
-          context_dim(context_dim),
-          flash_attn(flash_attn) {
+          context_dim(context_dim) {
        int64_t inner_dim = d_head * n_head;

        blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
@ -304,8 +301,7 @@ public:
        // to_out_1 is nn.Dropout(), skip for inference
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* context) {
        // x: [N, n_token, query_dim]
@ -325,7 +321,7 @@ public:
        auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
        auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]

-        x = ggml_ext_attention_ext(ctx, backend, q, k, v, n_head, nullptr, false, false, flash_attn);  // [N, n_token, inner_dim]
+        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, inner_dim]

        x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
        return x;
@ -343,16 +339,15 @@ public:
                          int64_t n_head,
                          int64_t d_head,
                          int64_t context_dim,
-                          bool ff_in      = false,
-                          bool flash_attn = false)
+                          bool ff_in = false)
        : n_head(n_head), d_head(d_head), ff_in(ff_in) {
        // disable_self_attn is always False
        // disable_temporal_crossattention is always False
        // switch_temporal_ca_to_sa is always False
        // inner_dim is always None or equal to dim
        // gated_ff is always True
-        blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
-        blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
+        blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
+        blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
        blocks["ff"]    = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
@ -364,8 +359,7 @@ public:
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* context) {
        // x: [N, n_token, query_dim]
@ -387,21 +381,21 @@ public:
            x           = norm_in->forward(ctx, x);
            x           = ff_in->forward(ctx, x);
            // self.is_res is always True
-            x = ggml_add(ctx, x, x_skip);
+            x = ggml_add(ctx->ggml_ctx, x, x_skip);
        }

        auto r = x;
        x      = norm1->forward(ctx, x);
-        x      = attn1->forward(ctx, backend, x, x);  // self-attention
-        x      = ggml_add(ctx, x, r);
+        x      = attn1->forward(ctx, x, x);  // self-attention
+        x      = ggml_add(ctx->ggml_ctx, x, r);
        r      = x;
        x      = norm2->forward(ctx, x);
-        x      = attn2->forward(ctx, backend, x, context);  // cross-attention
-        x      = ggml_add(ctx, x, r);
+        x      = attn2->forward(ctx, x, context);  // cross-attention
+        x      = ggml_add(ctx->ggml_ctx, x, r);
        r      = x;
        x      = norm3->forward(ctx, x);
        x      = ff->forward(ctx, x);
-        x      = ggml_add(ctx, x, r);
+        x      = ggml_add(ctx->ggml_ctx, x, r);

        return x;
    }
@ -414,6 +408,23 @@ protected:
    int64_t d_head;
    int64_t depth       = 1;    // 1
    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2
+    bool use_linear     = false;
+
+    // Reconciles proj_in/proj_out with the layout of the stored weights.
+    // Looks up `<prefix>proj_out.weight` in tensor_storage_map:
+    //   - a 4-D entry while use_linear is set replaces both projections with
+    //     1x1 Conv2d blocks and clears use_linear;
+    //   - a 2-D entry while use_linear is unset replaces both projections with
+    //     Linear blocks and sets use_linear.
+    // Any other case, including a missing entry, keeps the blocks as they are.
+    // `override` lets the compiler verify that this signature still matches the
+    // virtual it is meant to replace (the sibling init_params overrides use it too).
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        const auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter == tensor_storage_map.end()) {
+            return;  // weight not listed: nothing to reconcile
+        }
+
+        const int64_t inner_dim = n_head * d_head;
+        const auto n_dims       = iter->second.n_dims;
+        if (n_dims == 4 && use_linear) {
+            // stored as a conv kernel, but the block was built with Linear projections
+            use_linear         = false;
+            blocks["proj_in"]  = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
+            blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
+        } else if (n_dims == 2 && !use_linear) {
+            // stored as a matrix, but the block was built with Conv2d projections
+            use_linear         = true;
+            blocks["proj_in"]  = std::make_shared<Linear>(in_channels, inner_dim);
+            blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
+        }
+    }

 public:
    SpatialTransformer(int64_t in_channels,
@ -421,35 +432,42 @@ public:
                       int64_t d_head,
                       int64_t depth,
                       int64_t context_dim,
-                       bool flash_attn = false)
+                       bool use_linear)
        : in_channels(in_channels),
          n_head(n_head),
          d_head(d_head),
          depth(depth),
-          context_dim(context_dim) {
-        // We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
+          context_dim(context_dim),
+          use_linear(use_linear) {
        // disable_self_attn is always False
        int64_t inner_dim = n_head * d_head;  // in_channels
        blocks["norm"]    = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+        if (use_linear) {
+            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
+        } else {
+            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+        }

        for (int i = 0; i < depth; i++) {
            std::string name = "transformer_blocks." + std::to_string(i);
-            blocks[name]     = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
+            blocks[name]     = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
        }

-        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
+        if (use_linear) {
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
+        } else {
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
+        }
    }

-    virtual struct ggml_tensor* forward(struct ggml_context* ctx,
-                                        ggml_backend_t backend,
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                        struct ggml_tensor* x,
                                        struct ggml_tensor* context) {
        // x: [N, in_channels, h, w]
        // context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
        auto norm     = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
-        auto proj_in  = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
-        auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
+        auto proj_in  = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
+        auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);

        auto x_in         = x;
        int64_t n         = x->ne[3];
@ -458,32 +476,45 @@ public:
        int64_t inner_dim = n_head * d_head;

        x = norm->forward(ctx, x);
-        x = proj_in->forward(ctx, x);  // [N, inner_dim, h, w]
-
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
-        x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n);      // [N, h * w, inner_dim]
+        if (use_linear) {
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, in_channels]
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, in_channels]; reshape uses inner_dim, so it relies on in_channels == inner_dim
+            x = proj_in->forward(ctx, x);                                              // [N, h * w, inner_dim]
+        } else {
+            x = proj_in->forward(ctx, x);                                              // [N, inner_dim, h, w]
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, inner_dim]
+        }

        for (int i = 0; i < depth; i++) {
            std::string name       = "transformer_blocks." + std::to_string(i);
            auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);

-            x = transformer_block->forward(ctx, backend, x, context);
+            x = transformer_block->forward(ctx, x, context);
        }

-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
-        x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n);       // [N, inner_dim, h, w]
+        if (use_linear) {
+            // proj_out
+            x = proj_out->forward(ctx, x);  // [N, h * w, in_channels]

-        // proj_out
-        x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, in_channels, h * w]
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, in_channels, h, w]; reshape uses inner_dim, so it relies on in_channels == inner_dim
+        } else {
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, inner_dim, h, w]

-        x = ggml_add(ctx, x, x_in);
+            // proj_out
+            x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]
+        }
+
+        x = ggml_add(ctx->ggml_ctx, x, x_in);
        return x;
    }
 };

 class AlphaBlender : public GGMLBlock {
 protected:
-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
        // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
        enum ggml_type wtype = GGML_TYPE_F32;
        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
@ -503,14 +534,14 @@ public:
        // since mix_factor.shape is [1,], we don't need rearrange using  rearrange_pattern
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x_spatial,
                                struct ggml_tensor* x_temporal) {
        // image_only_indicator is always tensor([0.])
        float alpha = get_alpha();
-        auto x      = ggml_add(ctx,
-                               ggml_scale(ctx, x_spatial, alpha),
-                               ggml_scale(ctx, x_temporal, 1.0f - alpha));
+        auto x      = ggml_add(ctx->ggml_ctx,
+                               ggml_scale(ctx->ggml_ctx, x_spatial, alpha),
+                               ggml_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
        return x;
    }
 };
@ -528,7 +559,7 @@ public:
        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* emb,
                                int num_video_frames) {
@ -546,18 +577,18 @@ public:
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

-        x          = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
-        x          = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
+        x          = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B);                     // (b t) c h w -> b t c (h w)
+        x          = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        auto x_mix = x;

-        emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...
+        emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...

        x = time_stack->forward(ctx, x, emb);  // b t c (h w)

        x = time_mixer->forward(ctx, x_mix, x);  // b t c (h w)

-        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
-        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
+        x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B);                     // b t c (h w) -> (b t) c h w

        return x;
    }
--- a/otherarch/sdcpp/conditioner.hpp
+++ b/otherarch/sdcpp/conditioner.hpp
@ -63,19 +63,19 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

    FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
                                      bool offload_params_to_cpu,
-                                      const String2GGMLType& tensor_types,
+                                      const String2TensorStorage& tensor_storage_map,
                                      const std::string& embd_dir,
                                      SDVersion version = VERSION_SD1,
                                      PMVersion pv      = PM_VERSION_1)
        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
        bool force_clip_f32 = embd_dir.size() > 0;
        if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
        } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
        } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
        }
    }

@ -111,7 +111,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
    bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
        // the order matters
        ModelLoader model_loader;
-        if (!model_loader.init_from_file(embd_path)) {
+        if (!model_loader.init_from_file_and_convert_name(embd_path)) {
            LOG_ERROR("embedding '%s' failed", embd_name.c_str());
            return false;
        }
@ -623,9 +623,21 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {

    FrozenCLIPVisionEmbedder(ggml_backend_t backend,
                             bool offload_params_to_cpu,
-                             const String2GGMLType& tensor_types = {})
-        : vision_model(OPEN_CLIP_VIT_H_14), GGMLRunner(backend, offload_params_to_cpu) {
-        vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
+                             const String2TensorStorage& tensor_storage_map = {})
+        : GGMLRunner(backend, offload_params_to_cpu) {
+        std::string prefix = "cond_stage_model.transformer";
+        bool proj_in       = false;
+        for (const auto& [name, tensor_storage] : tensor_storage_map) {
+            if (!starts_with(name, prefix)) {
+                continue;
+            }
+            if (contains(name, "self_attn.in_proj")) {
+                proj_in = true;
+                break;
+            }
+        }
+        vision_model = CLIPVisionModelProjection(OPEN_CLIP_VIT_H_14, false, proj_in);
+        vision_model.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
@ -641,7 +653,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {

        pixel_values = to_backend(pixel_values);

-        struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, runtime_backend, pixel_values, return_pooled, clip_skip);
+        auto runner_ctx = get_context();
+
+        struct ggml_tensor* hidden_states = vision_model.forward(&runner_ctx, pixel_values, return_pooled, clip_skip);

        ggml_build_forward_expand(gf, hidden_states);

@ -671,12 +685,12 @@ struct SD3CLIPEmbedder : public Conditioner {

    SD3CLIPEmbedder(ggml_backend_t backend,
                    bool offload_params_to_cpu,
-                    const String2GGMLType& tensor_types = {})
+                    const String2TensorStorage& tensor_storage_map = {})
        : clip_g_tokenizer(0) {
        bool use_clip_l = false;
        bool use_clip_g = false;
        bool use_t5     = false;
-        for (auto pair : tensor_types) {
+        // Scan the tensor names to find out which text encoders the checkpoint
+        // provides. Entries are bound by const reference so the key string and
+        // its storage record are not copied on every iteration.
+        for (const auto& pair : tensor_storage_map) {
            if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
                use_clip_l = true;
            } else if (pair.first.find("text_encoders.clip_g") != std::string::npos) {
@ -690,13 +704,13 @@ struct SD3CLIPEmbedder : public Conditioner {
            return;
        }
        if (use_clip_l) {
-            clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
        }
        if (use_clip_g) {
-            clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
        }
        if (use_t5) {
-            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
+            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
        }
    }

@ -1080,10 +1094,10 @@ struct FluxCLIPEmbedder : public Conditioner {

    FluxCLIPEmbedder(ggml_backend_t backend,
                     bool offload_params_to_cpu,
-                     const String2GGMLType& tensor_types = {}) {
+                     const String2TensorStorage& tensor_storage_map = {}) {
        bool use_clip_l = false;
        bool use_t5     = false;
-        for (auto pair : tensor_types) {
+        for (auto pair : tensor_storage_map) {
            if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
                use_clip_l = true;
            } else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
@ -1097,12 +1111,12 @@ struct FluxCLIPEmbedder : public Conditioner {
        }

        if (use_clip_l) {
-            clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+            clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
        } else {
            LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
        }
        if (use_t5) {
-            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
+            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
        } else {
            LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
        }
@ -1340,13 +1354,13 @@ struct T5CLIPEmbedder : public Conditioner {

    T5CLIPEmbedder(ggml_backend_t backend,
                   bool offload_params_to_cpu,
-                   const String2GGMLType& tensor_types = {},
-                   bool use_mask                       = false,
-                   int mask_pad                        = 1,
-                   bool is_umt5                        = false)
+                   const String2TensorStorage& tensor_storage_map = {},
+                   bool use_mask                                  = false,
+                   int mask_pad                                   = 1,
+                   bool is_umt5                                   = false)
        : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
        bool use_t5 = false;
-        for (auto pair : tensor_types) {
+        // Look for any t5xxl tensor name. Entries are bound by const reference
+        // so the key string and its storage record are not copied per iteration.
+        for (const auto& pair : tensor_storage_map) {
            if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
                use_t5 = true;
            }
@ -1356,7 +1370,7 @@ struct T5CLIPEmbedder : public Conditioner {
            LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
            return;
        } else {
-            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
+            t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5);
        }
    }

@ -1547,12 +1561,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {

    Qwen2_5_VLCLIPEmbedder(ggml_backend_t backend,
                           bool offload_params_to_cpu,
-                           const String2GGMLType& tensor_types = {},
-                           const std::string prefix            = "",
-                           bool enable_vision                  = false) {
+                           const String2TensorStorage& tensor_storage_map = {},
+                           const std::string prefix                       = "",
+                           bool enable_vision                             = false) {
        qwenvl = std::make_shared<Qwen::Qwen2_5_VLRunner>(backend,
                                                          offload_params_to_cpu,
-                                                          tensor_types,
+                                                          tensor_storage_map,
                                                          "text_encoders.qwen2vl",
                                                          enable_vision);
    }
--- a/otherarch/sdcpp/control.hpp
+++ b/otherarch/sdcpp/control.hpp
@ -27,6 +27,7 @@ protected:
    int num_heads                          = 8;
    int num_head_channels                  = -1;   // channels // num_heads
    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+    bool use_linear_projection             = false;

 public:
    int model_channels  = 320;
@ -82,7 +83,7 @@ public:
                                       int64_t d_head,
                                       int64_t depth,
                                       int64_t context_dim) -> SpatialTransformer* {
-            return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
+            return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
        };

        auto make_zero_conv = [&](int64_t channels) {
@ -165,7 +166,7 @@ public:
    }

    struct ggml_tensor* resblock_forward(std::string name,
-                                         struct ggml_context* ctx,
+                                         GGMLRunnerContext* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb) {
        auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
@ -173,15 +174,14 @@ public:
    }

    struct ggml_tensor* attention_layer_forward(std::string name,
-                                                struct ggml_context* ctx,
-                                                ggml_backend_t backend,
+                                                GGMLRunnerContext* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context) {
        auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
-        return block->forward(ctx, backend, x, context);
+        return block->forward(ctx, x, context);
    }

-    struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
+    struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
                                                 struct ggml_tensor* hint,
                                                 struct ggml_tensor* emb,
                                                 struct ggml_tensor* context) {
@ -193,14 +193,13 @@ public:

                h = block->forward(ctx, h);
            } else {
-                h = ggml_silu_inplace(ctx, h);
+                h = ggml_silu_inplace(ctx->ggml_ctx, h);
            }
        }
        return h;
    }

-    std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
-                                             ggml_backend_t backend,
+    std::vector<struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                             struct ggml_tensor* x,
                                             struct ggml_tensor* hint,
                                             struct ggml_tensor* guided_hint,
@ -213,13 +212,13 @@ public:
        // y: [N, adm_in_channels] or [1, adm_in_channels]
        if (context != nullptr) {
            if (context->ne[2] != x->ne[3]) {
-                context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
+                context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }

        if (y != nullptr) {
            if (y->ne[1] != x->ne[3]) {
-                y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
+                y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }

@ -230,10 +229,10 @@ public:

        auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);

-        auto t_emb = ggml_ext_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]
+        auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels);  // [N, model_channels]

        auto emb = time_embed_0->forward(ctx, t_emb);
-        emb      = ggml_silu_inplace(ctx, emb);
+        emb      = ggml_silu_inplace(ctx->ggml_ctx, emb);
        emb      = time_embed_2->forward(ctx, emb);  // [N, time_embed_dim]

        // SDXL/SVD
@ -242,10 +241,10 @@ public:
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);

            auto label_emb = label_embed_0->forward(ctx, y);
-            label_emb      = ggml_silu_inplace(ctx, label_emb);
+            label_emb      = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
            label_emb      = label_embed_2->forward(ctx, label_emb);  // [N, time_embed_dim]

-            emb = ggml_add(ctx, emb, label_emb);  // [N, time_embed_dim]
+            emb = ggml_add(ctx->ggml_ctx, emb, label_emb);  // [N, time_embed_dim]
        }

        std::vector<struct ggml_tensor*> outs;
@ -259,7 +258,7 @@ public:

        // input block 0
        auto h = input_blocks_0_0->forward(ctx, x);
-        h      = ggml_add(ctx, h, guided_hint);
+        h      = ggml_add(ctx->ggml_ctx, h, guided_hint);
        outs.push_back(zero_convs_0->forward(ctx, h));

        // input block 1-11
@ -274,7 +273,7 @@ public:
                h                = resblock_forward(name, ctx, h, emb);  // [N, mult*model_channels, h, w]
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                    h                = attention_layer_forward(name, ctx, backend, h, context);  // [N, mult*model_channels, h, w]
+                    h                = attention_layer_forward(name, ctx, h, context);  // [N, mult*model_channels, h, w]
                }

                auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
@ -298,9 +297,9 @@ public:
        // [N, 4*model_channels, h/8, w/8]

        // middle_block
-        h = resblock_forward("middle_block.0", ctx, h, emb);                      // [N, 4*model_channels, h/8, w/8]
-        h = attention_layer_forward("middle_block.1", ctx, backend, h, context);  // [N, 4*model_channels, h/8, w/8]
-        h = resblock_forward("middle_block.2", ctx, h, emb);                      // [N, 4*model_channels, h/8, w/8]
+        h = resblock_forward("middle_block.0", ctx, h, emb);             // [N, 4*model_channels, h/8, w/8]
+        h = attention_layer_forward("middle_block.1", ctx, h, context);  // [N, 4*model_channels, h/8, w/8]
+        h = resblock_forward("middle_block.2", ctx, h, emb);             // [N, 4*model_channels, h/8, w/8]

        // out
        outs.push_back(middle_block_out->forward(ctx, h));
@ -320,21 +319,10 @@ struct ControlNet : public GGMLRunner {

    ControlNet(ggml_backend_t backend,
               bool offload_params_to_cpu,
-               const String2GGMLType& tensor_types = {},
-               SDVersion version                   = VERSION_SD1)
+               const String2TensorStorage& tensor_storage_map = {},
+               SDVersion version                              = VERSION_SD1)
        : GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
-        control_net.init(params_ctx, tensor_types, "");
-    }
-
-    void enable_conv2d_direct() {
-        std::vector<GGMLBlock*> blocks;
-        control_net.get_all_blocks(blocks);
-        for (auto block : blocks) {
-            if (block->get_desc() == "Conv2d") {
-                auto conv_block = (Conv2d*)block;
-                conv_block->enable_direct();
-            }
-        }
+        control_net.init(params_ctx, tensor_storage_map, "");
    }

    ~ControlNet() override {
@ -404,8 +392,9 @@ struct ControlNet : public GGMLRunner {
        y         = to_backend(y);
        timesteps = to_backend(timesteps);

-        auto outs = control_net.forward(compute_ctx,
-                                        runtime_backend,
+        auto runner_ctx = get_context();
+
+        auto outs = control_net.forward(&runner_ctx,
                                        x,
                                        hint,
                                        guided_hint_cached ? guided_hint : nullptr,
@ -453,7 +442,7 @@ struct ControlNet : public GGMLRunner {
        std::set<std::string> ignore_tensors;

        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }
--- a/otherarch/sdcpp/diffusion_model.hpp
+++ b/otherarch/sdcpp/diffusion_model.hpp
@ -36,6 +36,7 @@ struct DiffusionModel {
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
    virtual size_t get_params_buffer_size()                                             = 0;
    virtual int64_t get_adm_in_channels()                                               = 0;
+    virtual void set_flash_attn_enabled(bool enabled)                                   = 0;
 };

 struct UNetModel : public DiffusionModel {
@ -43,10 +44,9 @@ struct UNetModel : public DiffusionModel {

    UNetModel(ggml_backend_t backend,
              bool offload_params_to_cpu,
-              const String2GGMLType& tensor_types = {},
-              SDVersion version                   = VERSION_SD1,
-              bool flash_attn                     = false)
-        : unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
+              const String2TensorStorage& tensor_storage_map = {},
+              SDVersion version                              = VERSION_SD1)
+        : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
    }

    std::string get_desc() override {
@ -77,6 +77,10 @@ struct UNetModel : public DiffusionModel {
        return unet.unet.adm_in_channels;
    }

+    void set_flash_attn_enabled(bool enabled) override {  // implements the DiffusionModel pure virtual; forwards the flag to the `unet` member
+        unet.set_flash_attention_enabled(enabled);
+    }
+
    void compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
@ -98,9 +102,8 @@ struct MMDiTModel : public DiffusionModel {

    MMDiTModel(ggml_backend_t backend,
               bool offload_params_to_cpu,
-               bool flash_attn                     = false,
-               const String2GGMLType& tensor_types = {})
-        : mmdit(backend, offload_params_to_cpu, flash_attn, tensor_types, "model.diffusion_model") {
+               const String2TensorStorage& tensor_storage_map = {})
+        : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
    }

    std::string get_desc() override {
@ -131,6 +134,10 @@ struct MMDiTModel : public DiffusionModel {
        return 768 + 1280;
    }

+    void set_flash_attn_enabled(bool enabled) override {  // implements the DiffusionModel pure virtual; forwards the flag to the `mmdit` member
+        mmdit.set_flash_attention_enabled(enabled);
+    }
+
    void compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
@ -151,11 +158,10 @@ struct FluxModel : public DiffusionModel {

    FluxModel(ggml_backend_t backend,
              bool offload_params_to_cpu,
-              const String2GGMLType& tensor_types = {},
-              SDVersion version                   = VERSION_FLUX,
-              bool flash_attn                     = false,
-              bool use_mask                       = false)
-        : flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
+              const String2TensorStorage& tensor_storage_map = {},
+              SDVersion version                              = VERSION_FLUX,
+              bool use_mask                                  = false)
+        : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
    }

    std::string get_desc() override {
@ -186,6 +192,10 @@ struct FluxModel : public DiffusionModel {
        return 768;
    }

+    void set_flash_attn_enabled(bool enabled) override {  // implements the DiffusionModel pure virtual; forwards the flag to the `flux` member
+        flux.set_flash_attention_enabled(enabled);
+    }
+
    void compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
@ -211,11 +221,10 @@ struct WanModel : public DiffusionModel {

    WanModel(ggml_backend_t backend,
             bool offload_params_to_cpu,
-             const String2GGMLType& tensor_types = {},
-             const std::string prefix            = "model.diffusion_model",
-             SDVersion version                   = VERSION_WAN2,
-             bool flash_attn                     = false)
-        : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) {
+             const String2TensorStorage& tensor_storage_map = {},
+             const std::string prefix                       = "model.diffusion_model",
+             SDVersion version                              = VERSION_WAN2)
+        : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
    }

    std::string get_desc() override {
@ -246,6 +255,10 @@ struct WanModel : public DiffusionModel {
        return 768;
    }

+    void set_flash_attn_enabled(bool enabled) override {  // implements the DiffusionModel pure virtual; forwards the flag to the `wan` member
+        wan.set_flash_attention_enabled(enabled);
+    }
+
    void compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
@ -270,11 +283,10 @@ struct QwenImageModel : public DiffusionModel {

    QwenImageModel(ggml_backend_t backend,
                   bool offload_params_to_cpu,
-                   const String2GGMLType& tensor_types = {},
-                   const std::string prefix            = "model.diffusion_model",
-                   SDVersion version                   = VERSION_QWEN_IMAGE,
-                   bool flash_attn                     = false)
-        : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) {
+                   const String2TensorStorage& tensor_storage_map = {},
+                   const std::string prefix                       = "model.diffusion_model",
+                   SDVersion version                              = VERSION_QWEN_IMAGE)
+        : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
    }

    std::string get_desc() override {
@ -305,6 +317,10 @@ struct QwenImageModel : public DiffusionModel {
        return 768;
    }

+    void set_flash_attn_enabled(bool enabled) override {  // implements the DiffusionModel pure virtual; forwards the flag to the `qwen_image` member
+        qwen_image.set_flash_attention_enabled(enabled);
+    }
+
    void compute(int n_threads,
                 DiffusionParams diffusion_params,
                 struct ggml_tensor** output     = nullptr,
--- a/otherarch/sdcpp/esrgan.hpp
+++ b/otherarch/sdcpp/esrgan.hpp
@ -27,11 +27,11 @@ public:
        blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
    }

-    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
-        return ggml_leaky_relu(ctx, x, 0.2f, true);
+    struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+        return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);  // leaky ReLU on x via the runner's ggml context; 0.2f = negative slope, true = in-place flag (per ggml.h — confirm against vendored ggml)
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [n, num_feat, h, w]
        // return: [n, num_feat, h, w]

@ -42,16 +42,16 @@ public:
        auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);

        auto x1    = lrelu(ctx, conv1->forward(ctx, x));
-        auto x_cat = ggml_concat(ctx, x, x1, 2);
+        auto x_cat = ggml_concat(ctx->ggml_ctx, x, x1, 2);
        auto x2    = lrelu(ctx, conv2->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x2, 2);
+        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x2, 2);
        auto x3    = lrelu(ctx, conv3->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x3, 2);
+        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x3, 2);
        auto x4    = lrelu(ctx, conv4->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x4, 2);
+        x_cat      = ggml_concat(ctx->ggml_ctx, x_cat, x4, 2);
        auto x5    = conv5->forward(ctx, x_cat);

-        x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
+        x5 = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, x5, 0.2f), x);
        return x5;
    }
 };
@ -64,7 +64,7 @@ public:
        blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [n, num_feat, h, w]
        // return: [n, num_feat, h, w]

@ -76,7 +76,7 @@ public:
        out      = rdb2->forward(ctx, out);
        out      = rdb3->forward(ctx, out);

-        out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
+        out = ggml_add(ctx->ggml_ctx, ggml_scale(ctx->ggml_ctx, out, 0.2f), x);
        return out;
    }
 };
@ -112,11 +112,11 @@ public:
    int get_scale() { return scale; }
    int get_num_block() { return num_block; }

-    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
-        return ggml_leaky_relu(ctx, x, 0.2f, true);
+    struct ggml_tensor* lrelu(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+        return ggml_leaky_relu(ctx->ggml_ctx, x, 0.2f, true);  // leaky ReLU on x via the runner's ggml context; 0.2f = negative slope, true = in-place flag (per ggml.h — confirm against vendored ggml)
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [n, num_in_ch, h, w]
        // return: [n, num_out_ch, h*scale, w*scale]
        auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
@ -133,14 +133,14 @@ public:
            body_feat = block->forward(ctx, body_feat);
        }
        body_feat = conv_body->forward(ctx, body_feat);
-        feat      = ggml_add(ctx, feat, body_feat);
+        feat      = ggml_add(ctx->ggml_ctx, feat, body_feat);
        // upsample
        if (scale >= 2) {
            auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
-            feat          = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+            feat          = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
            if (scale == 4) {
                auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
-                feat          = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+                feat          = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx->ggml_ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
            }
        }
        // for all scales
@ -156,24 +156,11 @@ struct ESRGAN : public GGMLRunner {

    ESRGAN(ggml_backend_t backend,
           bool offload_params_to_cpu,
-           const String2GGMLType& tensor_types = {})
+           const String2TensorStorage& tensor_storage_map = {})
        : GGMLRunner(backend, offload_params_to_cpu) {
        // rrdb_net will be created in load_from_file
    }

-    void enable_conv2d_direct() {
-        if (!rrdb_net)
-            return;
-        std::vector<GGMLBlock*> blocks;
-        rrdb_net->get_all_blocks(blocks);
-        for (auto block : blocks) {
-            if (block->get_desc() == "Conv2d") {
-                auto conv_block = (Conv2d*)block;
-                conv_block->enable_direct();
-            }
-        }
-    }
-
    std::string get_desc() override {
        return "esrgan";
    }
@ -182,7 +169,7 @@ struct ESRGAN : public GGMLRunner {
        LOG_INFO("loading esrgan from '%s'", file_path.c_str());

        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
            return false;
        }
@ -359,7 +346,9 @@ struct ESRGAN : public GGMLRunner {
        constexpr int kGraphNodes = 1 << 16;  // 65k
        struct ggml_cgraph* gf    = ggml_new_graph_custom(compute_ctx, kGraphNodes, /*grads*/ false);
        x                         = to_backend(x);
-        struct ggml_tensor* out   = rrdb_net->forward(compute_ctx, x);
+
+        auto runner_ctx         = get_context();
+        struct ggml_tensor* out = rrdb_net->forward(&runner_ctx, x);
        ggml_build_forward_expand(gf, out);
        return gf;
    }
--- a/otherarch/sdcpp/flux.hpp
+++ b/otherarch/sdcpp/flux.hpp
@ -19,14 +19,14 @@ namespace Flux {
            blocks["out_layer"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_dim, hidden_dim, true));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
            // x: [..., in_dim]
            // return: [..., hidden_dim]
            auto in_layer  = std::dynamic_pointer_cast<Linear>(blocks["in_layer"]);
            auto out_layer = std::dynamic_pointer_cast<Linear>(blocks["out_layer"]);

            x = in_layer->forward(ctx, x);
-            x = ggml_silu_inplace(ctx, x);
+            x = ggml_silu_inplace(ctx->ggml_ctx, x);
            x = out_layer->forward(ctx, x);
            return x;
        }
@ -37,7 +37,7 @@ namespace Flux {
        int64_t hidden_size;
        float eps;

-        void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
+        void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
            ggml_type wtype = GGML_TYPE_F32;
            params["scale"] = ggml_new_tensor_1d(ctx, wtype, hidden_size);
        }
@ -48,10 +48,10 @@ namespace Flux {
            : hidden_size(hidden_size),
              eps(eps) {}

-        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
            struct ggml_tensor* w = params["scale"];
-            x                     = ggml_rms_norm(ctx, x, eps);
-            x                     = ggml_mul(ctx, x, w);
+            x                     = ggml_rms_norm(ctx->ggml_ctx, x, eps);
+            x                     = ggml_mul(ctx->ggml_ctx, x, w);
            return x;
        }
    };
@ -63,7 +63,7 @@ namespace Flux {
            blocks["key_norm"]   = std::shared_ptr<GGMLBlock>(new RMSNorm(dim));
        }

-        struct ggml_tensor* query_norm(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* query_norm(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
            // x: [..., dim]
            // return: [..., dim]
            auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["query_norm"]);
@ -72,7 +72,7 @@ namespace Flux {
            return x;
        }

-        struct ggml_tensor* key_norm(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* key_norm(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
            // x: [..., dim]
            // return: [..., dim]
            auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["key_norm"]);
@ -85,13 +85,11 @@ namespace Flux {
    struct SelfAttention : public GGMLBlock {
    public:
        int64_t num_heads;
-        bool flash_attn;

    public:
        SelfAttention(int64_t dim,
                      int64_t num_heads = 8,
-                      bool qkv_bias     = false,
-                      bool flash_attn   = false)
+                      bool qkv_bias     = false)
            : num_heads(num_heads) {
            int64_t head_dim = dim / num_heads;
            blocks["qkv"]    = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
@ -99,39 +97,38 @@ namespace Flux {
            blocks["proj"]   = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
        }

-        std::vector<struct ggml_tensor*> pre_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+        std::vector<struct ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
            auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
            auto norm     = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);

            auto qkv         = qkv_proj->forward(ctx, x);
-            auto qkv_vec     = split_qkv(ctx, qkv);
+            auto qkv_vec     = split_qkv(ctx->ggml_ctx, qkv);
            int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
-            auto q           = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);
-            auto k           = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);
-            auto v           = ggml_reshape_4d(ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);
+            auto q           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);
+            auto k           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);
+            auto v           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);
            q                = norm->query_norm(ctx, q);
            k                = norm->key_norm(ctx, k);
            return {q, k, v};
        }

-        struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
            auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);

            x = proj->forward(ctx, x);  // [N, n_token, dim]
            return x;
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* pe,
                                    struct ggml_tensor* mask) {
            // x: [N, n_token, dim]
            // pe: [n_token, d_head/2, 2, 2]
            // return [N, n_token, dim]
-            auto qkv = pre_attention(ctx, x);                                                        // q,k,v: [N, n_token, n_head, d_head]
-            x        = Rope::attention(ctx, backend, qkv[0], qkv[1], qkv[2], pe, mask, flash_attn);  // [N, n_token, dim]
-            x        = post_attention(ctx, x);                                                       // [N, n_token, dim]
+            auto qkv = pre_attention(ctx, x);                                   // q,k,v: [N, n_token, n_head, d_head]
+            x        = Rope::attention(ctx, qkv[0], qkv[1], qkv[2], pe, mask);  // [N, n_token, dim]
+            x        = post_attention(ctx, x);                                  // [N, n_token, dim]
            return x;
        }
    };
@ -144,11 +141,11 @@ namespace Flux {
        ModulationOut(ggml_tensor* shift = nullptr, ggml_tensor* scale = nullptr, ggml_tensor* gate = nullptr)
            : shift(shift), scale(scale), gate(gate) {}

-        ModulationOut(struct ggml_context* ctx, ggml_tensor* vec, int64_t offset) {
+        ModulationOut(GGMLRunnerContext* ctx, ggml_tensor* vec, int64_t offset) {
            int64_t stride = vec->nb[1] * vec->ne[1];
-            shift          = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0));  // [N, dim]
-            scale          = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1));  // [N, dim]
-            gate           = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 2));  // [N, dim]
+            shift          = ggml_view_2d(ctx->ggml_ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0));  // [N, dim]
+            scale          = ggml_view_2d(ctx->ggml_ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1));  // [N, dim]
+            gate           = ggml_view_2d(ctx->ggml_ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 2));  // [N, dim]
        }
    };

@ -164,16 +161,16 @@ namespace Flux {
            blocks["lin"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * multiplier));
        }

-        std::vector<ModulationOut> forward(struct ggml_context* ctx, struct ggml_tensor* vec) {
+        std::vector<ModulationOut> forward(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
            // x: [N, dim]
            // return: [ModulationOut, ModulationOut]
            auto lin = std::dynamic_pointer_cast<Linear>(blocks["lin"]);

-            auto out = ggml_silu(ctx, vec);
+            auto out = ggml_silu(ctx->ggml_ctx, vec);
            out      = lin->forward(ctx, out);  // [N, multiplier*dim]

-            auto m = ggml_reshape_3d(ctx, out, vec->ne[0], multiplier, vec->ne[1]);  // [N, multiplier, dim]
-            m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));               // [multiplier, N, dim]
+            auto m = ggml_reshape_3d(ctx->ggml_ctx, out, vec->ne[0], multiplier, vec->ne[1]);  // [N, multiplier, dim]
+            m      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3));     // [multiplier, N, dim]

            ModulationOut m_0 = ModulationOut(ctx, m, 0);
            if (is_double) {
@ -199,7 +196,6 @@ namespace Flux {
    }

    struct DoubleStreamBlock : public GGMLBlock {
-        bool flash_attn;
        bool prune_mod;
        int idx = 0;

@ -207,17 +203,16 @@ namespace Flux {
        DoubleStreamBlock(int64_t hidden_size,
                          int64_t num_heads,
                          float mlp_ratio,
-                          int idx         = 0,
-                          bool qkv_bias   = false,
-                          bool flash_attn = false,
-                          bool prune_mod  = false)
-            : idx(idx), flash_attn(flash_attn), prune_mod(prune_mod) {
+                          int idx        = 0,
+                          bool qkv_bias  = false,
+                          bool prune_mod = false)
+            : idx(idx), prune_mod(prune_mod) {
            int64_t mlp_hidden_dim = hidden_size * mlp_ratio;
            if (!prune_mod) {
                blocks["img_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
            }
            blocks["img_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, flash_attn));
+            blocks["img_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias));

            blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
            blocks["img_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
@ -228,7 +223,7 @@ namespace Flux {
                blocks["txt_mod"] = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
            }
            blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, flash_attn));
+            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias));

            blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
            blocks["txt_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
@ -236,7 +231,7 @@ namespace Flux {
            blocks["txt_mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(mlp_hidden_dim, hidden_size));
        }

-        std::vector<ModulationOut> get_distil_img_mod(struct ggml_context* ctx, struct ggml_tensor* vec) {
+        std::vector<ModulationOut> get_distil_img_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
            // TODO: not hardcoded?
            const int single_blocks_count = 38;
            const int double_blocks_count = 19;
@ -245,7 +240,7 @@ namespace Flux {
            return {ModulationOut(ctx, vec, offset), ModulationOut(ctx, vec, offset + 3)};
        }

-        std::vector<ModulationOut> get_distil_txt_mod(struct ggml_context* ctx, struct ggml_tensor* vec) {
+        std::vector<ModulationOut> get_distil_txt_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
            // TODO: not hardcoded?
            const int single_blocks_count = 38;
            const int double_blocks_count = 19;
@ -254,8 +249,7 @@ namespace Flux {
            return {ModulationOut(ctx, vec, offset), ModulationOut(ctx, vec, offset + 3)};
        }

-        std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
-                                                                    ggml_backend_t backend,
+        std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                                    struct ggml_tensor* img,
                                                                    struct ggml_tensor* txt,
                                                                    struct ggml_tensor* vec,
@ -300,7 +294,7 @@ namespace Flux {

            // prepare image for attention
            auto img_modulated = img_norm1->forward(ctx, img);
-            img_modulated      = Flux::modulate(ctx, img_modulated, img_mod1.shift, img_mod1.scale);
+            img_modulated      = Flux::modulate(ctx->ggml_ctx, img_modulated, img_mod1.shift, img_mod1.scale);
            auto img_qkv       = img_attn->pre_attention(ctx, img_modulated);  // q,k,v: [N, n_img_token, n_head, d_head]
            auto img_q         = img_qkv[0];
            auto img_k         = img_qkv[1];
@ -308,55 +302,55 @@ namespace Flux {

            // prepare txt for attention
            auto txt_modulated = txt_norm1->forward(ctx, txt);
-            txt_modulated      = Flux::modulate(ctx, txt_modulated, txt_mod1.shift, txt_mod1.scale);
+            txt_modulated      = Flux::modulate(ctx->ggml_ctx, txt_modulated, txt_mod1.shift, txt_mod1.scale);
            auto txt_qkv       = txt_attn->pre_attention(ctx, txt_modulated);  // q,k,v: [N, n_txt_token, n_head, d_head]
            auto txt_q         = txt_qkv[0];
            auto txt_k         = txt_qkv[1];
            auto txt_v         = txt_qkv[2];

            // run actual attention
-            auto q = ggml_concat(ctx, txt_q, img_q, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
-            auto k = ggml_concat(ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
-            auto v = ggml_concat(ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto q = ggml_concat(ctx->ggml_ctx, txt_q, img_q, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto k = ggml_concat(ctx->ggml_ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]

-            auto attn         = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn);  // [N, n_txt_token + n_img_token, n_head*d_head]
-            attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));           // [n_txt_token + n_img_token, N, hidden_size]
-            auto txt_attn_out = ggml_view_3d(ctx,
+            auto attn         = Rope::attention(ctx, q, k, v, pe, mask);                                  // [N, n_txt_token + n_img_token, n_head*d_head]
+            attn              = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
+            auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
                                             attn,
                                             attn->ne[0],
                                             attn->ne[1],
                                             txt->ne[1],
                                             attn->nb[1],
                                             attn->nb[2],
-                                             0);                                              // [n_txt_token, N, hidden_size]
-            txt_attn_out      = ggml_cont(ctx, ggml_permute(ctx, txt_attn_out, 0, 2, 1, 3));  // [N, n_txt_token, hidden_size]
-            auto img_attn_out = ggml_view_3d(ctx,
+                                             0);                                                                  // [n_txt_token, N, hidden_size]
+            txt_attn_out      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_attn_out, 0, 2, 1, 3));  // [N, n_txt_token, hidden_size]
+            auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
                                             attn,
                                             attn->ne[0],
                                             attn->ne[1],
                                             img->ne[1],
                                             attn->nb[1],
                                             attn->nb[2],
-                                             attn->nb[2] * txt->ne[1]);                       // [n_img_token, N, hidden_size]
-            img_attn_out      = ggml_cont(ctx, ggml_permute(ctx, img_attn_out, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]
+                                             attn->nb[2] * txt->ne[1]);                                           // [n_img_token, N, hidden_size]
+            img_attn_out      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, img_attn_out, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]

            // calculate the img blocks
-            img = ggml_add(ctx, img, ggml_mul(ctx, img_attn->post_attention(ctx, img_attn_out), img_mod1.gate));
+            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn->post_attention(ctx, img_attn_out), img_mod1.gate));

-            auto img_mlp_out = img_mlp_0->forward(ctx, Flux::modulate(ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale));
-            img_mlp_out      = ggml_gelu_inplace(ctx, img_mlp_out);
+            auto img_mlp_out = img_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, img_norm2->forward(ctx, img), img_mod2.shift, img_mod2.scale));
+            img_mlp_out      = ggml_gelu_inplace(ctx->ggml_ctx, img_mlp_out);
            img_mlp_out      = img_mlp_2->forward(ctx, img_mlp_out);

-            img = ggml_add(ctx, img, ggml_mul(ctx, img_mlp_out, img_mod2.gate));
+            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_mod2.gate));

            // calculate the txt blocks
-            txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn->post_attention(ctx, txt_attn_out), txt_mod1.gate));
+            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn->post_attention(ctx, txt_attn_out), txt_mod1.gate));

-            auto txt_mlp_out = txt_mlp_0->forward(ctx, Flux::modulate(ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale));
-            txt_mlp_out      = ggml_gelu_inplace(ctx, txt_mlp_out);
+            auto txt_mlp_out = txt_mlp_0->forward(ctx, Flux::modulate(ctx->ggml_ctx, txt_norm2->forward(ctx, txt), txt_mod2.shift, txt_mod2.scale));
+            txt_mlp_out      = ggml_gelu_inplace(ctx->ggml_ctx, txt_mlp_out);
            txt_mlp_out      = txt_mlp_2->forward(ctx, txt_mlp_out);

-            txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_mlp_out, txt_mod2.gate));
+            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_mod2.gate));

            return {img, txt};
        }
@ -367,7 +361,6 @@ namespace Flux {
        int64_t num_heads;
        int64_t hidden_size;
        int64_t mlp_hidden_dim;
-        bool flash_attn;
        bool prune_mod;
        int idx = 0;

@ -377,9 +370,8 @@ namespace Flux {
                          float mlp_ratio = 4.0f,
                          int idx         = 0,
                          float qk_scale  = 0.f,
-                          bool flash_attn = false,
                          bool prune_mod  = false)
-            : hidden_size(hidden_size), num_heads(num_heads), idx(idx), flash_attn(flash_attn), prune_mod(prune_mod) {
+            : hidden_size(hidden_size), num_heads(num_heads), idx(idx), prune_mod(prune_mod) {
            int64_t head_dim = hidden_size / num_heads;
            float scale      = qk_scale;
            if (scale <= 0.f) {
@ -397,13 +389,12 @@ namespace Flux {
            }
        }

-        ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) {
+        ModulationOut get_distil_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
            int64_t offset = 3 * idx;
            return ModulationOut(ctx, vec, offset);
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* vec,
                                    struct ggml_tensor* pe,
@ -424,42 +415,42 @@ namespace Flux {

                mod = modulation->forward(ctx, vec)[0];
            }
-            auto x_mod   = Flux::modulate(ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
-            auto qkv_mlp = linear1->forward(ctx, x_mod);                            // [N, n_token, hidden_size * 3 + mlp_hidden_dim]
-            qkv_mlp      = ggml_cont(ctx, ggml_permute(ctx, qkv_mlp, 2, 0, 1, 3));  // [hidden_size * 3 + mlp_hidden_dim, N, n_token]
+            auto x_mod   = Flux::modulate(ctx->ggml_ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
+            auto qkv_mlp = linear1->forward(ctx, x_mod);                                                // [N, n_token, hidden_size * 3 + mlp_hidden_dim]
+            qkv_mlp      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, qkv_mlp, 2, 0, 1, 3));  // [hidden_size * 3 + mlp_hidden_dim, N, n_token]

-            auto qkv = ggml_view_3d(ctx,
+            auto qkv = ggml_view_3d(ctx->ggml_ctx,
                                    qkv_mlp,
                                    qkv_mlp->ne[0],
                                    qkv_mlp->ne[1],
                                    hidden_size * 3,
                                    qkv_mlp->nb[1],
                                    qkv_mlp->nb[2],
-                                    0);                                     // [hidden_size * 3 , N, n_token]
-            qkv      = ggml_cont(ctx, ggml_permute(ctx, qkv, 1, 2, 0, 3));  // [N, n_token, hidden_size * 3]
-            auto mlp = ggml_view_3d(ctx,
+                                    0);                                                         // [hidden_size * 3 , N, n_token]
+            qkv      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, qkv, 1, 2, 0, 3));  // [N, n_token, hidden_size * 3]
+            auto mlp = ggml_view_3d(ctx->ggml_ctx,
                                    qkv_mlp,
                                    qkv_mlp->ne[0],
                                    qkv_mlp->ne[1],
                                    mlp_hidden_dim,
                                    qkv_mlp->nb[1],
                                    qkv_mlp->nb[2],
-                                    qkv_mlp->nb[2] * hidden_size * 3);      // [mlp_hidden_dim , N, n_token]
-            mlp      = ggml_cont(ctx, ggml_permute(ctx, mlp, 1, 2, 0, 3));  // [N, n_token, mlp_hidden_dim]
+                                    qkv_mlp->nb[2] * hidden_size * 3);                          // [mlp_hidden_dim , N, n_token]
+            mlp      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, mlp, 1, 2, 0, 3));  // [N, n_token, mlp_hidden_dim]

-            auto qkv_vec     = split_qkv(ctx, qkv);  // q,k,v: [N, n_token, hidden_size]
+            auto qkv_vec     = split_qkv(ctx->ggml_ctx, qkv);  // q,k,v: [N, n_token, hidden_size]
            int64_t head_dim = hidden_size / num_heads;
-            auto q           = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);  // [N, n_token, n_head, d_head]
-            auto k           = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
-            auto v           = ggml_reshape_4d(ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);  // [N, n_token, n_head, d_head]
+            auto q           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);  // [N, n_token, n_head, d_head]
+            auto k           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
+            auto v           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);  // [N, n_token, n_head, d_head]
            q                = norm->query_norm(ctx, q);
            k                = norm->key_norm(ctx, k);
-            auto attn        = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn);  // [N, n_token, hidden_size]
+            auto attn        = Rope::attention(ctx, q, k, v, pe, mask);  // [N, n_token, hidden_size]

-            auto attn_mlp = ggml_concat(ctx, attn, ggml_gelu_inplace(ctx, mlp), 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
-            auto output   = linear2->forward(ctx, attn_mlp);                         // [N, n_token, hidden_size]
+            auto attn_mlp = ggml_concat(ctx->ggml_ctx, attn, ggml_gelu_inplace(ctx->ggml_ctx, mlp), 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
+            auto output   = linear2->forward(ctx, attn_mlp);                                             // [N, n_token, hidden_size]

-            output = ggml_add(ctx, x, ggml_mul(ctx, output, mod.gate));
+            output = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, output, mod.gate));
            return output;
        }
    };
@ -480,16 +471,16 @@ namespace Flux {
            }
        }

-        ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) {
+        ModulationOut get_distil_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
            int64_t offset = vec->ne[2] - 2;
            int64_t stride = vec->nb[1] * vec->ne[1];
-            auto shift     = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0));  // [N, dim]
-            auto scale     = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1));  // [N, dim]
+            auto shift     = ggml_view_2d(ctx->ggml_ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0));  // [N, dim]
+            auto scale     = ggml_view_2d(ctx->ggml_ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1));  // [N, dim]
            // No gate
            return {shift, scale, nullptr};
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* c) {
            // x: [N, n_token, hidden_size]
@ -505,16 +496,16 @@ namespace Flux {
            } else {
                auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);

-                auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, 2 * hidden_size]
-                m      = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]);       // [N, 2, hidden_size]
-                m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [2, N, hidden_size]
+                auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c));         // [N, 2 * hidden_size]
+                m      = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], 2, c->ne[1]);              // [N, 2, hidden_size]
+                m      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3));  // [2, N, hidden_size]

                int64_t offset = m->nb[1] * m->ne[1];
-                shift          = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
-                scale          = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
+                shift          = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
+                scale          = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
            }

-            x = Flux::modulate(ctx, norm_final->forward(ctx, x), shift, scale);
+            x = Flux::modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
            x = linear->forward(ctx, x);

            return x;
@ -533,7 +524,7 @@ namespace Flux {
            blocks["out_proj"] = std::shared_ptr<GGMLBlock>(new Linear(inner_size, hidden_size, true));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
            auto in_proj  = std::dynamic_pointer_cast<Linear>(blocks["in_proj"]);
            auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out_proj"]);

@ -541,7 +532,7 @@ namespace Flux {
            for (int i = 0; i < n_layers; i++) {
                auto norm  = std::dynamic_pointer_cast<RMSNorm>(blocks["norms." + std::to_string(i)]);
                auto embed = std::dynamic_pointer_cast<MLPEmbedder>(blocks["layers." + std::to_string(i)]);
-                x          = ggml_add_inplace(ctx, x, embed->forward(ctx, norm->forward(ctx, x)));
+                x          = ggml_add_inplace(ctx->ggml_ctx, x, embed->forward(ctx, norm->forward(ctx, x)));
            }
            x = out_proj->forward(ctx, x);

@ -556,7 +547,7 @@ namespace Flux {
            blocks["embedder.0"] = std::make_shared<Linear>(in_channels + max_freqs * max_freqs, hidden_size_input);
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* dct) {
            // x: (B, P^2, C)
@ -564,8 +555,8 @@ namespace Flux {
            // return: (B, P^2, hidden_size_input)
            auto embedder = std::dynamic_pointer_cast<Linear>(blocks["embedder.0"]);

-            dct = ggml_repeat_4d(ctx, dct, dct->ne[0], dct->ne[1], x->ne[2], x->ne[3]);
-            x   = ggml_concat(ctx, x, dct, 0);
+            dct = ggml_repeat_4d(ctx->ggml_ctx, dct, dct->ne[0], dct->ne[1], x->ne[2], x->ne[3]);
+            x   = ggml_concat(ctx->ggml_ctx, x, dct, 0);
            x   = embedder->forward(ctx, x);

            return x;
@ -583,7 +574,7 @@ namespace Flux {
            blocks["norm"]            = std::make_shared<RMSNorm>(hidden_size_x);
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* s) {
            // x: (batch_size, n_token, hidden_size_x)
@ -596,31 +587,31 @@ namespace Flux {
            int64_t hidden_size_x = x->ne[0];

            auto mlp_params = param_generator->forward(ctx, s);
-            auto fc_params  = ggml_ext_chunk(ctx, mlp_params, 3, 0);
-            auto fc1_gate   = ggml_reshape_3d(ctx, fc_params[0], hidden_size_x * mlp_ratio, hidden_size_x, batch_size);
-            auto fc1_value  = ggml_reshape_3d(ctx, fc_params[1], hidden_size_x * mlp_ratio, hidden_size_x, batch_size);
-            auto fc2        = ggml_reshape_3d(ctx, fc_params[2], hidden_size_x, mlp_ratio * hidden_size_x, batch_size);
+            auto fc_params  = ggml_ext_chunk(ctx->ggml_ctx, mlp_params, 3, 0);
+            auto fc1_gate   = ggml_reshape_3d(ctx->ggml_ctx, fc_params[0], hidden_size_x * mlp_ratio, hidden_size_x, batch_size);
+            auto fc1_value  = ggml_reshape_3d(ctx->ggml_ctx, fc_params[1], hidden_size_x * mlp_ratio, hidden_size_x, batch_size);
+            auto fc2        = ggml_reshape_3d(ctx->ggml_ctx, fc_params[2], hidden_size_x, mlp_ratio * hidden_size_x, batch_size);

-            fc1_gate  = ggml_cont(ctx, ggml_ext_torch_permute(ctx, fc1_gate, 1, 0, 2, 3));  // [batch_size, hidden_size_x*mlp_ratio, hidden_size_x]
-            fc1_gate  = ggml_l2_norm(ctx, fc1_gate, 1e-12f);
-            fc1_value = ggml_cont(ctx, ggml_ext_torch_permute(ctx, fc1_value, 1, 0, 2, 3));  // [batch_size, hidden_size_x*mlp_ratio, hidden_size_x]
-            fc1_value = ggml_l2_norm(ctx, fc1_value, 1e-12f);
-            fc2       = ggml_cont(ctx, ggml_ext_torch_permute(ctx, fc2, 1, 0, 2, 3));  // [batch_size, hidden_size_x, hidden_size_x*mlp_ratio]
-            fc2       = ggml_l2_norm(ctx, fc2, 1e-12f);
+            fc1_gate  = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, fc1_gate, 1, 0, 2, 3));  // [batch_size, hidden_size_x*mlp_ratio, hidden_size_x]
+            fc1_gate  = ggml_l2_norm(ctx->ggml_ctx, fc1_gate, 1e-12f);
+            fc1_value = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, fc1_value, 1, 0, 2, 3));  // [batch_size, hidden_size_x*mlp_ratio, hidden_size_x]
+            fc1_value = ggml_l2_norm(ctx->ggml_ctx, fc1_value, 1e-12f);
+            fc2       = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, fc2, 1, 0, 2, 3));  // [batch_size, hidden_size_x, hidden_size_x*mlp_ratio]
+            fc2       = ggml_l2_norm(ctx->ggml_ctx, fc2, 1e-12f);

            auto res_x = x;
            x          = norm->forward(ctx, x);  // [batch_size, n_token, hidden_size_x]

-            auto x1 = ggml_mul_mat(ctx, fc1_gate, x);  // [batch_size, n_token, hidden_size_x*mlp_ratio]
-            x1      = ggml_silu_inplace(ctx, x1);
+            auto x1 = ggml_mul_mat(ctx->ggml_ctx, fc1_gate, x);  // [batch_size, n_token, hidden_size_x*mlp_ratio]
+            x1      = ggml_silu_inplace(ctx->ggml_ctx, x1);

-            auto x2 = ggml_mul_mat(ctx, fc1_value, x);  // [batch_size, n_token, hidden_size_x*mlp_ratio]
+            auto x2 = ggml_mul_mat(ctx->ggml_ctx, fc1_value, x);  // [batch_size, n_token, hidden_size_x*mlp_ratio]

-            x = ggml_mul_inplace(ctx, x1, x2);  // [batch_size, n_token, hidden_size_x*mlp_ratio]
+            x = ggml_mul_inplace(ctx->ggml_ctx, x1, x2);  // [batch_size, n_token, hidden_size_x*mlp_ratio]

-            x = ggml_mul_mat(ctx, fc2, x);  // [batch_size, n_token, hidden_size_x]
+            x = ggml_mul_mat(ctx->ggml_ctx, fc2, x);  // [batch_size, n_token, hidden_size_x]

-            x = ggml_add_inplace(ctx, x, res_x);
+            x = ggml_add_inplace(ctx->ggml_ctx, x, res_x);

            return x;
        }
@ -633,7 +624,7 @@ namespace Flux {
            blocks["linear"] = std::make_shared<Linear>(hidden_size, out_channels);
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x) {
            auto norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
            auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
@ -652,15 +643,15 @@ namespace Flux {
            blocks["conv"] = std::make_shared<Conv2d>(hidden_size, out_channels, std::pair{3, 3}, std::pair{1, 1}, std::pair{1, 1});
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x) {
            // x: [N, C, H, W]
            auto norm = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
            auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

-            x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3));  // [N, H, W, C]
+            x = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 2, 0, 1, 3));  // [N, H, W, C]
            x = norm->forward(ctx, x);
-            x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));  // [N, C, H, W]
+            x = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, C, H, W]
            x = conv->forward(ctx, x);

            return x;
@ -692,7 +683,6 @@ namespace Flux {
        int theta                   = 10000;
        bool qkv_bias               = true;
        bool guidance_embed         = true;
-        bool flash_attn             = true;
        int64_t in_dim              = 64;
        ChromaRadianceParams chroma_radiance_params;
    };
@ -731,7 +721,6 @@ namespace Flux {
                                                                                                   params.mlp_ratio,
                                                                                                   i,
                                                                                                   params.qkv_bias,
-                                                                                                   params.flash_attn,
                                                                                                   params.is_chroma);
            }

@ -741,7 +730,6 @@ namespace Flux {
                                                                                                   params.mlp_ratio,
                                                                                                   i,
                                                                                                   0.f,
-                                                                                                   params.flash_attn,
                                                                                                   params.is_chroma);
            }

@ -828,8 +816,7 @@ namespace Flux {
            return x;
        }

-        struct ggml_tensor* forward_orig(struct ggml_context* ctx,
-                                         ggml_backend_t backend,
+        struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
                                         struct ggml_tensor* img,
                                         struct ggml_tensor* txt,
                                         struct ggml_tensor* timesteps,
@ -851,41 +838,41 @@ namespace Flux {
            if (params.is_chroma) {
                int64_t mod_index_length = 344;
                auto approx              = std::dynamic_pointer_cast<ChromaApproximator>(blocks["distilled_guidance_layer"]);
-                auto distill_timestep    = ggml_ext_timestep_embedding(ctx, timesteps, 16, 10000, 1000.f);
-                auto distill_guidance    = ggml_ext_timestep_embedding(ctx, guidance, 16, 10000, 1000.f);
+                auto distill_timestep    = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 16, 10000, 1000.f);
+                auto distill_guidance    = ggml_ext_timestep_embedding(ctx->ggml_ctx, guidance, 16, 10000, 1000.f);

                // auto mod_index_arange  = ggml_arange(ctx, 0, (float)mod_index_length, 1);
                // ggml_arange is not working on a lot of backends, so precompute it on the CPU instead
                GGML_ASSERT(mod_index_arange != nullptr);
-                auto modulation_index = ggml_ext_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f);  // [1, 344, 32]
+                auto modulation_index = ggml_ext_timestep_embedding(ctx->ggml_ctx, mod_index_arange, 32, 10000, 1000.f);  // [1, 344, 32]

                // Batch broadcast (will it ever be useful)
-                modulation_index = ggml_repeat(ctx, modulation_index, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, modulation_index->ne[0], modulation_index->ne[1], img->ne[2]));  // [N, 344, 32]
+                modulation_index = ggml_repeat(ctx->ggml_ctx, modulation_index, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, modulation_index->ne[0], modulation_index->ne[1], img->ne[2]));  // [N, 344, 32]

-                auto timestep_guidance = ggml_concat(ctx, distill_timestep, distill_guidance, 0);  // [N, 1, 32]
-                timestep_guidance      = ggml_repeat(ctx, timestep_guidance, modulation_index);    // [N, 344, 32]
+                auto timestep_guidance = ggml_concat(ctx->ggml_ctx, distill_timestep, distill_guidance, 0);  // [N, 1, 32]
+                timestep_guidance      = ggml_repeat(ctx->ggml_ctx, timestep_guidance, modulation_index);    // [N, 344, 32]

-                vec = ggml_concat(ctx, timestep_guidance, modulation_index, 0);  // [N, 344, 64]
+                vec = ggml_concat(ctx->ggml_ctx, timestep_guidance, modulation_index, 0);  // [N, 344, 64]
                // Permute for consistency with non-distilled modulation implementation
-                vec = ggml_cont(ctx, ggml_permute(ctx, vec, 0, 2, 1, 3));  // [344, N, 64]
-                vec = approx->forward(ctx, vec);                           // [344, N, hidden_size]
+                vec = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, vec, 0, 2, 1, 3));  // [344, N, 64]
+                vec = approx->forward(ctx, vec);                                               // [344, N, hidden_size]

                if (y != nullptr) {
-                    txt_img_mask = ggml_pad(ctx, y, img->ne[1], 0, 0, 0);
+                    txt_img_mask = ggml_pad(ctx->ggml_ctx, y, img->ne[1], 0, 0, 0);
                }
            } else {
                auto time_in   = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
                auto vector_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["vector_in"]);
-                vec            = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f));
+                vec            = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f));
                if (params.guidance_embed) {
                    GGML_ASSERT(guidance != nullptr);
                    auto guidance_in = std::dynamic_pointer_cast<MLPEmbedder>(blocks["guidance_in"]);
                    // bf16 and fp16 result is different
-                    auto g_in = ggml_ext_timestep_embedding(ctx, guidance, 256, 10000, 1000.f);
-                    vec       = ggml_add(ctx, vec, guidance_in->forward(ctx, g_in));
+                    auto g_in = ggml_ext_timestep_embedding(ctx->ggml_ctx, guidance, 256, 10000, 1000.f);
+                    vec       = ggml_add(ctx->ggml_ctx, vec, guidance_in->forward(ctx, g_in));
                }

-                vec = ggml_add(ctx, vec, vector_in->forward(ctx, y));
+                vec = ggml_add(ctx->ggml_ctx, vec, vector_in->forward(ctx, y));
            }

            txt = txt_in->forward(ctx, txt);
@ -897,31 +884,31 @@ namespace Flux {

                auto block = std::dynamic_pointer_cast<DoubleStreamBlock>(blocks["double_blocks." + std::to_string(i)]);

-                auto img_txt = block->forward(ctx, backend, img, txt, vec, pe, txt_img_mask);
+                auto img_txt = block->forward(ctx, img, txt, vec, pe, txt_img_mask);
                img          = img_txt.first;   // [N, n_img_token, hidden_size]
                txt          = img_txt.second;  // [N, n_txt_token, hidden_size]
            }

-            auto txt_img = ggml_concat(ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
+            auto txt_img = ggml_concat(ctx->ggml_ctx, txt, img, 1);  // [N, n_txt_token + n_img_token, hidden_size]
            for (int i = 0; i < params.depth_single_blocks; i++) {
                if (skip_layers.size() > 0 && std::find(skip_layers.begin(), skip_layers.end(), i + params.depth) != skip_layers.end()) {
                    continue;
                }
                auto block = std::dynamic_pointer_cast<SingleStreamBlock>(blocks["single_blocks." + std::to_string(i)]);

-                txt_img = block->forward(ctx, backend, txt_img, vec, pe, txt_img_mask);
+                txt_img = block->forward(ctx, txt_img, vec, pe, txt_img_mask);
            }

-            txt_img = ggml_cont(ctx, ggml_permute(ctx, txt_img, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
-            img     = ggml_view_3d(ctx,
+            txt_img = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_img, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
+            img     = ggml_view_3d(ctx->ggml_ctx,
                                   txt_img,
                                   txt_img->ne[0],
                                   txt_img->ne[1],
                                   img->ne[1],
                                   txt_img->nb[1],
                                   txt_img->nb[2],
-                                   txt_img->nb[2] * txt->ne[1]);           // [n_img_token, N, hidden_size]
-            img     = ggml_cont(ctx, ggml_permute(ctx, img, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]
+                                   txt_img->nb[2] * txt->ne[1]);                               // [n_img_token, N, hidden_size]
+            img     = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, img, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]

            if (final_layer) {
                img = final_layer->forward(ctx, img, vec);  // (N, T, patch_size ** 2 * out_channels)
@ -930,8 +917,7 @@ namespace Flux {
            return img;
        }

-        struct ggml_tensor* forward_chroma_radiance(struct ggml_context* ctx,
-                                                    ggml_backend_t backend,
+        struct ggml_tensor* forward_chroma_radiance(GGMLRunnerContext* ctx,
                                                    struct ggml_tensor* x,
                                                    struct ggml_tensor* timestep,
                                                    struct ggml_tensor* context,
@ -952,32 +938,32 @@ namespace Flux {
            int pad_h          = (patch_size - H % patch_size) % patch_size;
            int pad_w          = (patch_size - W % patch_size) % patch_size;

-            auto img      = pad_to_patch_size(ctx, x);
+            auto img      = pad_to_patch_size(ctx->ggml_ctx, x);
            auto orig_img = img;

            auto img_in_patch = std::dynamic_pointer_cast<Conv2d>(blocks["img_in_patch"]);

-            img = img_in_patch->forward(ctx, img);                                             // [N, hidden_size, H/patch_size, W/patch_size]
-            img = ggml_reshape_3d(ctx, img, img->ne[0] * img->ne[1], img->ne[2], img->ne[3]);  // [N, hidden_size, H/patch_size*W/patch_size]
-            img = ggml_cont(ctx, ggml_ext_torch_permute(ctx, img, 1, 0, 2, 3));                // [N, H/patch_size*W/patch_size, hidden_size]
+            img = img_in_patch->forward(ctx, img);                                                       // [N, hidden_size, H/patch_size, W/patch_size]
+            img = ggml_reshape_3d(ctx->ggml_ctx, img, img->ne[0] * img->ne[1], img->ne[2], img->ne[3]);  // [N, hidden_size, H/patch_size*W/patch_size]
+            img = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img, 1, 0, 2, 3));      // [N, H/patch_size*W/patch_size, hidden_size]

-            auto out = forward_orig(ctx, backend, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers);  // [N, n_img_token, hidden_size]
+            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers);  // [N, n_img_token, hidden_size]

            // nerf decode
            auto nerf_image_embedder   = std::dynamic_pointer_cast<NerfEmbedder>(blocks["nerf_image_embedder"]);
            auto nerf_final_layer_conv = std::dynamic_pointer_cast<NerfFinalLayerConv>(blocks["nerf_final_layer_conv"]);

-            auto nerf_pixels    = patchify(ctx, orig_img);  // [N, num_patches, C * patch_size * patch_size]
+            auto nerf_pixels    = patchify(ctx->ggml_ctx, orig_img);  // [N, num_patches, C * patch_size * patch_size]
            int64_t num_patches = nerf_pixels->ne[1];
-            nerf_pixels         = ggml_reshape_3d(ctx,
+            nerf_pixels         = ggml_reshape_3d(ctx->ggml_ctx,
                                                  nerf_pixels,
                                                  nerf_pixels->ne[0] / C,
                                                  C,
-                                                  nerf_pixels->ne[1] * nerf_pixels->ne[2]);              // [N*num_patches, C, patch_size*patch_size]
-            nerf_pixels         = ggml_cont(ctx, ggml_ext_torch_permute(ctx, nerf_pixels, 1, 0, 2, 3));  // [N*num_patches, patch_size*patch_size, C]
+                                                  nerf_pixels->ne[1] * nerf_pixels->ne[2]);                                  // [N*num_patches, C, patch_size*patch_size]
+            nerf_pixels         = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, nerf_pixels, 1, 0, 2, 3));  // [N*num_patches, patch_size*patch_size, C]

-            auto nerf_hidden = ggml_reshape_2d(ctx, out, out->ne[0], out->ne[1] * out->ne[2]);  // [N*num_patches, hidden_size]
-            auto img_dct     = nerf_image_embedder->forward(ctx, nerf_pixels, dct);             // [N*num_patches, patch_size*patch_size, nerf_hidden_size]
+            auto nerf_hidden = ggml_reshape_2d(ctx->ggml_ctx, out, out->ne[0], out->ne[1] * out->ne[2]);  // [N*num_patches, hidden_size]
+            auto img_dct     = nerf_image_embedder->forward(ctx, nerf_pixels, dct);                       // [N*num_patches, patch_size*patch_size, nerf_hidden_size]

            for (int i = 0; i < params.chroma_radiance_params.nerf_depth; i++) {
                auto block = std::dynamic_pointer_cast<NerfGLUBlock>(blocks["nerf_blocks." + std::to_string(i)]);
@ -985,17 +971,16 @@ namespace Flux {
                img_dct = block->forward(ctx, img_dct, nerf_hidden);
            }

-            img_dct = ggml_cont(ctx, ggml_ext_torch_permute(ctx, img_dct, 1, 0, 2, 3));                                           // [N*num_patches, nerf_hidden_size, patch_size*patch_size]
-            img_dct = ggml_reshape_3d(ctx, img_dct, img_dct->ne[0] * img_dct->ne[1], num_patches, img_dct->ne[2] / num_patches);  // [N, num_patches, nerf_hidden_size*patch_size*patch_size]
-            img_dct = unpatchify(ctx, img_dct, (H + pad_h) / patch_size, (W + pad_w) / patch_size);                               // [N, nerf_hidden_size, H, W]
+            img_dct = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img_dct, 1, 0, 2, 3));                                 // [N*num_patches, nerf_hidden_size, patch_size*patch_size]
+            img_dct = ggml_reshape_3d(ctx->ggml_ctx, img_dct, img_dct->ne[0] * img_dct->ne[1], num_patches, img_dct->ne[2] / num_patches);  // [N, num_patches, nerf_hidden_size*patch_size*patch_size]
+            img_dct = unpatchify(ctx->ggml_ctx, img_dct, (H + pad_h) / patch_size, (W + pad_w) / patch_size);                               // [N, nerf_hidden_size, H, W]

            out = nerf_final_layer_conv->forward(ctx, img_dct);  // [N, C, H, W]

            return out;
        }

-        struct ggml_tensor* forward_flux_chroma(struct ggml_context* ctx,
-                                                ggml_backend_t backend,
+        struct ggml_tensor* forward_flux_chroma(GGMLRunnerContext* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* timestep,
                                                struct ggml_tensor* context,
@ -1016,58 +1001,57 @@ namespace Flux {
            int pad_h          = (patch_size - H % patch_size) % patch_size;
            int pad_w          = (patch_size - W % patch_size) % patch_size;

-            auto img            = process_img(ctx, x);
+            auto img            = process_img(ctx->ggml_ctx, x);
            uint64_t img_tokens = img->ne[1];

            if (params.version == VERSION_FLUX_FILL) {
                GGML_ASSERT(c_concat != nullptr);
-                ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
-                ggml_tensor* mask   = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
+                ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
+                ggml_tensor* mask   = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);

-                masked = process_img(ctx, masked);
-                mask   = process_img(ctx, mask);
+                masked = process_img(ctx->ggml_ctx, masked);
+                mask   = process_img(ctx->ggml_ctx, mask);

-                img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
+                img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0);
            } else if (params.version == VERSION_FLEX_2) {
                GGML_ASSERT(c_concat != nullptr);
-                ggml_tensor* masked  = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
-                ggml_tensor* mask    = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
-                ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1));
+                ggml_tensor* masked  = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
+                ggml_tensor* mask    = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
+                ggml_tensor* control = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1));

-                masked  = process_img(ctx, masked);
-                mask    = process_img(ctx, mask);
-                control = process_img(ctx, control);
+                masked  = process_img(ctx->ggml_ctx, masked);
+                mask    = process_img(ctx->ggml_ctx, mask);
+                control = process_img(ctx->ggml_ctx, control);

-                img = ggml_concat(ctx, img, ggml_concat(ctx, ggml_concat(ctx, masked, mask, 0), control, 0), 0);
+                img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0);
            } else if (params.version == VERSION_FLUX_CONTROLS) {
                GGML_ASSERT(c_concat != nullptr);

-                auto control = process_img(ctx, c_concat);
-                img          = ggml_concat(ctx, img, control, 0);
+                auto control = process_img(ctx->ggml_ctx, c_concat);
+                img          = ggml_concat(ctx->ggml_ctx, img, control, 0);
            }

            if (ref_latents.size() > 0) {
                for (ggml_tensor* ref : ref_latents) {
-                    ref = process_img(ctx, ref);
-                    img = ggml_concat(ctx, img, ref, 1);
+                    ref = process_img(ctx->ggml_ctx, ref);
+                    img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                }
            }

-            auto out = forward_orig(ctx, backend, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers);  // [N, num_tokens, C * patch_size * patch_size]
+            auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, mod_index_arange, skip_layers);  // [N, num_tokens, C * patch_size * patch_size]

            if (out->ne[1] > img_tokens) {
-                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));  // [num_tokens, N, C * patch_size * patch_size]
-                out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
-                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));  // [N, h*w, C * patch_size * patch_size]
+                out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));  // [num_tokens, N, C * patch_size * patch_size]
+                out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
+                out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));  // [N, h*w, C * patch_size * patch_size]
            }

            // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
-            out = unpatchify(ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size);  // [N, C, H + pad_h, W + pad_w]
+            out = unpatchify(ctx->ggml_ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size);  // [N, C, H + pad_h, W + pad_w]
            return out;
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* timestep,
                                    struct ggml_tensor* context,
@ -1091,7 +1075,6 @@ namespace Flux {

            if (params.version == VERSION_CHROMA_RADIANCE) {
                return forward_chroma_radiance(ctx,
-                                               backend,
                                               x,
                                               timestep,
                                               context,
@ -1105,7 +1088,6 @@ namespace Flux {
                                               skip_layers);
            } else {
                return forward_flux_chroma(ctx,
-                                           backend,
                                           x,
                                           timestep,
                                           context,
@ -1133,14 +1115,12 @@ namespace Flux {

        FluxRunner(ggml_backend_t backend,
                   bool offload_params_to_cpu,
-                   const String2GGMLType& tensor_types = {},
-                   const std::string prefix            = "",
-                   SDVersion version                   = VERSION_FLUX,
-                   bool flash_attn                     = false,
-                   bool use_mask                       = false)
+                   const String2TensorStorage& tensor_storage_map = {},
+                   const std::string prefix                       = "",
+                   SDVersion version                              = VERSION_FLUX,
+                   bool use_mask                                  = false)
            : GGMLRunner(backend, offload_params_to_cpu), version(version), use_mask(use_mask) {
            flux_params.version             = version;
-            flux_params.flash_attn          = flash_attn;
            flux_params.guidance_embed      = false;
            flux_params.depth               = 0;
            flux_params.depth_single_blocks = 0;
@ -1154,7 +1134,7 @@ namespace Flux {
                flux_params.in_channels = 3;
                flux_params.patch_size  = 16;
            }
-            for (auto pair : tensor_types) {
+            for (auto pair : tensor_storage_map) {
                std::string tensor_name = pair.first;
                if (!starts_with(tensor_name, prefix))
                    continue;
@ -1192,7 +1172,7 @@ namespace Flux {
            }

            flux = Flux(flux_params);
-            flux.init(params_ctx, tensor_types, prefix);
+            flux.init(params_ctx, tensor_storage_map, prefix);
        }

        std::string get_desc() override {
@ -1323,8 +1303,9 @@ namespace Flux {
                set_backend_tensor_data(dct, dct_vec.data());
            }

-            struct ggml_tensor* out = flux.forward(compute_ctx,
-                                                   runtime_backend,
+            auto runner_ctx = get_context();
+
+            struct ggml_tensor* out = flux.forward(&runner_ctx,
                                                   x,
                                                   timesteps,
                                                   context,
@ -1417,26 +1398,24 @@ namespace Flux {
            ggml_type model_data_type = GGML_TYPE_Q8_0;

            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
+            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }

-            auto tensor_types = model_loader.tensor_storages_types;
-            for (auto& item : tensor_types) {
-                // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
-                if (ends_with(item.first, "weight")) {
-                    // item.second = model_data_type;
+            auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+            for (auto& [name, tensor_storage] : tensor_storage_map) {
+                if (ends_with(name, "weight")) {
+                    tensor_storage.expected_type = model_data_type;
                }
            }

            std::shared_ptr<FluxRunner> flux = std::make_shared<FluxRunner>(backend,
                                                                            false,
-                                                                            tensor_types,
+                                                                            tensor_storage_map,
                                                                            "model.diffusion_model",
                                                                            VERSION_CHROMA_RADIANCE,
-                                                                            false,
-                                                                            true);
+                                                                            false);

            flux->alloc_params_buffer();
            std::map<std::string, ggml_tensor*> tensors;
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@ -881,7 +881,7 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
    ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
    int num_tiles            = num_tiles_x * num_tiles_y;
-    LOG_INFO("processing %i tiles", num_tiles);
+    LOG_DEBUG("processing %i tiles", num_tiles);
    pretty_progress(0, num_tiles, 0.0f);
    int tile_count = 1;
    bool last_y = false, last_x = false;
@ -1163,8 +1163,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context
                                                             struct ggml_tensor* mask = nullptr,
                                                             bool diag_mask_inf       = false,
                                                             bool skip_reshape        = false,
-                                                             bool flash_attn          = false,  // avoid overflow
-                                                             float kv_scale           = 1.0f) {
+                                                             bool flash_attn          = false,
+                                                             float kv_scale           = 1.0f) {  // avoid overflow
    int64_t L_q;
    int64_t L_k;
    int64_t C;
@ -1466,7 +1466,12 @@ __STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) {
 #define MAX_PARAMS_TENSOR_NUM 32768
 #define MAX_GRAPH_SIZE 327680

-typedef std::map<std::string, enum ggml_type> String2GGMLType;
+struct GGMLRunnerContext {  // bundle of runner state handed to block forward() calls in place of a bare ggml_context*
+    ggml_backend_t backend     = nullptr;  // filled from the runner's runtime_backend by GGMLRunner::get_context()
+    ggml_context* ggml_ctx     = nullptr;  // filled from the runner's compute_ctx; blocks pass it to the ggml_* graph-building calls
+    bool flash_attn_enabled    = false;    // copy of the runner flag set through set_flash_attention_enabled()
+    bool conv2d_direct_enabled = false;    // copy of the runner flag; Conv2d::forward hands it to ggml_ext_conv_2d
+};

 struct GGMLRunner {
 protected:
@ -1494,6 +1499,9 @@ protected:
    std::map<std::string, struct ggml_tensor*> cache_tensor_map;  // name -> tensor
    const std::string final_result_name = "ggml_runner_final_result_tensor";

+    bool flash_attn_enabled    = false;
+    bool conv2d_direct_enabled = false;
+
    void alloc_params_ctx() {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
@ -1566,8 +1574,10 @@ protected:
    struct ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) {
        prepare_build_in_tensor_before();
        struct ggml_cgraph* gf = get_graph();
-        auto result            = ggml_graph_node(gf, -1);
-        ggml_set_name(result, final_result_name.c_str());
+        if (ggml_graph_n_nodes(gf) > 0) {  // guard: a graph built with no nodes has no last node to fetch or name
+            auto result = ggml_graph_node(gf, -1);  // index -1 addresses the last node of the graph
+            ggml_set_name(result, final_result_name.c_str());  // label that node with the runner's fixed final-result name
+        }
        prepare_build_in_tensor_after(gf);
        return gf;
    }
@ -1750,6 +1760,15 @@ public:
        free_cache_ctx_and_buffer();
    }

+    virtual GGMLRunnerContext get_context() {  // build a by-value snapshot of the runner state for graph construction
+        GGMLRunnerContext runner_ctx;
+        runner_ctx.ggml_ctx              = compute_ctx;            // the runner's current compute context
+        runner_ctx.backend               = runtime_backend;        // the runner's runtime backend
+        runner_ctx.flash_attn_enabled    = flash_attn_enabled;     // flags are copied at call time; later setter calls do not affect this copy
+        runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled;
+        return runner_ctx;  // returned by value; FluxRunner keeps it in a local and passes its address to forward()
+    }
+
    void reset_compute_ctx() {
        free_compute_ctx();
        alloc_compute_ctx();
@ -1870,6 +1889,14 @@ public:
            free_compute_buffer();
        }
    }
+
+    void set_flash_attention_enabled(bool enabled) {  // store the flag on the runner; get_context() copies it into each new GGMLRunnerContext
+        flash_attn_enabled = enabled;
+    }
+
+    void set_conv2d_direct_enabled(bool enabled) {  // store the flag on the runner; get_context() copies it into each new GGMLRunnerContext
+        conv2d_direct_enabled = enabled;
+    }
 };

 class GGMLBlock {
@ -1879,30 +1906,36 @@ protected:
    GGMLBlockMap blocks;
    ParameterMap params;

-    ggml_type get_type(const std::string& name, const String2GGMLType& tensor_types, ggml_type default_type) {
-        auto iter = tensor_types.find(name);
-        if (iter != tensor_types.end()) {
-            return iter->second;
+    ggml_type get_type(const std::string& name, const String2TensorStorage& tensor_storage_map, ggml_type default_type) {  // pick the ggml type for the tensor called `name`
+        ggml_type wtype = default_type;  // result when `name` has no entry in the map
+        auto iter       = tensor_storage_map.find(name);
+        if (iter != tensor_storage_map.end()) {
+            const TensorStorage& tensor_storage = iter->second;
+            if (tensor_storage.expected_type != GGML_TYPE_COUNT) {  // expected_type wins whenever it is not GGML_TYPE_COUNT
+                wtype = tensor_storage.expected_type;
+            } else {
+                wtype = tensor_storage.type;  // otherwise use the entry's own type field
+            }
        }
-        return default_type;
+        return wtype;
    }

-    void init_blocks(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+    void init_blocks(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {  // call init() on every child block
        for (auto& pair : blocks) {
            auto& block = pair.second;
-            block->init(ctx, tensor_types, prefix + pair.first);
+            block->init(ctx, tensor_storage_map, prefix + pair.first);  // child prefix = this block's prefix + the child's key in `blocks`
        }
    }

-    virtual void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {}
+    virtual void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {}

 public:
-    void init(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
+    void init(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") {  // public entry point: set up this block's params and its child blocks
        if (prefix.size() > 0) {
            prefix = prefix + ".";
        }
-        init_blocks(ctx, tensor_types, prefix);
-        init_params(ctx, tensor_types, prefix);
+        init_params(ctx, tensor_storage_map, prefix);  // this block's own parameters are created first
+        init_blocks(ctx, tensor_storage_map, prefix);  // then the child blocks, each under prefix + its key
    }

    size_t get_params_num() {
@ -1961,12 +1994,12 @@ public:

 class UnaryBlock : public GGMLBlock {
 public:
-    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) = 0;
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) = 0;
 };

 class Identity : public UnaryBlock {
 public:
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        return x;
    }
 };
@ -1980,8 +2013,8 @@ protected:
    bool force_prec_f32;
    float scale;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
-        enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
            wtype = GGML_TYPE_F32;
        }
@ -2006,13 +2039,13 @@ public:
          force_prec_f32(force_prec_f32),
          scale(scale) {}

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = nullptr;
        if (bias) {
            b = params["bias"];
        }
-        return ggml_ext_linear(ctx, x, w, b, force_prec_f32, scale);
+        return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale);
    }
 };

@ -2028,8 +2061,8 @@ class Embedding : public UnaryBlock {
 protected:
    int64_t embedding_dim;
    int64_t num_embeddings;
-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
-        enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
+        enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
        if (!support_get_rows(wtype)) {
            wtype = GGML_TYPE_F32;
        }
@ -2042,7 +2075,7 @@ public:
          num_embeddings(num_embeddings) {
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids) {
        // input_ids: [N, n_token]
        auto weight = params["weight"];
@ -2050,11 +2083,11 @@ public:
        // There are issues with ggml batch inference, so we are expanding it here first.
        // TODO: fix ggml batch inference
        int64_t n = input_ids->ne[1];
-        input_ids = ggml_reshape_1d(ctx, input_ids, input_ids->ne[0] * input_ids->ne[1]);
+        input_ids = ggml_reshape_1d(ctx->ggml_ctx, input_ids, input_ids->ne[0] * input_ids->ne[1]);

-        input_ids      = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
-        auto embedding = ggml_get_rows(ctx, weight, input_ids);
-        embedding      = ggml_reshape_3d(ctx, embedding, embedding->ne[0], embedding->ne[1] / n, n);
+        input_ids      = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
+        auto embedding = ggml_get_rows(ctx->ggml_ctx, weight, input_ids);
+        embedding      = ggml_reshape_3d(ctx->ggml_ctx, embedding, embedding->ne[0], embedding->ne[1] / n, n);

        // [N, n_token, embedding_dim]
        return embedding;
@ -2070,10 +2103,9 @@ protected:
    std::pair<int, int> padding;
    std::pair<int, int> dilation;
    bool bias;
-    bool direct = false;
    float scale = 1.f;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels);
        if (bias) {
@ -2098,10 +2130,6 @@ public:
          dilation(dilation),
          bias(bias) {}

-    void enable_direct() {
-        direct = true;
-    }
-
    void set_scale(float scale_value) {
        scale = scale_value;
    }
@ -2110,13 +2138,13 @@ public:
        return "Conv2d";
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = nullptr;
        if (bias) {
            b = params["bias"];
        }
-        return ggml_ext_conv_2d(ctx,
+        return ggml_ext_conv_2d(ctx->ggml_ctx,
                                x,
                                w,
                                b,
@ -2126,7 +2154,7 @@ public:
                                padding.first,
                                dilation.second,
                                dilation.first,
-                                direct,
+                                ctx->conv2d_direct_enabled,
                                scale);
    }
 };
@ -2141,7 +2169,7 @@ protected:
    int64_t dilation;
    bool bias;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx, wtype, 1, kernel_size, in_channels, out_channels);  // 5d => 4d
        if (bias) {
@ -2168,13 +2196,13 @@ public:

    // x: [N, IC, ID, IH*IW]
    // result: [N, OC, OD, OH*OW]
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = nullptr;
        if (bias) {
            b = params["bias"];
        }
-        return ggml_ext_conv_3d_nx1x1(ctx, x, w, b, stride, padding, dilation);
+        return ggml_ext_conv_3d_nx1x1(ctx->ggml_ctx, x, w, b, stride, padding, dilation);
    }
 };

@ -2188,7 +2216,7 @@ protected:
    std::tuple<int, int, int> dilation;
    bool bias;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override {
        enum ggml_type wtype = GGML_TYPE_F16;
        params["weight"]     = ggml_new_tensor_4d(ctx,
                                                  wtype,
@ -2217,13 +2245,13 @@ public:
          dilation(dilation),
          bias(bias) {}

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
        struct ggml_tensor* b = nullptr;
        if (bias) {
            b = params["bias"];
        }
-        return ggml_ext_conv_3d(ctx, x, w, b, in_channels,
+        return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
                                std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                std::get<2>(padding), std::get<1>(padding), std::get<0>(padding),
                                std::get<2>(dilation), std::get<1>(dilation), std::get<0>(dilation));
@ -2237,7 +2265,7 @@ protected:
    bool elementwise_affine;
    bool bias;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        if (elementwise_affine) {
            enum ggml_type wtype = GGML_TYPE_F32;
            params["weight"]     = ggml_new_tensor_1d(ctx, wtype, normalized_shape);
@ -2258,7 +2286,7 @@ public:
          elementwise_affine(elementwise_affine),
          bias(bias) {}

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = nullptr;
        struct ggml_tensor* b = nullptr;

@ -2268,7 +2296,7 @@ public:
                b = params["bias"];
            }
        }
-        return ggml_ext_layer_norm(ctx, x, w, b, eps);
+        return ggml_ext_layer_norm(ctx->ggml_ctx, x, w, b, eps);
    }
 };

@ -2279,7 +2307,7 @@ protected:
    float eps;
    bool affine;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        if (affine) {
            enum ggml_type wtype      = GGML_TYPE_F32;
            enum ggml_type bias_wtype = GGML_TYPE_F32;
@ -2298,14 +2326,14 @@ public:
          eps(eps),
          affine(affine) {}

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = nullptr;
        struct ggml_tensor* b = nullptr;
        if (affine) {
            w = params["weight"];
            b = params["bias"];
        }
-        return ggml_ext_group_norm(ctx, x, w, b, num_groups);
+        return ggml_ext_group_norm(ctx->ggml_ctx, x, w, b, num_groups);
    }
 };

@ -2320,7 +2348,7 @@ protected:
    int64_t hidden_size;
    float eps;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
        enum ggml_type wtype = GGML_TYPE_F32;
        params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
    }
@ -2331,10 +2359,10 @@ public:
        : hidden_size(hidden_size),
          eps(eps) {}

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
-        x                     = ggml_rms_norm(ctx, x, eps);
-        x                     = ggml_mul_inplace(ctx, x, w);
+        x                     = ggml_rms_norm(ctx->ggml_ctx, x, eps);
+        x                     = ggml_mul_inplace(ctx->ggml_ctx, x, w);
        return x;
    }
 };
@ -2343,9 +2371,11 @@ class MultiheadAttention : public GGMLBlock {
 protected:
    int64_t embed_dim;
    int64_t n_head;
+    bool proj_in;
    std::string q_proj_name;
    std::string k_proj_name;
    std::string v_proj_name;
+    std::string in_proj_name;
    std::string out_proj_name;

 public:
@ -2353,37 +2383,57 @@ public:
                       int64_t n_head,
                       bool qkv_proj_bias        = true,
                       bool out_proj_bias        = true,
+                       bool proj_in              = false,
                       std::string q_proj_name   = "q_proj",
                       std::string k_proj_name   = "k_proj",
                       std::string v_proj_name   = "v_proj",
+                       std::string in_proj_name  = "in_proj",
                       std::string out_proj_name = "out_proj")
        : embed_dim(embed_dim),
          n_head(n_head),
+          proj_in(proj_in),
          q_proj_name(q_proj_name),
          k_proj_name(k_proj_name),
          v_proj_name(v_proj_name),
+          in_proj_name(in_proj_name),
          out_proj_name(out_proj_name) {
-        blocks[q_proj_name]   = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
-        blocks[k_proj_name]   = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
-        blocks[v_proj_name]   = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
+        if (proj_in) {
+            blocks[in_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim * 3, qkv_proj_bias));
+        } else {
+            blocks[q_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
+            blocks[k_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
+            blocks[v_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
+        }
        blocks[out_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, out_proj_bias));
    }

    // x: [N, n_token, embed_dim]
-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                bool mask = false) {
-        auto q_proj   = std::dynamic_pointer_cast<Linear>(blocks[q_proj_name]);
-        auto k_proj   = std::dynamic_pointer_cast<Linear>(blocks[k_proj_name]);
-        auto v_proj   = std::dynamic_pointer_cast<Linear>(blocks[v_proj_name]);
        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks[out_proj_name]);

-        struct ggml_tensor* q = q_proj->forward(ctx, x);
-        struct ggml_tensor* k = k_proj->forward(ctx, x);
-        struct ggml_tensor* v = v_proj->forward(ctx, x);
+        ggml_tensor* q;
+        ggml_tensor* k;
+        ggml_tensor* v;
+        if (proj_in) {
+            auto in_proj = std::dynamic_pointer_cast<Linear>(blocks[in_proj_name]);
+            auto qkv     = in_proj->forward(ctx, x);
+            auto qkv_vec = split_qkv(ctx->ggml_ctx, qkv);
+            q            = qkv_vec[0];
+            k            = qkv_vec[1];
+            v            = qkv_vec[2];
+        } else {
+            auto q_proj = std::dynamic_pointer_cast<Linear>(blocks[q_proj_name]);
+            auto k_proj = std::dynamic_pointer_cast<Linear>(blocks[k_proj_name]);
+            auto v_proj = std::dynamic_pointer_cast<Linear>(blocks[v_proj_name]);

-        x = ggml_ext_attention_ext(ctx, backend, q, k, v, n_head, nullptr, mask);  // [N, n_token, embed_dim]
+            q = q_proj->forward(ctx, x);
+            k = k_proj->forward(ctx, x);
+            v = v_proj->forward(ctx, x);
+        }
+
+        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, mask);  // [N, n_token, embed_dim]

        x = out_proj->forward(ctx, x);  // [N, n_token, embed_dim]
        return x;
--- a/otherarch/sdcpp/latent-preview.h
+++ b/otherarch/sdcpp/latent-preview.h
@ -0,0 +1,173 @@
+#include <cstddef>
+#include <cstdint>
+#include "ggml.h"
+
+const float wan_21_latent_rgb_proj[16][3] = {
+    {0.015123f, -0.148418f, 0.479828f},
+    {0.003652f, -0.010680f, -0.037142f},
+    {0.212264f, 0.063033f, 0.016779f},
+    {0.232999f, 0.406476f, 0.220125f},
+    {-0.051864f, -0.082384f, -0.069396f},
+    {0.085005f, -0.161492f, 0.010689f},
+    {-0.245369f, -0.506846f, -0.117010f},
+    {-0.151145f, 0.017721f, 0.007207f},
+    {-0.293239f, -0.207936f, -0.421135f},
+    {-0.187721f, 0.050783f, 0.177649f},
+    {-0.013067f, 0.265964f, 0.166578f},
+    {0.028327f, 0.109329f, 0.108642f},
+    {-0.205343f, 0.043991f, 0.148914f},
+    {0.014307f, -0.048647f, -0.007219f},
+    {0.217150f, 0.053074f, 0.319923f},
+    {0.155357f, 0.083156f, 0.064780f}};
+float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f};
+
+const float wan_22_latent_rgb_proj[48][3] = {
+    {0.017126f, -0.027230f, -0.019257f},
+    {-0.113739f, -0.028715f, -0.022885f},
+    {-0.000106f, 0.021494f, 0.004629f},
+    {-0.013273f, -0.107137f, -0.033638f},
+    {-0.000381f, 0.000279f, 0.025877f},
+    {-0.014216f, -0.003975f, 0.040528f},
+    {0.001638f, -0.000748f, 0.011022f},
+    {0.029238f, -0.006697f, 0.035933f},
+    {0.021641f, -0.015874f, 0.040531f},
+    {-0.101984f, -0.070160f, -0.028855f},
+    {0.033207f, -0.021068f, 0.002663f},
+    {-0.104711f, 0.121673f, 0.102981f},
+    {0.082647f, -0.004991f, 0.057237f},
+    {-0.027375f, 0.031581f, 0.006868f},
+    {-0.045434f, 0.029444f, 0.019287f},
+    {-0.046572f, -0.012537f, 0.006675f},
+    {0.074709f, 0.033690f, 0.025289f},
+    {-0.008251f, -0.002745f, -0.006999f},
+    {0.012685f, -0.061856f, -0.048658f},
+    {0.042304f, -0.007039f, 0.000295f},
+    {-0.007644f, -0.060843f, -0.033142f},
+    {0.159909f, 0.045628f, 0.367541f},
+    {0.095171f, 0.086438f, 0.010271f},
+    {0.006812f, 0.019643f, 0.029637f},
+    {0.003467f, -0.010705f, 0.014252f},
+    {-0.099681f, -0.066272f, -0.006243f},
+    {0.047357f, 0.037040f, 0.000185f},
+    {-0.041797f, -0.089225f, -0.032257f},
+    {0.008928f, 0.017028f, 0.018684f},
+    {-0.042255f, 0.016045f, 0.006849f},
+    {0.011268f, 0.036462f, 0.037387f},
+    {0.011553f, -0.016375f, -0.048589f},
+    {0.046266f, -0.027189f, 0.056979f},
+    {0.009640f, -0.017576f, 0.030324f},
+    {-0.045794f, -0.036083f, -0.010616f},
+    {0.022418f, 0.039783f, -0.032939f},
+    {-0.052714f, -0.015525f, 0.007438f},
+    {0.193004f, 0.223541f, 0.264175f},
+    {-0.059406f, -0.008188f, 0.022867f},
+    {-0.156742f, -0.263791f, -0.007385f},
+    {-0.015717f, 0.016570f, 0.033969f},
+    {0.037969f, 0.109835f, 0.200449f},
+    {-0.000782f, -0.009566f, -0.008058f},
+    {0.010709f, 0.052960f, -0.044195f},
+    {0.017271f, 0.045839f, 0.034569f},
+    {0.009424f, 0.013088f, -0.001714f},
+    {-0.024805f, -0.059378f, -0.033756f},
+    {-0.078293f, 0.029070f, 0.026129f}};
+float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f};
+
+const float flux_latent_rgb_proj[16][3] = {
+    {-0.041168f, 0.019917f, 0.097253f},
+    {0.028096f, 0.026730f, 0.129576f},
+    {0.065618f, -0.067950f, -0.014651f},
+    {-0.012998f, -0.014762f, 0.081251f},
+    {0.078567f, 0.059296f, -0.024687f},
+    {-0.015987f, -0.003697f, 0.005012f},
+    {0.033605f, 0.138999f, 0.068517f},
+    {-0.024450f, -0.063567f, -0.030101f},
+    {-0.040194f, -0.016710f, 0.127185f},
+    {0.112681f, 0.088764f, -0.041940f},
+    {-0.023498f, 0.093664f, 0.025543f},
+    {0.082899f, 0.048320f, 0.007491f},
+    {0.075712f, 0.074139f, 0.081965f},
+    {-0.143501f, 0.018263f, -0.136138f},
+    {-0.025767f, -0.082035f, -0.040023f},
+    {-0.111849f, -0.055589f, -0.032361f}};
+float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
+
+// This one was taken straight from
+// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
+// (MIT License)
+const float sd3_latent_rgb_proj[16][3] = {
+    {-0.0645f, 0.0177f, 0.1052f},
+    {0.0028f, 0.0312f, 0.0650f},
+    {0.1848f, 0.0762f, 0.0360f},
+    {0.0944f, 0.0360f, 0.0889f},
+    {0.0897f, 0.0506f, -0.0364f},
+    {-0.0020f, 0.1203f, 0.0284f},
+    {0.0855f, 0.0118f, 0.0283f},
+    {-0.0539f, 0.0658f, 0.1047f},
+    {-0.0057f, 0.0116f, 0.0700f},
+    {-0.0412f, 0.0281f, -0.0039f},
+    {0.1106f, 0.1171f, 0.1220f},
+    {-0.0248f, 0.0682f, -0.0481f},
+    {0.0815f, 0.0846f, 0.1207f},
+    {-0.0120f, -0.0055f, -0.0867f},
+    {-0.0749f, -0.0634f, -0.0456f},
+    {-0.1418f, -0.1457f, -0.1259f},
+};
+float sd3_latent_rgb_bias[3] = {0, 0, 0};
+
+const float sdxl_latent_rgb_proj[4][3] = {
+    {0.258303f, 0.277640f, 0.329699f},
+    {-0.299701f, 0.105446f, 0.014194f},
+    {0.050522f, 0.186163f, -0.143257f},
+    {-0.211938f, -0.149892f, -0.080036f}};
+float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f};
+
+const float sd_latent_rgb_proj[4][3] = {
+    {0.337366f, 0.216344f, 0.257386f},
+    {0.165636f, 0.386828f, 0.046994f},
+    {-0.267803f, 0.237036f, 0.223517f},
+    {-0.178022f, -0.200862f, -0.678514f}};
+float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
+
+void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
+    size_t buffer_head = 0;
+    for (int k = 0; k < frames; k++) {
+        for (int j = 0; j < height; j++) {
+            for (int i = 0; i < width; i++) {
+                size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
+                float r = 0, g = 0, b = 0;
+                if (latent_rgb_proj != nullptr) {
+                    for (int d = 0; d < dim; d++) {
+                        float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
+                        r += value * latent_rgb_proj[d][0];
+                        g += value * latent_rgb_proj[d][1];
+                        b += value * latent_rgb_proj[d][2];
+                    }
+                } else {
+                    // interpret first 3 channels as RGB
+                    r = *(float*)((char*)latents->data + latent_id + 0 * latents->nb[ggml_n_dims(latents) - 1]);
+                    g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]);
+                    b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]);
+                }
+                if (latent_rgb_bias != nullptr) {
+                    // bias
+                    r += latent_rgb_bias[0];
+                    g += latent_rgb_bias[1];
+                    b += latent_rgb_bias[2];
+                }
+                // change range
+                r = r * .5f + .5f;
+                g = g * .5f + .5f;
+                b = b * .5f + .5f;
+
+                // clamp rgb values to [0,1] range
+                r = r >= 0 ? r <= 1 ? r : 1 : 0;
+                g = g >= 0 ? g <= 1 ? g : 1 : 0;
+                b = b >= 0 ? b <= 1 ? b : 1 : 0;
+
+                buffer[buffer_head++] = (uint8_t)(r * 255);
+                buffer[buffer_head++] = (uint8_t)(g * 255);
+                buffer[buffer_head++] = (uint8_t)(b * 255);
+            }
+        }
+    }
+}
--- a/otherarch/sdcpp/lora.hpp
+++ b/otherarch/sdcpp/lora.hpp
--- a/otherarch/sdcpp/ltxv.hpp
+++ b/otherarch/sdcpp/ltxv.hpp
@ -27,7 +27,7 @@ namespace LTXV {
                                                                     bias));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    bool causal = true) {
            // x: [N*IC, ID, IH, IW]
--- a/otherarch/sdcpp/main.cpp
+++ b/otherarch/sdcpp/main.cpp
@ -46,6 +46,13 @@ const char* modes_str[] = {
 };
 #define SD_ALL_MODES_STR "img_gen, vid_gen, convert, upscale"

+const char* previews_str[] = {
+    "none",
+    "proj",
+    "tae",
+    "vae",
+};
+
 enum SDMode {
    IMG_GEN,
    VID_GEN,
@ -135,6 +142,12 @@ struct SDParams {
    sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
    bool force_sdxl_vae_conv_scale       = false;

+    preview_t preview_method = PREVIEW_NONE;
+    int preview_interval     = 1;
+    std::string preview_path = "preview.png";
+    bool taesd_preview       = false;
+    bool preview_noisy       = false;
+
    SDParams() {
        sd_sample_params_init(&sample_params);
        sd_sample_params_init(&high_noise_sample_params);
@ -210,6 +223,8 @@ void print_params(SDParams params) {
    printf("    video_frames:                      %d\n", params.video_frames);
    printf("    vace_strength:                     %.2f\n", params.vace_strength);
    printf("    fps:                               %d\n", params.fps);
+    printf("    preview_mode:                      %s (%s)\n", previews_str[params.preview_method], params.preview_noisy ? "noisy" : "denoised");
+    printf("    preview_interval:                  %d\n", params.preview_interval);
    free(sample_params_str);
    free(high_noise_sample_params_str);
 }
@ -589,6 +604,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         "--negative-prompt",
         "the negative prompt (default: \"\")",
         &params.negative_prompt},
+        {"",
+         "--preview-path",
+         "path to write preview image to (default: ./preview.png)",
+         &params.preview_path},
        {"",
         "--upscale-model",
         "path to esrgan model.",
@ -647,6 +666,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         "shift timestep for NitroFusion models (default: 0). "
         "recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant",
         &params.sample_params.shifted_timestep},
+        {"",
+         "--preview-interval",
+         "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)",
+         &params.preview_interval},
    };

    options.float_options = {
@ -801,7 +824,14 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         "--disable-auto-resize-ref-image",
         "disable auto resize of ref images",
         false, &params.auto_resize_ref_image},
-    };
+        {"",
+         "--taesd-preview-only",
+         std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")",
+         true, &params.taesd_preview},
+        {"",
+         "--preview-noisy",
+         "enables previewing noisy inputs of the models rather than the denoised outputs",
+         true, &params.preview_noisy}};

    auto on_mode_arg = [&](int argc, const char** argv, int index) {
        if (++index >= argc) {
@ -1046,6 +1076,26 @@ void parse_args(int argc, const char** argv, SDParams& params) {
        return 1;
    };

+    // Handles "--preview <method>": returns 1 when the name matches an entry of previews_str, -1 if the value is missing or unknown.
+    auto on_preview_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* preview = argv[index];
+        int preview_method  = -1;
+        for (int m = 0; m < PREVIEW_COUNT; m++) {
+            if (!strcmp(preview, previews_str[m])) {
+                preview_method = m;
+            }
+        }
+        if (preview_method == -1) {
+            fprintf(stderr, "error: invalid preview method %s\n", preview);
+            return -1;
+        }
+        params.preview_method = (preview_t)preview_method;
+        return 1;
+    };
+
    options.manual_options = {
        {"-M",
         "--mode",
@ -1110,6 +1160,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         "--vae-relative-tile-size",
         "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
         on_relative_tile_size_arg},
+        {"",
+         "--preview",
+         std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+         on_preview_arg},
    };

    if (!parse_options(argc, argv, options)) {
@ -1452,15 +1506,50 @@ bool load_images_from_dir(const std::string dir,
    return true;
 }

+const char* preview_path;
+float preview_fps;
+
+void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy) {
+    (void)step;
+    (void)is_noisy;
+    // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents
+    // unused in this app, it will either be always noisy or always denoised here
+    if (frame_count == 1) {
+        stbi_write_png(preview_path, image->width, image->height, image->channel, image->data, 0);
+    } else {
+        create_mjpg_avi_from_sd_images(preview_path, image, frame_count, preview_fps);
+    }
+}
+
 int main(int argc, const char* argv[]) {
    SDParams params;
    parse_args(argc, argv, params);
+    if (params.video_frames > 4) {
+        size_t last_dot_pos   = params.preview_path.find_last_of(".");
+        std::string base_path = params.preview_path;
+        std::string file_ext  = "";
+        if (last_dot_pos != std::string::npos) {  // filename has extension
+            base_path = params.preview_path.substr(0, last_dot_pos);
+            file_ext  = params.preview_path.substr(last_dot_pos);
+            std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower);
+        }
+        if (file_ext == ".png") {
+            params.preview_path = base_path + ".avi";  // store in params so the buffer outlives this block
+        }
+    }
+    // Take the pointer only once the path is final: c_str() of the block-local base_path would dangle here.
+    preview_path = params.preview_path.c_str();
+    preview_fps = params.fps;
+    if (params.preview_method == PREVIEW_PROJ)
+        preview_fps /= 4.0f;
+
    params.sample_params.guidance.slg.layers                 = params.skip_layers.data();
    params.sample_params.guidance.slg.layer_count            = params.skip_layers.size();
    params.high_noise_sample_params.guidance.slg.layers      = params.high_noise_skip_layers.data();
    params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size();

    sd_set_log_callback(sd_log_cb, (void*)&params);
+    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy);

    if (params.verbose) {
        print_params(params);
@ -1654,6 +1743,7 @@ int main(int argc, const char* argv[]) {
        params.control_net_cpu,
        params.vae_on_cpu,
        params.diffusion_flash_attn,
+        params.taesd_preview,
        params.diffusion_conv_direct,
        params.vae_conv_direct,
        params.force_sdxl_vae_conv_scale,
--- a/otherarch/sdcpp/mmdit.hpp
+++ b/otherarch/sdcpp/mmdit.hpp
@ -27,13 +27,13 @@ public:
        blocks["fc2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_features, out_features, bias));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, n_token, in_features]
        auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
        auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);

        x = fc1->forward(ctx, x);
-        x = ggml_gelu_inplace(ctx, x);
+        x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        x = fc2->forward(ctx, x);
        return x;
    }
@ -72,7 +72,7 @@ public:
                                                               bias));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, C, H, W]
        // return: [N, H*W, embed_dim]
        auto proj = std::dynamic_pointer_cast<Conv2d>(blocks["proj"]);
@ -82,13 +82,13 @@ public:
            int64_t H = x->ne[1];
            int pad_h = (patch_size - H % patch_size) % patch_size;
            int pad_w = (patch_size - W % patch_size) % patch_size;
-            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // TODO: reflect pad mode
+            x         = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0);  // TODO: reflect pad mode
        }
        x = proj->forward(ctx, x);

        if (flatten) {
-            x = ggml_reshape_3d(ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, x->ne[0] * x->ne[1], x->ne[2], x->ne[3]);
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));
        }
        return x;
    }
@ -107,16 +107,16 @@ public:
        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* t) {
        // t: [N, ]
        // return: [N, hidden_size]
        auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
        auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);

-        auto t_freq = ggml_ext_timestep_embedding(ctx, t, frequency_embedding_size);  // [N, frequency_embedding_size]
+        auto t_freq = ggml_ext_timestep_embedding(ctx->ggml_ctx, t, frequency_embedding_size);  // [N, frequency_embedding_size]

        auto t_emb = mlp_0->forward(ctx, t_freq);
-        t_emb      = ggml_silu_inplace(ctx, t_emb);
+        t_emb      = ggml_silu_inplace(ctx->ggml_ctx, t_emb);
        t_emb      = mlp_2->forward(ctx, t_emb);
        return t_emb;
    }
@ -131,14 +131,14 @@ public:
        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, input_dim]
        // return: [N, hidden_size]
        auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
        auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);

        x = mlp_0->forward(ctx, x);
-        x = ggml_silu_inplace(ctx, x);
+        x = ggml_silu_inplace(ctx->ggml_ctx, x);
        x = mlp_2->forward(ctx, x);
        return x;
    }
@ -149,16 +149,14 @@ public:
    int64_t num_heads;
    bool pre_only;
    std::string qk_norm;
-    bool flash_attn;

 public:
    SelfAttention(int64_t dim,
                  int64_t num_heads   = 8,
                  std::string qk_norm = "",
                  bool qkv_bias       = false,
-                  bool pre_only       = false,
-                  bool flash_attn     = false)
-        : num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm), flash_attn(flash_attn) {
+                  bool pre_only       = false)
+        : num_heads(num_heads), pre_only(pre_only), qk_norm(qk_norm) {
        int64_t d_head = dim / num_heads;
        blocks["qkv"]  = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
        if (!pre_only) {
@ -173,15 +171,15 @@ public:
        }
    }

-    std::vector<struct ggml_tensor*> pre_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+    std::vector<struct ggml_tensor*> pre_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);

        auto qkv         = qkv_proj->forward(ctx, x);
-        auto qkv_vec     = split_qkv(ctx, qkv);
+        auto qkv_vec     = split_qkv(ctx->ggml_ctx, qkv);
        int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
-        auto q           = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);  // [N, n_token, n_head, d_head]
-        auto k           = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
-        auto v           = qkv_vec[2];                                                                                   // [N, n_token, n_head*d_head]
+        auto q           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);  // [N, n_token, n_head, d_head]
+        auto k           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
+        auto v           = qkv_vec[2];                                                                                             // [N, n_token, n_head*d_head]

        if (qk_norm == "rms" || qk_norm == "ln") {
            auto ln_q = std::dynamic_pointer_cast<UnaryBlock>(blocks["ln_q"]);
@ -190,13 +188,13 @@ public:
            k         = ln_k->forward(ctx, k);
        }

-        q = ggml_reshape_3d(ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]);  // [N, n_token, n_head*d_head]
-        k = ggml_reshape_3d(ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]);  // [N, n_token, n_head*d_head]
+        q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0] * q->ne[1], q->ne[2], q->ne[3]);  // [N, n_token, n_head*d_head]
+        k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0] * k->ne[1], k->ne[2], k->ne[3]);  // [N, n_token, n_head*d_head]

        return {q, k, v};
    }

-    struct ggml_tensor* post_attention(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* post_attention(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        GGML_ASSERT(!pre_only);

        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
@ -206,12 +204,11 @@ public:
    }

    // x: [N, n_token, dim]
-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x) {
        auto qkv = pre_attention(ctx, x);
-        x        = ggml_ext_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, true);  // [N, n_token, dim]
-        x        = post_attention(ctx, x);                                                                                // [N, n_token, dim]
+        x        = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, dim]
+        x        = post_attention(ctx, x);                                                                                                                  // [N, n_token, dim]
        return x;
    }
 };
@ -236,7 +233,6 @@ public:
    int64_t num_heads;
    bool pre_only;
    bool self_attn;
-    bool flash_attn;

 public:
    DismantledBlock(int64_t hidden_size,
@ -245,17 +241,16 @@ public:
                    std::string qk_norm = "",
                    bool qkv_bias       = false,
                    bool pre_only       = false,
-                    bool self_attn      = false,
-                    bool flash_attn     = false)
+                    bool self_attn      = false)
        : num_heads(num_heads), pre_only(pre_only), self_attn(self_attn) {
        // rmsnorm is always Flase
        // scale_mod_only is always Flase
        // swiglu is always Flase
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
-        blocks["attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only, flash_attn));
+        blocks["attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, pre_only));

        if (self_attn) {
-            blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false, flash_attn));
+            blocks["attn2"] = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qk_norm, qkv_bias, false));
        }

        if (!pre_only) {
@ -274,9 +269,9 @@ public:
        blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, n_mods * hidden_size));
    }

-    std::tuple<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention_x(struct ggml_context* ctx,
-                                                                                                                                     struct ggml_tensor* x,
-                                                                                                                                     struct ggml_tensor* c) {
+    std::tuple<std::vector<ggml_tensor*>, std::vector<ggml_tensor*>, std::vector<ggml_tensor*>> pre_attention_x(GGMLRunnerContext* ctx,
+                                                                                                                struct ggml_tensor* x,
+                                                                                                                struct ggml_tensor* c) {
        GGML_ASSERT(self_attn);
        // x: [N, n_token, hidden_size]
        // c: [N, hidden_size]
@ -286,35 +281,35 @@ public:
        auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);

        int64_t n_mods = 9;
-        auto m         = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, n_mods * hidden_size]
-        m              = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]);  // [N, n_mods, hidden_size]
-        m              = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [n_mods, N, hidden_size]
+        auto m         = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c));         // [N, n_mods * hidden_size]
+        m              = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]);         // [N, n_mods, hidden_size]
+        m              = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3));  // [n_mods, N, hidden_size]

        int64_t offset = m->nb[1] * m->ne[1];
-        auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
-        auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
-        auto gate_msa  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2);  // [N, hidden_size]
+        auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
+        auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
+        auto gate_msa  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2);  // [N, hidden_size]

-        auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3);  // [N, hidden_size]
-        auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4);  // [N, hidden_size]
-        auto gate_mlp  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5);  // [N, hidden_size]
+        auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3);  // [N, hidden_size]
+        auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4);  // [N, hidden_size]
+        auto gate_mlp  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5);  // [N, hidden_size]

-        auto shift_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6);  // [N, hidden_size]
-        auto scale_msa2 = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7);  // [N, hidden_size]
-        auto gate_msa2  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8);  // [N, hidden_size]
+        auto shift_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 6);  // [N, hidden_size]
+        auto scale_msa2 = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 7);  // [N, hidden_size]
+        auto gate_msa2  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 8);  // [N, hidden_size]

        auto x_norm = norm1->forward(ctx, x);

-        auto attn_in = modulate(ctx, x_norm, shift_msa, scale_msa);
+        auto attn_in = modulate(ctx->ggml_ctx, x_norm, shift_msa, scale_msa);
        auto qkv     = attn->pre_attention(ctx, attn_in);

-        auto attn2_in = modulate(ctx, x_norm, shift_msa2, scale_msa2);
+        auto attn2_in = modulate(ctx->ggml_ctx, x_norm, shift_msa2, scale_msa2);
        auto qkv2     = attn2->pre_attention(ctx, attn2_in);

        return {qkv, qkv2, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp, gate_msa2}};
    }

-    std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(struct ggml_context* ctx,
+    std::pair<std::vector<struct ggml_tensor*>, std::vector<struct ggml_tensor*>> pre_attention(GGMLRunnerContext* ctx,
                                                                                                struct ggml_tensor* x,
                                                                                                struct ggml_tensor* c) {
        // x: [N, n_token, hidden_size]
@ -327,33 +322,33 @@ public:
        if (pre_only) {
            n_mods = 2;
        }
-        auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, n_mods * hidden_size]
-        m      = ggml_reshape_3d(ctx, m, c->ne[0], n_mods, c->ne[1]);  // [N, n_mods, hidden_size]
-        m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [n_mods, N, hidden_size]
+        auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c));         // [N, n_mods * hidden_size]
+        m      = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], n_mods, c->ne[1]);         // [N, n_mods, hidden_size]
+        m      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3));  // [n_mods, N, hidden_size]

        int64_t offset = m->nb[1] * m->ne[1];
-        auto shift_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
-        auto scale_msa = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
+        auto shift_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
+        auto scale_msa = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
        if (!pre_only) {
-            auto gate_msa  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2);  // [N, hidden_size]
-            auto shift_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3);  // [N, hidden_size]
-            auto scale_mlp = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4);  // [N, hidden_size]
-            auto gate_mlp  = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5);  // [N, hidden_size]
+            auto gate_msa  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 2);  // [N, hidden_size]
+            auto shift_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 3);  // [N, hidden_size]
+            auto scale_mlp = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 4);  // [N, hidden_size]
+            auto gate_mlp  = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 5);  // [N, hidden_size]

-            auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
+            auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);

            auto qkv = attn->pre_attention(ctx, attn_in);

            return {qkv, {x, gate_msa, shift_mlp, scale_mlp, gate_mlp}};
        } else {
-            auto attn_in = modulate(ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
+            auto attn_in = modulate(ctx->ggml_ctx, norm1->forward(ctx, x), shift_msa, scale_msa);
            auto qkv     = attn->pre_attention(ctx, attn_in);

            return {qkv, {nullptr, nullptr, nullptr, nullptr, nullptr}};
        }
    }

-    struct ggml_tensor* post_attention_x(struct ggml_context* ctx,
+    struct ggml_tensor* post_attention_x(GGMLRunnerContext* ctx,
                                         struct ggml_tensor* attn_out,
                                         struct ggml_tensor* attn2_out,
                                         struct ggml_tensor* x,
@ -376,22 +371,22 @@ public:
        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
        auto mlp   = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);

-        gate_msa  = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);     // [N, 1, hidden_size]
-        gate_mlp  = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);     // [N, 1, hidden_size]
-        gate_msa2 = ggml_reshape_3d(ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]);  // [N, 1, hidden_size]
+        gate_msa  = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);     // [N, 1, hidden_size]
+        gate_mlp  = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);     // [N, 1, hidden_size]
+        gate_msa2 = ggml_reshape_3d(ctx->ggml_ctx, gate_msa2, gate_msa2->ne[0], 1, gate_msa2->ne[1]);  // [N, 1, hidden_size]

        attn_out  = attn->post_attention(ctx, attn_out);
        attn2_out = attn2->post_attention(ctx, attn2_out);

-        x            = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
-        x            = ggml_add(ctx, x, ggml_mul(ctx, attn2_out, gate_msa2));
-        auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
-        x            = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn2_out, gate_msa2));
+        auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));

        return x;
    }

-    struct ggml_tensor* post_attention(struct ggml_context* ctx,
+    struct ggml_tensor* post_attention(GGMLRunnerContext* ctx,
                                       struct ggml_tensor* attn_out,
                                       struct ggml_tensor* x,
                                       struct ggml_tensor* gate_msa,
@ -411,20 +406,19 @@ public:
        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
        auto mlp   = std::dynamic_pointer_cast<Mlp>(blocks["mlp"]);

-        gate_msa = ggml_reshape_3d(ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);  // [N, 1, hidden_size]
-        gate_mlp = ggml_reshape_3d(ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);  // [N, 1, hidden_size]
+        gate_msa = ggml_reshape_3d(ctx->ggml_ctx, gate_msa, gate_msa->ne[0], 1, gate_msa->ne[1]);  // [N, 1, hidden_size]
+        gate_mlp = ggml_reshape_3d(ctx->ggml_ctx, gate_mlp, gate_mlp->ne[0], 1, gate_mlp->ne[1]);  // [N, 1, hidden_size]

        attn_out = attn->post_attention(ctx, attn_out);

-        x            = ggml_add(ctx, x, ggml_mul(ctx, attn_out, gate_msa));
-        auto mlp_out = mlp->forward(ctx, modulate(ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
-        x            = ggml_add(ctx, x, ggml_mul(ctx, mlp_out, gate_mlp));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, attn_out, gate_msa));
+        auto mlp_out = mlp->forward(ctx, modulate(ctx->ggml_ctx, norm2->forward(ctx, x), shift_mlp, scale_mlp));
+        x            = ggml_add(ctx->ggml_ctx, x, ggml_mul(ctx->ggml_ctx, mlp_out, gate_mlp));

        return x;
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* c) {
        // x: [N, n_token, hidden_size]
@ -441,8 +435,8 @@ public:
            auto qkv2          = std::get<1>(qkv_intermediates);
            auto intermediates = std::get<2>(qkv_intermediates);

-            auto attn_out  = ggml_ext_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, flash_attn);     // [N, n_token, dim]
-            auto attn2_out = ggml_ext_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, false, flash_attn);  // [N, n_token, dim]
+            auto attn_out  = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled);     // [N, n_token, dim]
+            auto attn2_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv2[0], qkv2[1], qkv2[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, dim]
            x              = post_attention_x(ctx,
                                              attn_out,
                                              attn2_out,
@ -458,7 +452,7 @@ public:
            auto qkv               = qkv_intermediates.first;
            auto intermediates     = qkv_intermediates.second;

-            auto attn_out = ggml_ext_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, flash_attn);  // [N, n_token, dim]
+            auto attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, dim]
            x             = post_attention(ctx,
                                           attn_out,
                                           intermediates[0],
@ -472,9 +466,7 @@ public:
 };

 __STATIC_INLINE__ std::pair<struct ggml_tensor*, struct ggml_tensor*>
-block_mixing(struct ggml_context* ctx,
-             ggml_backend_t backend,
-             bool flash_attn,
+block_mixing(GGMLRunnerContext* ctx,
             struct ggml_tensor* context,
             struct ggml_tensor* x,
             struct ggml_tensor* c,
@ -501,29 +493,29 @@ block_mixing(struct ggml_context* ctx,
    }
    std::vector<struct ggml_tensor*> qkv;
    for (int i = 0; i < 3; i++) {
-        qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1));
+        qkv.push_back(ggml_concat(ctx->ggml_ctx, context_qkv[i], x_qkv[i], 1));
    }

-    auto attn         = ggml_ext_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, false, flash_attn);  // [N, n_context + n_token, hidden_size]
-    attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));                                                                  // [n_context + n_token, N, hidden_size]
-    auto context_attn = ggml_view_3d(ctx,
+    auto attn         = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_context + n_token, hidden_size]
+    attn              = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3));                                                                          // [n_context + n_token, N, hidden_size]
+    auto context_attn = ggml_view_3d(ctx->ggml_ctx,
                                     attn,
                                     attn->ne[0],
                                     attn->ne[1],
                                     context->ne[1],
                                     attn->nb[1],
                                     attn->nb[2],
-                                     0);                                              // [n_context, N, hidden_size]
-    context_attn      = ggml_cont(ctx, ggml_permute(ctx, context_attn, 0, 2, 1, 3));  // [N, n_context, hidden_size]
-    auto x_attn       = ggml_view_3d(ctx,
+                                     0);                                                                  // [n_context, N, hidden_size]
+    context_attn      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, context_attn, 0, 2, 1, 3));  // [N, n_context, hidden_size]
+    auto x_attn       = ggml_view_3d(ctx->ggml_ctx,
                                     attn,
                                     attn->ne[0],
                                     attn->ne[1],
                                     x->ne[1],
                                     attn->nb[1],
                                     attn->nb[2],
-                                     attn->nb[2] * context->ne[1]);             // [n_token, N, hidden_size]
-    x_attn            = ggml_cont(ctx, ggml_permute(ctx, x_attn, 0, 2, 1, 3));  // [N, n_token, hidden_size]
+                                     attn->nb[2] * context->ne[1]);                                 // [n_token, N, hidden_size]
+    x_attn            = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x_attn, 0, 2, 1, 3));  // [N, n_token, hidden_size]

    if (!context_block->pre_only) {
        context = context_block->post_attention(ctx,
@ -538,7 +530,7 @@ block_mixing(struct ggml_context* ctx,
    }

    if (x_block->self_attn) {
-        auto attn2 = ggml_ext_attention_ext(ctx, backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads);  // [N, n_token, hidden_size]
+        auto attn2 = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads, nullptr, false, false, ctx->flash_attn_enabled);  // [N, n_token, hidden_size]

        x = x_block->post_attention_x(ctx,
                                      x_attn,
@ -563,8 +555,6 @@ block_mixing(struct ggml_context* ctx,
 }

 struct JointBlock : public GGMLBlock {
-    bool flash_attn;
-
 public:
    JointBlock(int64_t hidden_size,
               int64_t num_heads,
@ -572,22 +562,19 @@ public:
               std::string qk_norm = "",
               bool qkv_bias       = false,
               bool pre_only       = false,
-               bool self_attn_x    = false,
-               bool flash_attn     = false)
-        : flash_attn(flash_attn) {
-        blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false, flash_attn));
-        blocks["x_block"]       = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x, flash_attn));
+               bool self_attn_x    = false) {
+        blocks["context_block"] = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, pre_only, false));
+        blocks["x_block"]       = std::shared_ptr<GGMLBlock>(new DismantledBlock(hidden_size, num_heads, mlp_ratio, qk_norm, qkv_bias, false, self_attn_x));
    }

-    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
-                                                                ggml_backend_t backend,
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                                struct ggml_tensor* context,
                                                                struct ggml_tensor* x,
                                                                struct ggml_tensor* c) {
        auto context_block = std::dynamic_pointer_cast<DismantledBlock>(blocks["context_block"]);
        auto x_block       = std::dynamic_pointer_cast<DismantledBlock>(blocks["x_block"]);

-        return block_mixing(ctx, backend, flash_attn, context, x, c, context_block, x_block);
+        return block_mixing(ctx, context, x, c, context_block, x_block);
    }
 };

@ -603,7 +590,7 @@ public:
        blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* c) {
        // x: [N, n_token, hidden_size]
@ -613,15 +600,15 @@ public:
        auto linear             = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
        auto adaLN_modulation_1 = std::dynamic_pointer_cast<Linear>(blocks["adaLN_modulation.1"]);

-        auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx, c));  // [N, 2 * hidden_size]
-        m      = ggml_reshape_3d(ctx, m, c->ne[0], 2, c->ne[1]);       // [N, 2, hidden_size]
-        m      = ggml_cont(ctx, ggml_permute(ctx, m, 0, 2, 1, 3));     // [2, N, hidden_size]
+        auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c));         // [N, 2 * hidden_size]
+        m      = ggml_reshape_3d(ctx->ggml_ctx, m, c->ne[0], 2, c->ne[1]);              // [N, 2, hidden_size]
+        m      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, m, 0, 2, 1, 3));  // [2, N, hidden_size]

        int64_t offset = m->nb[1] * m->ne[1];
-        auto shift     = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
-        auto scale     = ggml_view_2d(ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]
+        auto shift     = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 0);  // [N, hidden_size]
+        auto scale     = ggml_view_2d(ctx->ggml_ctx, m, m->ne[0], m->ne[1], m->nb[1], offset * 1);  // [N, hidden_size]

-        x = modulate(ctx, norm_final->forward(ctx, x), shift, scale);
+        x = modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale);
        x = linear->forward(ctx, x);

        return x;
@ -645,16 +632,14 @@ protected:
    int64_t context_embedder_out_dim = 1536;
    int64_t hidden_size;
    std::string qk_norm;
-    bool flash_attn = false;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") override {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
        enum ggml_type wtype = GGML_TYPE_F32;
        params["pos_embed"]  = ggml_new_tensor_3d(ctx, wtype, hidden_size, num_patchs, 1);
    }

 public:
-    MMDiT(bool flash_attn = false, const String2GGMLType& tensor_types = {})
-        : flash_attn(flash_attn) {
+    MMDiT(const String2TensorStorage& tensor_storage_map = {}) {
        // input_size is always None
        // learn_sigma is always False
        // register_length is always 0
@ -667,8 +652,7 @@ public:
        // pos_embed_offset is not used
        // context_embedder_config is always {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}

-        // read tensors from tensor_types
-        for (auto pair : tensor_types) {
+        for (auto pair : tensor_storage_map) {
            std::string tensor_name = pair.first;
            if (tensor_name.find("model.diffusion_model.") == std::string::npos)
                continue;
@ -722,8 +706,7 @@ public:
                                                                                                    qk_norm,
                                                                                                    true,
                                                                                                    i == depth - 1,
-                                                                                                    i <= d_self,
-                                                                                                    flash_attn));
+                                                                                                    i <= d_self));
        }

        blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new FinalLayer(hidden_size, patch_size, out_channels));
@ -791,8 +774,7 @@ public:
        return x;
    }

-    struct ggml_tensor* forward_core_with_concat(struct ggml_context* ctx,
-                                                 ggml_backend_t backend,
+    struct ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx,
                                                 struct ggml_tensor* x,
                                                 struct ggml_tensor* c_mod,
                                                 struct ggml_tensor* context,
@ -811,7 +793,7 @@ public:

            auto block = std::dynamic_pointer_cast<JointBlock>(blocks["joint_blocks." + std::to_string(i)]);

-            auto context_x = block->forward(ctx, backend, context, x, c_mod);
+            auto context_x = block->forward(ctx, context, x, c_mod);
            context        = context_x.first;
            x              = context_x.second;
        }
@ -821,8 +803,7 @@ public:
        return x;
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* t,
                                struct ggml_tensor* y        = nullptr,
@ -840,16 +821,16 @@ public:
        int64_t w = x->ne[0];
        int64_t h = x->ne[1];

-        auto patch_embed = x_embedder->forward(ctx, x);            // [N, H*W, hidden_size]
-        auto pos_embed   = cropped_pos_embed(ctx, h, w);           // [1, H*W, hidden_size]
-        x                = ggml_add(ctx, patch_embed, pos_embed);  // [N, H*W, hidden_size]
+        auto patch_embed = x_embedder->forward(ctx, x);                      // [N, H*W, hidden_size]
+        auto pos_embed   = cropped_pos_embed(ctx->ggml_ctx, h, w);           // [1, H*W, hidden_size]
+        x                = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed);  // [N, H*W, hidden_size]

        auto c = t_embedder->forward(ctx, t);  // [N, hidden_size]
        if (y != nullptr && adm_in_channels != -1) {
            auto y_embedder = std::dynamic_pointer_cast<VectorEmbedder>(blocks["y_embedder"]);

            y = y_embedder->forward(ctx, y);  // [N, hidden_size]
-            c = ggml_add(ctx, c, y);
+            c = ggml_add(ctx->ggml_ctx, c, y);
        }

        if (context != nullptr) {
@ -858,9 +839,9 @@ public:
            context = context_embedder->forward(ctx, context);  // [N, L, D] aka [N, L, 1536]
        }

-        x = forward_core_with_concat(ctx, backend, x, c, context, skip_layers);  // (N, H*W, patch_size ** 2 * out_channels)
+        x = forward_core_with_concat(ctx, x, c, context, skip_layers);  // (N, H*W, patch_size ** 2 * out_channels)

-        x = unpatchify(ctx, x, h, w);  // [N, C, H, W]
+        x = unpatchify(ctx->ggml_ctx, x, h, w);  // [N, C, H, W]

        return x;
    }
@ -870,11 +851,10 @@ struct MMDiTRunner : public GGMLRunner {

    MMDiTRunner(ggml_backend_t backend,
                bool offload_params_to_cpu,
-                bool flash_attn,
-                const String2GGMLType& tensor_types = {},
-                const std::string prefix            = "")
-        : GGMLRunner(backend, offload_params_to_cpu), mmdit(flash_attn, tensor_types) {
-        mmdit.init(params_ctx, tensor_types, prefix);
+                const String2TensorStorage& tensor_storage_map = {},
+                const std::string prefix                       = "")
+        : GGMLRunner(backend, offload_params_to_cpu), mmdit(tensor_storage_map) {
+        mmdit.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
@ -897,8 +877,8 @@ struct MMDiTRunner : public GGMLRunner {
        y         = to_backend(y);
        timesteps = to_backend(timesteps);

-        struct ggml_tensor* out = mmdit.forward(compute_ctx,
-                                                runtime_backend,
+        auto runner_ctx         = get_context();
+        struct ggml_tensor* out = mmdit.forward(&runner_ctx,
                                                x,
                                                timesteps,
                                                y,
@ -972,7 +952,7 @@ struct MMDiTRunner : public GGMLRunner {
        // ggml_backend_t backend    = ggml_backend_cuda_init(0);
        ggml_backend_t backend             = ggml_backend_cpu_init();
        ggml_type model_data_type          = GGML_TYPE_F16;
-        std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, false, false);
+        std::shared_ptr<MMDiTRunner> mmdit = std::make_shared<MMDiTRunner>(backend, false);
        {
            LOG_INFO("loading from '%s'", file_path.c_str());

@ -981,7 +961,7 @@ struct MMDiTRunner : public GGMLRunner {
            mmdit->get_param_tensors(tensors, "model.diffusion_model");

            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path)) {
+            if (!model_loader.init_from_file_and_convert_name(file_path)) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
--- a/otherarch/sdcpp/model.h
+++ b/otherarch/sdcpp/model.h
@ -15,6 +15,7 @@
 #include "ggml.h"
 #include "gguf.h"
 #include <nlohmann/json.hpp>
+#include "ordered_map.hpp"
 #include "zip.h"

 #define SD_MAX_DIMS 5
@ -26,6 +27,7 @@ enum SDVersion {
    VERSION_SD1_TINY_UNET,
    VERSION_SD2,
    VERSION_SD2_INPAINT,
+    VERSION_SD2_TINY_UNET,
    VERSION_SDXL,
    VERSION_SDXL_INPAINT,
    VERSION_SDXL_PIX2PIX,
@ -52,7 +54,7 @@ static inline bool sd_version_is_sd1(SDVersion version) {
 }

 static inline bool sd_version_is_sd2(SDVersion version) {
-    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) {
+    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
        return true;
    }
    return false;
@ -65,6 +67,15 @@ static inline bool sd_version_is_sdxl(SDVersion version) {
    return false;
 }

+static inline bool sd_version_is_unet(SDVersion version) {  // true when `version` falls in the SD1, SD2 or SDXL family, false otherwise
+    if (sd_version_is_sd1(version) ||  // delegates to the three per-family predicates defined earlier in this header
+        sd_version_is_sd2(version) ||
+        sd_version_is_sdxl(version)) {
+        return true;
+    }
+    return false;  // every other SDVersion value is not treated as a UNet model
+}
+
 static inline bool sd_version_is_sd3(SDVersion version) {
    if (version == VERSION_SD3) {
        return true;
@ -98,7 +109,11 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
 }

 static inline bool sd_version_is_inpaint(SDVersion version) {
-    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL || version == VERSION_FLEX_2) {
+    if (version == VERSION_SD1_INPAINT ||
+        version == VERSION_SD2_INPAINT ||
+        version == VERSION_SDXL_INPAINT ||
+        version == VERSION_FLUX_FILL ||
+        version == VERSION_FLEX_2) {
        return true;
    }
    return false;
@ -134,6 +149,7 @@ enum PMVersion {
 struct TensorStorage {
    std::string name;
    ggml_type type          = GGML_TYPE_F32;
+    ggml_type expected_type = GGML_TYPE_COUNT;
    bool is_bf16            = false;
    bool is_f8_e4m3         = false;
    bool is_f8_e5m2         = false;
@ -242,12 +258,15 @@ struct TensorStorage {

 typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;

-typedef std::map<std::string, enum ggml_type> String2GGMLType;
+typedef OrderedMap<std::string, TensorStorage> String2TensorStorage;

 class ModelLoader {
 protected:
+    SDVersion version_ = VERSION_COUNT;
    std::vector<std::string> file_paths_;
-    std::vector<TensorStorage> tensor_storages;
+    String2TensorStorage tensor_storage_map;
+
+    void add_tensor_storage(const TensorStorage& tensor_storage);

    bool parse_data_pkl(uint8_t* buffer,
                        size_t buffer_size,
@ -262,16 +281,18 @@ protected:
    bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");

 public:
-    String2GGMLType tensor_storages_types;
-
    bool init_from_file(const std::string& file_path, const std::string& prefix = "");
    bool has_diffusion_model_tensors();
-    bool model_is_unet();
+    void convert_tensors_name();
+    bool init_from_file_and_convert_name(const std::string& file_path,
+                                         const std::string& prefix = "",
+                                         SDVersion version         = VERSION_COUNT);
    SDVersion get_sd_version();
    std::map<ggml_type, uint32_t> get_wtype_stat();
    std::map<ggml_type, uint32_t> get_conditioner_wtype_stat();
    std::map<ggml_type, uint32_t> get_diffusion_model_wtype_stat();
    std::map<ggml_type, uint32_t> get_vae_wtype_stat();
+    String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; }
    void set_wtype_override(ggml_type wtype, std::string prefix = "");
    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
    bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
@ -280,8 +301,10 @@ public:

+    // Returns the key of every entry in tensor_storage_map, in the map's iteration order.
    std::vector<std::string> get_tensor_names() const {
        std::vector<std::string> names;
+        names.reserve(tensor_storage_map.size());  // final size is known up front; avoids repeated regrowth
-        for (const auto& ts : tensor_storages) {
-            names.push_back(ts.name);
+        for (const auto& [name, tensor_storage] : tensor_storage_map) {
+            names.push_back(name);
        }
        return names;
    }
--- a/otherarch/sdcpp/name_conversion.cpp
+++ b/otherarch/sdcpp/name_conversion.cpp
--- a/otherarch/sdcpp/name_conversion.h
+++ b/otherarch/sdcpp/name_conversion.h
@ -0,0 +1,13 @@
+#ifndef __NAME_CONVERSION_H__
+#define __NAME_CONVERSION_H__
+
+#include <string>
+
+#include "model.h"
+
+// Returns the converted form of a tensor name for the given model version.
+// NOTE(review): the conversion rules are implemented in name_conversion.cpp,
+// which is not visible from this header - confirm the exact mapping there.
+std::string convert_tensor_name(std::string name, SDVersion version);
+
+#endif  // __NAME_CONVERSION_H__
--- a/otherarch/sdcpp/ordered_map.hpp
+++ b/otherarch/sdcpp/ordered_map.hpp
@ -0,0 +1,216 @@
+#ifndef __ORDERED_MAP_HPP__
+#define __ORDERED_MAP_HPP__
+
+#include <initializer_list>
+#include <iostream>
+#include <iterator>
+#include <list>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+// Associative container that remembers insertion order.
+//
+// Entries are stored in a std::list in the order their keys were first
+// inserted; a std::unordered_map from key to list iterator gives hashed
+// lookup. begin()/end() walk the list, so iteration follows insertion order.
+// Inserting a key that is already present neither replaces nor reorders it.
+template <typename Key, typename T>
+class OrderedMap {
+public:
+    using key_type        = Key;
+    using mapped_type     = T;
+    using value_type      = std::pair<const Key, T>;
+    using list_type       = std::list<value_type>;
+    using size_type       = typename list_type::size_type;
+    using difference_type = typename list_type::difference_type;
+    using iterator        = typename list_type::iterator;
+    using const_iterator  = typename list_type::const_iterator;
+
+private:
+    list_type data_;                           // entries, in insertion order
+    std::unordered_map<Key, iterator> index_;  // key -> position of its entry in data_
+
+    // Records the last node of data_ (just appended, key known to be new) in index_.
+    iterator index_last_entry() {
+        auto iter = std::prev(data_.end());
+        index_.emplace(iter->first, iter);
+        return iter;
+    }
+
+    // Rebuilds index_ so that every stored iterator points into this object's data_.
+    void rebuild_index() {
+        index_.clear();
+        index_.reserve(data_.size());
+        for (auto iter = data_.begin(); iter != data_.end(); ++iter) {
+            index_.emplace(iter->first, iter);
+        }
+    }
+
+public:
+    // --- constructors ---
+    OrderedMap() = default;
+
+    // Inserts the listed pairs in order; for a repeated key the first occurrence is kept.
+    OrderedMap(std::initializer_list<value_type> init) {
+        for (const auto& kv : init)
+            insert(kv);
+    }
+
+    // Copying duplicates the list and then rebuilds the index. A member-wise
+    // copy would leave index_ holding iterators into the source's list, so
+    // lookups on the copy would read/write the source's entries (and dangle
+    // once the source is destroyed).
+    OrderedMap(const OrderedMap& other)
+        : data_(other.data_) {
+        rebuild_index();
+    }
+
+    // Moving is member-wise: moving a std::list keeps iterators to its
+    // elements valid, so the moved index_ stays consistent with the moved data_.
+    OrderedMap(OrderedMap&&) noexcept = default;
+
+    OrderedMap& operator=(const OrderedMap& other) {
+        if (this != &other) {
+            OrderedMap copy(other);  // copy first, then swap: *this is untouched if copying throws
+            swap(copy);
+        }
+        return *this;
+    }
+
+    OrderedMap& operator=(OrderedMap&&) noexcept = default;
+
+    // --- element access ---
+    // Returns the value mapped to key; throws std::out_of_range if the key is absent.
+    T& at(const Key& key) {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            throw std::out_of_range("OrderedMap::at: key not found");
+        return it->second->second;
+    }
+
+    const T& at(const Key& key) const {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            throw std::out_of_range("OrderedMap::at: key not found");
+        return it->second->second;
+    }
+
+    // Returns the value mapped to key, appending a value-initialized entry first if the key is absent.
+    T& operator[](const Key& key) {
+        auto it = index_.find(key);
+        if (it == index_.end()) {
+            data_.emplace_back(key, T{});
+            return index_last_entry()->second;
+        }
+        return it->second->second;
+    }
+
+    // --- iterators ---
+    iterator begin() noexcept { return data_.begin(); }
+    const_iterator begin() const noexcept { return data_.begin(); }
+    const_iterator cbegin() const noexcept { return data_.cbegin(); }
+
+    iterator end() noexcept { return data_.end(); }
+    const_iterator end() const noexcept { return data_.end(); }
+    const_iterator cend() const noexcept { return data_.cend(); }
+
+    // --- capacity ---
+    bool empty() const noexcept { return data_.empty(); }
+    size_type size() const noexcept { return data_.size(); }
+
+    // --- modifiers ---
+    void clear() noexcept {
+        data_.clear();
+        index_.clear();
+    }
+
+    // Appends value if its key is new. Returns the entry for that key and whether an insertion happened.
+    std::pair<iterator, bool> insert(const value_type& value) {
+        auto it = index_.find(value.first);
+        if (it != index_.end()) {
+            return {it->second, false};
+        }
+        data_.push_back(value);
+        return {index_last_entry(), true};
+    }
+
+    std::pair<iterator, bool> insert(value_type&& value) {
+        auto it = index_.find(value.first);
+        if (it != index_.end()) {
+            return {it->second, false};
+        }
+        data_.push_back(std::move(value));
+        return {index_last_entry(), true};
+    }
+
+    // Removes the entry for key, if any.
+    void erase(const Key& key) {
+        auto it = index_.find(key);
+        if (it != index_.end()) {
+            data_.erase(it->second);
+            index_.erase(it);
+        }
+    }
+
+    // Removes the entry at pos (a dereferenceable iterator of this map) and returns the iterator following it.
+    iterator erase(iterator pos) {
+        index_.erase(pos->first);
+        return data_.erase(pos);
+    }
+
+    // --- lookup ---
+    size_type count(const Key& key) const {
+        return index_.count(key);
+    }
+
+    // Returns an iterator to the entry for key, or end() if the key is absent.
+    iterator find(const Key& key) {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            return data_.end();
+        return it->second;
+    }
+
+    const_iterator find(const Key& key) const {
+        auto it = index_.find(key);
+        if (it == index_.end())
+            return data_.end();
+        return it->second;
+    }
+
+    bool contains(const Key& key) const {
+        return index_.find(key) != index_.end();
+    }
+
+    // --- comparison ---
+    // Equal when both maps hold equal entries in the same order.
+    bool operator==(const OrderedMap& other) const {
+        return data_ == other.data_;
+    }
+
+    bool operator!=(const OrderedMap& other) const {
+        return !(*this == other);
+    }
+
+    // Builds a value_type from args and appends it if its key is new (the pair is built even when the key exists).
+    template <typename... Args>
+    std::pair<iterator, bool> emplace(Args&&... args) {
+        value_type value(std::forward<Args>(args)...);
+        auto it = index_.find(value.first);
+        if (it != index_.end()) {
+            return {it->second, false};
+        }
+        data_.push_back(std::move(value));
+        return {index_last_entry(), true};
+    }
+
+    // std::list::swap keeps iterators valid, so swapping both members keeps each index consistent.
+    void swap(OrderedMap& other) noexcept {
+        data_.swap(other.data_);
+        index_.swap(other.index_);
+    }
+};
+
+#endif  // __ORDERED_MAP_HPP__
--- a/otherarch/sdcpp/pmid.hpp
+++ b/otherarch/sdcpp/pmid.hpp
@ -21,7 +21,7 @@ public:
        blocks["layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(in_dim));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]

        auto fc1        = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
@ -33,11 +33,11 @@ public:
        x = layer_norm->forward(ctx, x);
        // x = ggml_add(ctx, ggml_mul_mat(ctx, fc1_w, x),  fc1_b);
        x = fc1->forward(ctx, x);
-        x = ggml_gelu_inplace(ctx, x);
+        x = ggml_gelu_inplace(ctx->ggml_ctx, x);
        x = fc2->forward(ctx, x);
        // x = ggml_add(ctx, ggml_mul_mat(ctx, fc2_w, x),  fc2_b);
        if (use_residue)
-            x = ggml_add(ctx, x, r);
+            x = ggml_add(ctx->ggml_ctx, x, r);
        return x;
    }
 };
@ -54,7 +54,7 @@ public:
        blocks["1"]   = std::shared_ptr<GGMLBlock>(new Mlp(dim, inner_dim, dim, false));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x) {
        auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]);
        auto ff   = std::dynamic_pointer_cast<Mlp>(blocks["1"]);
@ -100,7 +100,7 @@ public:
                ggml_cont(ctx, tli)};
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* latents) {
        // x (torch.Tensor): image features
@ -118,33 +118,33 @@ public:
        auto to_q  = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
        auto q     = to_q->forward(ctx, latents);

-        auto kv_input = ggml_concat(ctx, x, latents, 1);
+        auto kv_input = ggml_concat(ctx->ggml_ctx, x, latents, 1);
        auto to_kv    = std::dynamic_pointer_cast<Linear>(blocks["to_kv"]);
        auto kv       = to_kv->forward(ctx, kv_input);
-        auto k        = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0);
-        auto v        = ggml_view_4d(ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2));
-        k             = ggml_cont(ctx, k);
-        v             = ggml_cont(ctx, v);
-        q             = reshape_tensor(ctx, q, heads);
-        k             = reshape_tensor(ctx, k, heads);
-        v             = reshape_tensor(ctx, v, heads);
+        auto k        = ggml_view_4d(ctx->ggml_ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, 0);
+        auto v        = ggml_view_4d(ctx->ggml_ctx, kv, kv->ne[0] / 2, kv->ne[1], kv->ne[2], kv->ne[3], kv->nb[1] / 2, kv->nb[2] / 2, kv->nb[3] / 2, kv->nb[0] * (kv->ne[0] / 2));
+        k             = ggml_cont(ctx->ggml_ctx, k);
+        v             = ggml_cont(ctx->ggml_ctx, v);
+        q             = reshape_tensor(ctx->ggml_ctx, q, heads);
+        k             = reshape_tensor(ctx->ggml_ctx, k, heads);
+        v             = reshape_tensor(ctx->ggml_ctx, v, heads);
        scale         = 1.f / sqrt(sqrt((float)dim_head));
-        k             = ggml_scale_inplace(ctx, k, scale);
-        q             = ggml_scale_inplace(ctx, q, scale);
+        k             = ggml_scale_inplace(ctx->ggml_ctx, k, scale);
+        q             = ggml_scale_inplace(ctx->ggml_ctx, q, scale);
        // auto weight = ggml_mul_mat(ctx, q, k);
-        auto weight = ggml_mul_mat(ctx, k, q);  // NOTE order of mul is opposite to pytorch
+        auto weight = ggml_mul_mat(ctx->ggml_ctx, k, q);  // NOTE order of mul is opposite to pytorch

        // GGML's softmax() is equivalent to pytorch's softmax(x, dim=-1)
        // in this case, dimension along which Softmax will be computed is the last dim
        // in torch and the first dim in GGML, consistent with the convention that pytorch's
        // last dimension (varying most rapidly) corresponds to GGML's first (varying most rapidly).
        // weight = ggml_soft_max(ctx, weight);
-        weight = ggml_soft_max_inplace(ctx, weight);
-        v      = ggml_cont(ctx, ggml_transpose(ctx, v));
+        weight = ggml_soft_max_inplace(ctx->ggml_ctx, weight);
+        v      = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, v));
        // auto out = ggml_mul_mat(ctx, weight, v);
-        auto out    = ggml_mul_mat(ctx, v, weight);  // NOTE order of mul is opposite to pytorch
-        out         = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));
-        out         = ggml_reshape_3d(ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
+        auto out    = ggml_mul_mat(ctx->ggml_ctx, v, weight);  // NOTE order of mul is opposite to pytorch
+        out         = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));
+        out         = ggml_reshape_3d(ctx->ggml_ctx, out, ne[0], ne[1], ggml_nelements(out) / (ne[0] * ne[1]));
        auto to_out = std::dynamic_pointer_cast<Linear>(blocks["to_out"]);
        out         = to_out->forward(ctx, out);
        return out;
@ -176,7 +176,7 @@ public:
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* latents,
                                struct ggml_tensor* x) {
        // x: [N, channels, h, w]
@ -191,9 +191,9 @@ public:
            name             = "layers." + std::to_string(i) + ".1";
            auto ff          = std::dynamic_pointer_cast<PMFeedForward>(blocks[name]);
            auto t           = attn->forward(ctx, x, latents);
-            latents          = ggml_add(ctx, t, latents);
+            latents          = ggml_add(ctx->ggml_ctx, t, latents);
            t                = ff->forward(ctx, latents);
-            latents          = ggml_add(ctx, t, latents);
+            latents          = ggml_add(ctx->ggml_ctx, t, latents);
        }
        latents = proj_out->forward(ctx, latents);
        latents = norm_out->forward(ctx, latents);
@ -225,7 +225,7 @@ public:
            4));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* last_hidden_state) {
        // x: [N, channels, h, w]
@ -235,11 +235,11 @@ public:

        x                       = token_proj->forward(ctx, x);
        int64_t nel             = ggml_nelements(x);
-        x                       = ggml_reshape_3d(ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
+        x                       = ggml_reshape_3d(ctx->ggml_ctx, x, cross_attention_dim, num_tokens, nel / (cross_attention_dim * num_tokens));
        x                       = token_norm->forward(ctx, x);
        struct ggml_tensor* out = perceiver_resampler->forward(ctx, x, last_hidden_state);
        if (use_residul)
-            out = ggml_add(ctx, x, out);
+            out = ggml_add(ctx->ggml_ctx, x, out);
        return out;
    }
 };
@ -256,24 +256,24 @@ public:
        blocks["layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(embed_dim));
    }

-    struct ggml_tensor* fuse_fn(struct ggml_context* ctx,
+    struct ggml_tensor* fuse_fn(GGMLRunnerContext* ctx,
                                struct ggml_tensor* prompt_embeds,
                                struct ggml_tensor* id_embeds) {
        auto mlp1       = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp1"]);
        auto mlp2       = std::dynamic_pointer_cast<FuseBlock>(blocks["mlp2"]);
        auto layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm"]);

-        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds, id_embeds, 0);
+        auto stacked_id_embeds = ggml_concat(ctx->ggml_ctx, prompt_embeds, id_embeds, 0);

        stacked_id_embeds = mlp1->forward(ctx, stacked_id_embeds);
-        stacked_id_embeds = ggml_add(ctx, stacked_id_embeds, prompt_embeds);
+        stacked_id_embeds = ggml_add(ctx->ggml_ctx, stacked_id_embeds, prompt_embeds);
        stacked_id_embeds = mlp2->forward(ctx, stacked_id_embeds);
        stacked_id_embeds = layer_norm->forward(ctx, stacked_id_embeds);

        return stacked_id_embeds;
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* prompt_embeds,
                                struct ggml_tensor* id_embeds,
                                struct ggml_tensor* class_tokens_mask,
@ -286,25 +286,25 @@ public:
        // # slice out the image token embeddings
        ggml_set_name(class_tokens_mask_pos, "class_tokens_mask_pos");
        ggml_set_name(prompt_embeds, "prompt_embeds");
-        struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx, prompt_embeds, class_tokens_mask_pos);
+        struct ggml_tensor* image_token_embeds = ggml_get_rows(ctx->ggml_ctx, prompt_embeds, class_tokens_mask_pos);
        ggml_set_name(image_token_embeds, "image_token_embeds");
-        valid_id_embeds                       = ggml_reshape_2d(ctx, valid_id_embeds, valid_id_embeds->ne[0],
+        valid_id_embeds                       = ggml_reshape_2d(ctx->ggml_ctx, valid_id_embeds, valid_id_embeds->ne[0],
                                                                ggml_nelements(valid_id_embeds) / valid_id_embeds->ne[0]);
        struct ggml_tensor* stacked_id_embeds = fuse_fn(ctx, image_token_embeds, valid_id_embeds);

        if (left && right) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
        } else if (left) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 1);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, left, stacked_id_embeds, 1);
        } else if (right) {
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 1);
+            stacked_id_embeds = ggml_concat(ctx->ggml_ctx, stacked_id_embeds, right, 1);
        }

-        class_tokens_mask                         = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
-        class_tokens_mask                         = ggml_repeat(ctx, class_tokens_mask, prompt_embeds);
-        prompt_embeds                             = ggml_mul(ctx, prompt_embeds, class_tokens_mask);
-        struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx, prompt_embeds, stacked_id_embeds);
+        class_tokens_mask                         = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, class_tokens_mask));
+        class_tokens_mask                         = ggml_repeat(ctx->ggml_ctx, class_tokens_mask, prompt_embeds);
+        prompt_embeds                             = ggml_mul(ctx->ggml_ctx, prompt_embeds, class_tokens_mask);
+        struct ggml_tensor* updated_prompt_embeds = ggml_add(ctx->ggml_ctx, prompt_embeds, stacked_id_embeds);
        ggml_set_name(updated_prompt_embeds, "updated_prompt_embeds");
        return updated_prompt_embeds;
    }
@ -317,8 +317,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
        blocks["fuse_module"]         = std::shared_ptr<GGMLBlock>(new FuseModule(2048));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* id_pixel_values,
                                struct ggml_tensor* prompt_embeds,
                                struct ggml_tensor* class_tokens_mask,
@ -331,15 +330,15 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
        auto visual_projection_2 = std::dynamic_pointer_cast<Linear>(blocks["visual_projection_2"]);
        auto fuse_module         = std::dynamic_pointer_cast<FuseModule>(blocks["fuse_module"]);

-        struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, backend, id_pixel_values);  // [N, hidden_size]
-        struct ggml_tensor* id_embeds        = visual_projection->forward(ctx, shared_id_embeds);     // [N, proj_dim(768)]
-        struct ggml_tensor* id_embeds_2      = visual_projection_2->forward(ctx, shared_id_embeds);   // [N, 1280]
+        struct ggml_tensor* shared_id_embeds = vision_model->forward(ctx, id_pixel_values);          // [N, hidden_size]
+        struct ggml_tensor* id_embeds        = visual_projection->forward(ctx, shared_id_embeds);    // [N, proj_dim(768)]
+        struct ggml_tensor* id_embeds_2      = visual_projection_2->forward(ctx, shared_id_embeds);  // [N, 1280]

-        id_embeds   = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
-        id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
+        id_embeds   = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 2, 0, 1, 3));
+        id_embeds_2 = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds_2, 2, 0, 1, 3));

-        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
-        id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
+        id_embeds = ggml_concat(ctx->ggml_ctx, id_embeds, id_embeds_2, 2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
+        id_embeds = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, id_embeds, 1, 2, 0, 3));

        struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
                                                                         prompt_embeds,
@ -366,8 +365,7 @@ struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionMo
                                                                                        num_tokens));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* id_pixel_values,
                                struct ggml_tensor* prompt_embeds,
                                struct ggml_tensor* class_tokens_mask,
@ -381,7 +379,7 @@ struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionMo
        auto qformer_perceiver = std::dynamic_pointer_cast<QFormerPerceiver>(blocks["qformer_perceiver"]);

        // struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values);          // [N, hidden_size]
-        struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, backend, id_pixel_values, false);  // [N, hidden_size]
+        struct ggml_tensor* last_hidden_state = vision_model->forward(ctx, id_pixel_values, false);  // [N, hidden_size]
        id_embeds                             = qformer_perceiver->forward(ctx, id_embeds, last_hidden_state);

        struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
@ -414,7 +412,7 @@ public:
 public:
    PhotoMakerIDEncoder(ggml_backend_t backend,
                        bool offload_params_to_cpu,
-                        const String2GGMLType& tensor_types,
+                        const String2TensorStorage& tensor_storage_map,
                        const std::string prefix,
                        SDVersion version = VERSION_SDXL,
                        PMVersion pm_v    = PM_VERSION_1,
@ -424,9 +422,9 @@ public:
          pm_version(pm_v),
          style_strength(sty) {
        if (pm_version == PM_VERSION_1) {
-            id_encoder.init(params_ctx, tensor_types, prefix);
+            id_encoder.init(params_ctx, tensor_storage_map, prefix);
        } else if (pm_version == PM_VERSION_2) {
-            id_encoder2.init(params_ctx, tensor_types, prefix);
+            id_encoder2.init(params_ctx, tensor_storage_map, prefix);
        }
    }

@ -458,7 +456,7 @@ public:
        zeros_right.clear();
        zeros_right_16.clear();

-        ggml_context* ctx0 = compute_ctx;
+        auto runner_ctx = get_context();

        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);

@ -466,7 +464,7 @@ public:
        int64_t seq_length  = prompt_embeds->ne[1];
        ggml_type type      = GGML_TYPE_F32;

-        struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(ctx0, type, class_tokens_mask.size());
+        struct ggml_tensor* class_tokens_mask_d = ggml_new_tensor_1d(runner_ctx.ggml_ctx, type, class_tokens_mask.size());

        struct ggml_tensor* id_pixel_values_d = to_backend(id_pixel_values);
        struct ggml_tensor* prompt_embeds_d   = to_backend(prompt_embeds);
@ -488,16 +486,16 @@ public:
        }
        // printf("\n");
        if (ctmpos[0] > 0) {
-            // left = ggml_new_tensor_3d(ctx0, type, hidden_size, 1, ctmpos[0]);
-            left = ggml_new_tensor_3d(ctx0, type, hidden_size, ctmpos[0], 1);
+            // left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, 1, ctmpos[0]);
+            left = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type, hidden_size, ctmpos[0], 1);
        }
        if (ctmpos[ctmpos.size() - 1] < seq_length - 1) {
-            // right = ggml_new_tensor_3d(ctx0, type,
+            // right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
            //                            hidden_size, 1, seq_length - ctmpos[ctmpos.size() - 1] - 1);
-            right = ggml_new_tensor_3d(ctx0, type,
+            right = ggml_new_tensor_3d(runner_ctx.ggml_ctx, type,
                                       hidden_size, seq_length - ctmpos[ctmpos.size() - 1] - 1, 1);
        }
-        struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ctmpos.size());
+        struct ggml_tensor* class_tokens_mask_pos = ggml_new_tensor_1d(runner_ctx.ggml_ctx, GGML_TYPE_I32, ctmpos.size());

        {
            if (type == GGML_TYPE_F16)
@ -530,16 +528,14 @@ public:
        }
        struct ggml_tensor* updated_prompt_embeds = nullptr;
        if (pm_version == PM_VERSION_1)
-            updated_prompt_embeds = id_encoder.forward(ctx0,
-                                                       runtime_backend,
+            updated_prompt_embeds = id_encoder.forward(&runner_ctx,
                                                       id_pixel_values_d,
                                                       prompt_embeds_d,
                                                       class_tokens_mask_d,
                                                       class_tokens_mask_pos,
                                                       left, right);
        else if (pm_version == PM_VERSION_2)
-            updated_prompt_embeds = id_encoder2.forward(ctx0,
-                                                        runtime_backend,
+            updated_prompt_embeds = id_encoder2.forward(&runner_ctx,
                                                        id_pixel_values_d,
                                                        prompt_embeds_d,
                                                        class_tokens_mask_d,
@ -582,7 +578,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
                      const std::string& file_path = "",
                      const std::string& prefix    = "")
        : file_path(file_path), GGMLRunner(backend, offload_params_to_cpu), model_loader(ml) {
-        if (!model_loader->init_from_file(file_path, prefix)) {
+        if (!model_loader->init_from_file_and_convert_name(file_path, prefix)) {
            load_failed = true;
        }
    }
--- a/otherarch/sdcpp/qwen_image.hpp
+++ b/otherarch/sdcpp/qwen_image.hpp
@ -27,18 +27,18 @@ namespace Qwen {
            blocks["linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, out_dim, sample_proj_bias));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* sample,
                                    struct ggml_tensor* condition = nullptr) {
            if (condition != nullptr) {
                auto cond_proj = std::dynamic_pointer_cast<Linear>(blocks["cond_proj"]);
-                sample         = ggml_add(ctx, sample, cond_proj->forward(ctx, condition));
+                sample         = ggml_add(ctx->ggml_ctx, sample, cond_proj->forward(ctx, condition));
            }
            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["linear_1"]);
            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["linear_2"]);

            sample = linear_1->forward(ctx, sample);
-            sample = ggml_silu_inplace(ctx, sample);
+            sample = ggml_silu_inplace(ctx->ggml_ctx, sample);
            sample = linear_2->forward(ctx, sample);
            return sample;
        }
@ -50,13 +50,13 @@ namespace Qwen {
            blocks["timestep_embedder"] = std::shared_ptr<GGMLBlock>(new TimestepEmbedding(256, embedding_dim));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* timesteps) {
            // timesteps: [N,]
            // return: [N, embedding_dim]
            auto timestep_embedder = std::dynamic_pointer_cast<TimestepEmbedding>(blocks["timestep_embedder"]);

-            auto timesteps_proj = ggml_ext_timestep_embedding(ctx, timesteps, 256, 10000, 1.f);
+            auto timesteps_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1.f);
            auto timesteps_emb  = timestep_embedder->forward(ctx, timesteps_proj);
            return timesteps_emb;
        }
@ -65,7 +65,6 @@ namespace Qwen {
    struct QwenImageAttention : public GGMLBlock {
    protected:
        int64_t dim_head;
-        bool flash_attn;

    public:
        QwenImageAttention(int64_t query_dim,
@ -75,9 +74,8 @@ namespace Qwen {
                           int64_t out_context_dim = 0,
                           bool bias               = true,
                           bool out_bias           = true,
-                           float eps               = 1e-6,
-                           bool flash_attn         = false)
-            : dim_head(dim_head), flash_attn(flash_attn) {
+                           float eps               = 1e-6)
+            : dim_head(dim_head) {
            int64_t inner_dim = out_dim > 0 ? out_dim : dim_head * num_heads;
            out_dim           = out_dim > 0 ? out_dim : query_dim;
            out_context_dim   = out_context_dim > 0 ? out_context_dim : query_dim;
@ -105,8 +103,7 @@ namespace Qwen {
            blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
        }

-        std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,
-                                                      ggml_backend_t backend,
+        std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                      struct ggml_tensor* img,
                                                      struct ggml_tensor* txt,
                                                      struct ggml_tensor* pe,
@ -138,49 +135,49 @@ namespace Qwen {

            auto img_q        = to_q->forward(ctx, img);
            int64_t num_heads = img_q->ne[0] / dim_head;
-            img_q             = ggml_reshape_4d(ctx, img_q, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+            img_q             = ggml_reshape_4d(ctx->ggml_ctx, img_q, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
            auto img_k        = to_k->forward(ctx, img);
-            img_k             = ggml_reshape_4d(ctx, img_k, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+            img_k             = ggml_reshape_4d(ctx->ggml_ctx, img_k, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
            auto img_v        = to_v->forward(ctx, img);
-            img_v             = ggml_reshape_4d(ctx, img_v, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]
+            img_v             = ggml_reshape_4d(ctx->ggml_ctx, img_v, dim_head, num_heads, n_img_token, N);  // [N, n_img_token, n_head, d_head]

            img_q = norm_q->forward(ctx, img_q);
            img_k = norm_k->forward(ctx, img_k);

            auto txt_q = add_q_proj->forward(ctx, txt);
-            txt_q      = ggml_reshape_4d(ctx, txt_q, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+            txt_q      = ggml_reshape_4d(ctx->ggml_ctx, txt_q, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
            auto txt_k = add_k_proj->forward(ctx, txt);
-            txt_k      = ggml_reshape_4d(ctx, txt_k, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+            txt_k      = ggml_reshape_4d(ctx->ggml_ctx, txt_k, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
            auto txt_v = add_v_proj->forward(ctx, txt);
-            txt_v      = ggml_reshape_4d(ctx, txt_v, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]
+            txt_v      = ggml_reshape_4d(ctx->ggml_ctx, txt_v, dim_head, num_heads, n_txt_token, N);  // [N, n_txt_token, n_head, d_head]

            txt_q = norm_added_q->forward(ctx, txt_q);
            txt_k = norm_added_k->forward(ctx, txt_k);

-            auto q = ggml_concat(ctx, txt_q, img_q, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
-            auto k = ggml_concat(ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
-            auto v = ggml_concat(ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto q = ggml_concat(ctx->ggml_ctx, txt_q, img_q, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto k = ggml_concat(ctx->ggml_ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
+            auto v = ggml_concat(ctx->ggml_ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]

-            auto attn         = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, (1.0f / 128.f));  // [N, n_txt_token + n_img_token, n_head*d_head]
-            attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));                           // [n_txt_token + n_img_token, N, hidden_size]
-            auto txt_attn_out = ggml_view_3d(ctx,
+            auto attn         = Rope::attention(ctx, q, k, v, pe, mask, (1.0f / 128.f));                  // [N, n_txt_token + n_img_token, n_head*d_head]
+            attn              = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, attn, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
+            auto txt_attn_out = ggml_view_3d(ctx->ggml_ctx,
                                             attn,
                                             attn->ne[0],
                                             attn->ne[1],
                                             txt->ne[1],
                                             attn->nb[1],
                                             attn->nb[2],
-                                             0);                                              // [n_txt_token, N, hidden_size]
-            txt_attn_out      = ggml_cont(ctx, ggml_permute(ctx, txt_attn_out, 0, 2, 1, 3));  // [N, n_txt_token, hidden_size]
-            auto img_attn_out = ggml_view_3d(ctx,
+                                             0);                                                                  // [n_txt_token, N, hidden_size]
+            txt_attn_out      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, txt_attn_out, 0, 2, 1, 3));  // [N, n_txt_token, hidden_size]
+            auto img_attn_out = ggml_view_3d(ctx->ggml_ctx,
                                             attn,
                                             attn->ne[0],
                                             attn->ne[1],
                                             img->ne[1],
                                             attn->nb[1],
                                             attn->nb[2],
-                                             attn->nb[2] * txt->ne[1]);                       // [n_img_token, N, hidden_size]
-            img_attn_out      = ggml_cont(ctx, ggml_permute(ctx, img_attn_out, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]
+                                             attn->nb[2] * txt->ne[1]);                                           // [n_img_token, N, hidden_size]
+            img_attn_out      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, img_attn_out, 0, 2, 1, 3));  // [N, n_img_token, hidden_size]

            img_attn_out = to_out_0->forward(ctx, img_attn_out);
            txt_attn_out = to_add_out->forward(ctx, txt_attn_out);
@ -194,8 +191,7 @@ namespace Qwen {
        QwenImageTransformerBlock(int64_t dim,
                                  int64_t num_attention_heads,
                                  int64_t attention_head_dim,
-                                  float eps       = 1e-6,
-                                  bool flash_attn = false) {
+                                  float eps = 1e-6) {
            // img_mod.0 is nn.SiLU()
            blocks["img_mod.1"] = std::shared_ptr<GGMLBlock>(new Linear(dim, 6 * dim, true));

@ -217,12 +213,10 @@ namespace Qwen {
                                                                               0,     // out_context-dim
                                                                               true,  // bias
                                                                               true,  // out_bias
-                                                                               eps,
-                                                                               flash_attn));
+                                                                               eps));
        }

-        virtual std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,
-                                                              ggml_backend_t backend,
+        virtual std::pair<ggml_tensor*, ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                              struct ggml_tensor* img,
                                                              struct ggml_tensor* txt,
                                                              struct ggml_tensor* t_emb,
@ -244,40 +238,40 @@ namespace Qwen {

            auto attn = std::dynamic_pointer_cast<QwenImageAttention>(blocks["attn"]);

-            auto img_mod_params    = ggml_silu(ctx, t_emb);
+            auto img_mod_params    = ggml_silu(ctx->ggml_ctx, t_emb);
            img_mod_params         = img_mod_1->forward(ctx, img_mod_params);
-            auto img_mod_param_vec = ggml_ext_chunk(ctx, img_mod_params, 6, 0);
+            auto img_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, img_mod_params, 6, 0);

-            auto txt_mod_params    = ggml_silu(ctx, t_emb);
+            auto txt_mod_params    = ggml_silu(ctx->ggml_ctx, t_emb);
            txt_mod_params         = txt_mod_1->forward(ctx, txt_mod_params);
-            auto txt_mod_param_vec = ggml_ext_chunk(ctx, txt_mod_params, 6, 0);
+            auto txt_mod_param_vec = ggml_ext_chunk(ctx->ggml_ctx, txt_mod_params, 6, 0);

            auto img_normed    = img_norm1->forward(ctx, img);
-            auto img_modulated = Flux::modulate(ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]);
+            auto img_modulated = Flux::modulate(ctx->ggml_ctx, img_normed, img_mod_param_vec[0], img_mod_param_vec[1]);
            auto img_gate1     = img_mod_param_vec[2];

            auto txt_normed    = txt_norm1->forward(ctx, txt);
-            auto txt_modulated = Flux::modulate(ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]);
+            auto txt_modulated = Flux::modulate(ctx->ggml_ctx, txt_normed, txt_mod_param_vec[0], txt_mod_param_vec[1]);
            auto txt_gate1     = txt_mod_param_vec[2];

-            auto [img_attn_output, txt_attn_output] = attn->forward(ctx, backend, img_modulated, txt_modulated, pe);
+            auto [img_attn_output, txt_attn_output] = attn->forward(ctx, img_modulated, txt_modulated, pe);

-            img = ggml_add(ctx, img, ggml_mul(ctx, img_attn_output, img_gate1));
-            txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_attn_output, txt_gate1));
+            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_attn_output, img_gate1));
+            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_attn_output, txt_gate1));

            auto img_normed2    = img_norm2->forward(ctx, img);
-            auto img_modulated2 = Flux::modulate(ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]);
+            auto img_modulated2 = Flux::modulate(ctx->ggml_ctx, img_normed2, img_mod_param_vec[3], img_mod_param_vec[4]);
            auto img_gate2      = img_mod_param_vec[5];

            auto txt_normed2    = txt_norm2->forward(ctx, txt);
-            auto txt_modulated2 = Flux::modulate(ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]);
+            auto txt_modulated2 = Flux::modulate(ctx->ggml_ctx, txt_normed2, txt_mod_param_vec[3], txt_mod_param_vec[4]);
            auto txt_gate2      = txt_mod_param_vec[5];

            auto img_mlp_out = img_mlp->forward(ctx, img_modulated2);
            auto txt_mlp_out = txt_mlp->forward(ctx, txt_modulated2);

-            img = ggml_add(ctx, img, ggml_mul(ctx, img_mlp_out, img_gate2));
-            txt = ggml_add(ctx, txt, ggml_mul(ctx, txt_mlp_out, txt_gate2));
+            img = ggml_add(ctx->ggml_ctx, img, ggml_mul(ctx->ggml_ctx, img_mlp_out, img_gate2));
+            txt = ggml_add(ctx->ggml_ctx, txt, ggml_mul(ctx->ggml_ctx, txt_mlp_out, txt_gate2));

            return {img, txt};
        }
@ -294,7 +288,7 @@ namespace Qwen {
            blocks["linear"] = std::shared_ptr<GGMLBlock>(new Linear(conditioning_embedding_dim, embedding_dim * 2, bias));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* c) {
            // x: [N, n_token, hidden_size]
@ -304,13 +298,13 @@ namespace Qwen {
            auto norm   = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
            auto linear = std::dynamic_pointer_cast<Linear>(blocks["linear"]);

-            auto emb   = linear->forward(ctx, ggml_silu(ctx, c));
-            auto mods  = ggml_ext_chunk(ctx, emb, 2, 0);
+            auto emb   = linear->forward(ctx, ggml_silu(ctx->ggml_ctx, c));
+            auto mods  = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);
            auto scale = mods[0];
            auto shift = mods[1];

            x = norm->forward(ctx, x);
-            x = Flux::modulate(ctx, x, shift, scale);
+            x = Flux::modulate(ctx->ggml_ctx, x, shift, scale);

            return x;
        }
@ -327,7 +321,6 @@ namespace Qwen {
        float theta                 = 10000;
        std::vector<int> axes_dim   = {16, 56, 56};
        int64_t axes_dim_sum        = 128;
-        bool flash_attn             = false;
    };

    class QwenImageModel : public GGMLBlock {
@ -349,8 +342,7 @@ namespace Qwen {
                auto block                                        = std::shared_ptr<GGMLBlock>(new QwenImageTransformerBlock(inner_dim,
                                                                                                                             params.num_attention_heads,
                                                                                                                             params.attention_head_dim,
-                                                                                                                             1e-6f,
-                                                                                                                             params.flash_attn));
+                                                                                                                             1e-6f));
                blocks["transformer_blocks." + std::to_string(i)] = block;
            }

@ -421,8 +413,7 @@ namespace Qwen {
            return x;
        }

-        struct ggml_tensor* forward_orig(struct ggml_context* ctx,
-                                         ggml_backend_t backend,
+        struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* timestep,
                                         struct ggml_tensor* context,
@ -442,7 +433,7 @@ namespace Qwen {
            for (int i = 0; i < params.num_layers; i++) {
                auto block = std::dynamic_pointer_cast<QwenImageTransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);

-                auto result = block->forward(ctx, backend, img, txt, t_emb, pe);
+                auto result = block->forward(ctx, img, txt, t_emb, pe);
                img         = result.first;
                txt         = result.second;
            }
@ -453,8 +444,7 @@ namespace Qwen {
            return img;
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* timestep,
                                    struct ggml_tensor* context,
@ -472,32 +462,32 @@ namespace Qwen {
            int64_t C = x->ne[2];
            int64_t N = x->ne[3];

-            auto img            = process_img(ctx, x);
+            auto img            = process_img(ctx->ggml_ctx, x);
            uint64_t img_tokens = img->ne[1];

            if (ref_latents.size() > 0) {
                for (ggml_tensor* ref : ref_latents) {
-                    ref = process_img(ctx, ref);
-                    img = ggml_concat(ctx, img, ref, 1);
+                    ref = process_img(ctx->ggml_ctx, ref);
+                    img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                }
            }

            int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size);
            int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);

-            auto out = forward_orig(ctx, backend, img, timestep, context, pe);  // [N, h_len*w_len, ph*pw*C]
+            auto out = forward_orig(ctx, img, timestep, context, pe);  // [N, h_len*w_len, ph*pw*C]

            if (out->ne[1] > img_tokens) {
-                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));  // [num_tokens, N, C * patch_size * patch_size]
-                out = ggml_view_3d(ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
-                out = ggml_cont(ctx, ggml_permute(ctx, out, 0, 2, 1, 3));  // [N, h*w, C * patch_size * patch_size]
+                out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));  // [num_tokens, N, C * patch_size * patch_size]
+                out = ggml_view_3d(ctx->ggml_ctx, out, out->ne[0], out->ne[1], img_tokens, out->nb[1], out->nb[2], 0);
+                out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));  // [N, h*w, C * patch_size * patch_size]
            }

-            out = unpatchify(ctx, out, h_len, w_len);  // [N, C, H + pad_h, W + pad_w]
+            out = unpatchify(ctx->ggml_ctx, out, h_len, w_len);  // [N, C, H + pad_h, W + pad_w]

            // slice
-            out = ggml_ext_slice(ctx, out, 1, 0, H);  // [N, C, H, W + pad_w]
-            out = ggml_ext_slice(ctx, out, 0, 0, W);  // [N, C, H, W]
+            out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H);  // [N, C, H, W + pad_w]
+            out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W);  // [N, C, H, W]

            return out;
        }
@ -512,14 +502,12 @@ namespace Qwen {

        QwenImageRunner(ggml_backend_t backend,
                        bool offload_params_to_cpu,
-                        const String2GGMLType& tensor_types = {},
-                        const std::string prefix            = "",
-                        SDVersion version                   = VERSION_QWEN_IMAGE,
-                        bool flash_attn                     = false)
+                        const String2TensorStorage& tensor_storage_map = {},
+                        const std::string prefix                       = "",
+                        SDVersion version                              = VERSION_QWEN_IMAGE)
            : GGMLRunner(backend, offload_params_to_cpu) {
-            qwen_image_params.flash_attn = flash_attn;
            qwen_image_params.num_layers = 0;
-            for (auto pair : tensor_types) {
+            for (auto pair : tensor_storage_map) {
                std::string tensor_name = pair.first;
                if (tensor_name.find(prefix) == std::string::npos)
                    continue;
@ -538,7 +526,7 @@ namespace Qwen {
            }
            LOG_INFO("qwen_image_params.num_layers: %ld", qwen_image_params.num_layers);
            qwen_image = QwenImageModel(qwen_image_params);
-            qwen_image.init(params_ctx, tensor_types, prefix);
+            qwen_image.init(params_ctx, tensor_storage_map, prefix);
        }

        std::string get_desc() override {
@ -582,8 +570,9 @@ namespace Qwen {
            // pe->data = nullptr;
            set_backend_tensor_data(pe, pe_vec.data());

-            struct ggml_tensor* out = qwen_image.forward(compute_ctx,
-                                                         runtime_backend,
+            auto runner_ctx = get_context();
+
+            struct ggml_tensor* out = qwen_image.forward(&runner_ctx,
                                                         x,
                                                         timesteps,
                                                         context,
@ -655,25 +644,23 @@ namespace Qwen {
            ggml_type model_data_type = GGML_TYPE_Q8_0;

            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path, "model.diffusion_model.")) {
+            if (!model_loader.init_from_file_and_convert_name(file_path, "model.diffusion_model.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }

-            auto tensor_types = model_loader.tensor_storages_types;
-            for (auto& item : tensor_types) {
-                // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
-                if (ends_with(item.first, "weight")) {
-                    item.second = model_data_type;
+            auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+            for (auto& [name, tensor_storage] : tensor_storage_map) {
+                if (ends_with(name, "weight")) {
+                    tensor_storage.expected_type = model_data_type;
                }
            }

            std::shared_ptr<QwenImageRunner> qwen_image = std::make_shared<QwenImageRunner>(backend,
                                                                                            false,
-                                                                                            tensor_types,
+                                                                                            tensor_storage_map,
                                                                                            "model.diffusion_model",
-                                                                                            VERSION_QWEN_IMAGE,
-                                                                                            true);
+                                                                                            VERSION_QWEN_IMAGE);

            qwen_image->alloc_params_buffer();
            std::map<std::string, ggml_tensor*> tensors;
--- a/otherarch/sdcpp/qwenvl.hpp
+++ b/otherarch/sdcpp/qwenvl.hpp
@ -349,15 +349,15 @@ namespace Qwen {
            blocks["down_proj"] = std::shared_ptr<GGMLBlock>(new Linear(intermediate_size, hidden_size, bias));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
            // x: [N, n_token, hidden_size]
            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]);
            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]);
            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);

            auto h = gate_proj->forward(ctx, x);
-            h      = ggml_silu_inplace(ctx, h);
-            h      = ggml_mul_inplace(ctx, h, up_proj->forward(ctx, x));
+            h      = ggml_silu_inplace(ctx->ggml_ctx, h);
+            h      = ggml_mul_inplace(ctx->ggml_ctx, h, up_proj->forward(ctx, x));
            h      = down_proj->forward(ctx, h);
            return h;
        }
@ -409,10 +409,10 @@ namespace Qwen {
            }
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
            // x: [N*grid_t*grid_h*grid_w, in_channels, temporal_patch_size*patch_size*patch_size]
            // return: [N*grid_t*grid_h*grid_w, embed_dim]
-            x = ggml_reshape_4d(ctx,
+            x = ggml_reshape_4d(ctx->ggml_ctx,
                                x,
                                patch_size,
                                patch_size,
@ -423,22 +423,22 @@ namespace Qwen {
                auto proj_0 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.0"]);
                auto proj_1 = std::dynamic_pointer_cast<Conv2d>(blocks["proj.1"]);

-                auto x0 = ggml_ext_slice(ctx, x, 2, 0, 1);
-                x0      = ggml_reshape_4d(ctx, x0, x0->ne[0], x0->ne[1], in_channels, x0->ne[3] / in_channels);
+                auto x0 = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1);
+                x0      = ggml_reshape_4d(ctx->ggml_ctx, x0, x0->ne[0], x0->ne[1], in_channels, x0->ne[3] / in_channels);
                x0      = proj_0->forward(ctx, x0);

-                auto x1 = ggml_ext_slice(ctx, x, 2, 1, 2);
-                x1      = ggml_reshape_4d(ctx, x1, x1->ne[0], x1->ne[1], in_channels, x1->ne[3] / in_channels);
+                auto x1 = ggml_ext_slice(ctx->ggml_ctx, x, 2, 1, 2);
+                x1      = ggml_reshape_4d(ctx->ggml_ctx, x1, x1->ne[0], x1->ne[1], in_channels, x1->ne[3] / in_channels);
                x1      = proj_1->forward(ctx, x1);

-                x = ggml_add(ctx, x0, x1);
+                x = ggml_add(ctx->ggml_ctx, x0, x1);
            } else {
                auto proj = std::dynamic_pointer_cast<Conv3d>(blocks["proj"]);

                x = proj->forward(ctx, x);
            }

-            x = ggml_reshape_2d(ctx, x, embed_dim, ggml_nelements(x) / embed_dim);
+            x = ggml_reshape_2d(ctx->ggml_ctx, x, embed_dim, ggml_nelements(x) / embed_dim);
            return x;
        }
    };
@ -458,15 +458,15 @@ namespace Qwen {
            blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, dim));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
            auto ln_q  = std::dynamic_pointer_cast<RMSNorm>(blocks["ln_q"]);
            auto mlp_0 = std::dynamic_pointer_cast<Linear>(blocks["mlp.0"]);
            auto mlp_2 = std::dynamic_pointer_cast<Linear>(blocks["mlp.2"]);

            x = ln_q->forward(ctx, x);
-            x = ggml_reshape_2d(ctx, x, hidden_size, ggml_nelements(x) / hidden_size);
+            x = ggml_reshape_2d(ctx->ggml_ctx, x, hidden_size, ggml_nelements(x) / hidden_size);
            x = mlp_0->forward(ctx, x);
-            x = ggml_gelu(ctx, x);
+            x = ggml_gelu(ctx->ggml_ctx, x);
            x = mlp_2->forward(ctx, x);
            return x;
        }
@ -495,8 +495,7 @@ namespace Qwen {
            blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* pe,
                                    struct ggml_tensor* mask = nullptr) {
@ -519,14 +518,14 @@ namespace Qwen {
            } else {
                auto qkv_proj = std::dynamic_pointer_cast<Linear>(blocks["qkv"]);
                auto qkv      = qkv_proj->forward(ctx, x);
-                qkv_vec       = split_qkv(ctx, qkv);
+                qkv_vec       = split_qkv(ctx->ggml_ctx, qkv);
            }

-            auto q = ggml_reshape_4d(ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);  // [N, n_token, n_head, d_head]
-            auto k = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
-            auto v = ggml_reshape_4d(ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);  // [N, n_token, n_head, d_head]
+            auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);  // [N, n_token, n_head, d_head]
+            auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
+            auto v = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);  // [N, n_token, n_head, d_head]

-            x = Rope::attention(ctx, backend, q, k, v, pe, mask, false, 1.f, false);  // [N, n_token, hidden_size]
+            x = Rope::attention(ctx, q, k, v, pe, mask, 1.f, false);  // [N, n_token, hidden_size]

            x = proj->forward(ctx, x);  // [N, n_token, hidden_size]
            return x;
@ -546,8 +545,7 @@ namespace Qwen {
            blocks["norm2"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* pe,
                                    struct ggml_tensor* mask = nullptr) {
@ -559,13 +557,13 @@ namespace Qwen {

            auto residual = x;
            x             = norm1->forward(ctx, x);
-            x             = attn->forward(ctx, backend, x, pe, mask);
-            x             = ggml_add_inplace(ctx, x, residual);
+            x             = attn->forward(ctx, x, pe, mask);
+            x             = ggml_add_inplace(ctx->ggml_ctx, x, residual);

            residual = x;
            x        = norm2->forward(ctx, x);
            x        = mlp->forward(ctx, x);
-            x        = ggml_add_inplace(ctx, x, residual);
+            x        = ggml_add_inplace(ctx->ggml_ctx, x, residual);

            return x;
        }
@ -607,8 +605,7 @@ namespace Qwen {
            blocks["merger"] = std::shared_ptr<GGMLBlock>(new Qwen2_5_VLPatchMerger(out_hidden_size, hidden_size, spatial_merge_size));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* pixel_values,
                                    struct ggml_tensor* pe,
                                    struct ggml_tensor* window_index,
@ -623,9 +620,9 @@ namespace Qwen {

            auto x = patch_embed->forward(ctx, pixel_values);

-            x = ggml_reshape_4d(ctx, x, x->ne[0] * spatial_merge_size * spatial_merge_size, x->ne[1] / spatial_merge_size / spatial_merge_size, x->ne[2], x->ne[3]);
-            x = ggml_get_rows(ctx, x, window_index);
-            x = ggml_reshape_4d(ctx, x, x->ne[0] / spatial_merge_size / spatial_merge_size, x->ne[1] * spatial_merge_size * spatial_merge_size, x->ne[2], x->ne[3]);
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] * spatial_merge_size * spatial_merge_size, x->ne[1] / spatial_merge_size / spatial_merge_size, x->ne[2], x->ne[3]);
+            x = ggml_get_rows(ctx->ggml_ctx, x, window_index);
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] / spatial_merge_size / spatial_merge_size, x->ne[1] * spatial_merge_size * spatial_merge_size, x->ne[2], x->ne[3]);

            for (int i = 0; i < num_layers; i++) {
                auto block = std::dynamic_pointer_cast<Qwen2_5_VLVisionBlock>(blocks["blocks." + std::to_string(i)]);
@ -634,12 +631,12 @@ namespace Qwen {
                if (fullatt_block_indexes.find(i) != fullatt_block_indexes.end()) {
                    mask = nullptr;
                }
-                x = block->forward(ctx, backend, x, pe, mask);
+                x = block->forward(ctx, x, pe, mask);
            }

            x = merger->forward(ctx, x);

-            x = ggml_get_rows(ctx, x, window_inverse_index);
+            x = ggml_get_rows(ctx->ggml_ctx, x, window_inverse_index);

            return x;
        }
@ -664,8 +661,7 @@ namespace Qwen {
            blocks["o_proj"] = std::shared_ptr<GGMLBlock>(new Linear(num_heads * head_dim, hidden_size, false));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* input_pos) {
            // x: [N, n_token, hidden_size]
@ -680,21 +676,21 @@ namespace Qwen {
            auto k = k_proj->forward(ctx, x);  // [N, n_token, num_kv_heads*head_dim]
            auto v = v_proj->forward(ctx, x);  // [N, n_token, num_kv_heads*head_dim]

-            q = ggml_reshape_4d(ctx, q, head_dim, num_heads, n_token, N);     // [N, n_token, num_heads, head_dim]
-            k = ggml_reshape_4d(ctx, k, head_dim, num_kv_heads, n_token, N);  // [N, n_token, num_kv_heads, head_dim]
-            v = ggml_reshape_4d(ctx, v, head_dim, num_kv_heads, n_token, N);  // [N, n_token, num_kv_heads, head_dim]
+            q = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, n_token, N);     // [N, n_token, num_heads, head_dim]
+            k = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_kv_heads, n_token, N);  // [N, n_token, num_kv_heads, head_dim]
+            v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_kv_heads, n_token, N);  // [N, n_token, num_kv_heads, head_dim]

            int sections[4] = {16, 24, 24, 0};
-            q               = ggml_rope_multi(ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
-            k               = ggml_rope_multi(ctx, k, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
+            q               = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);
+            k               = ggml_rope_multi(ctx->ggml_ctx, k, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f);

-            q = ggml_cont(ctx, ggml_ext_torch_permute(ctx, q, 0, 2, 1, 3));        // [N, num_heads, n_token, head_dim]
-            q = ggml_reshape_3d(ctx, q, q->ne[0], q->ne[1], q->ne[2] * q->ne[3]);  // [N*num_heads, n_token, head_dim]
+            q = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, q, 0, 2, 1, 3));  // [N, num_heads, n_token, head_dim]
+            q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0], q->ne[1], q->ne[2] * q->ne[3]);      // [N*num_heads, n_token, head_dim]

-            k = ggml_cont(ctx, ggml_ext_torch_permute(ctx, k, 0, 2, 1, 3));        // [N, num_kv_heads, n_token, head_dim]
-            k = ggml_reshape_3d(ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);  // [N*num_kv_heads, n_token, head_dim]
+            k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));  // [N, num_kv_heads, n_token, head_dim]
+            k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);      // [N*num_kv_heads, n_token, head_dim]

-            x = ggml_ext_attention_ext(ctx, backend, q, k, v, num_heads, nullptr, true, true, false);  // [N, n_token, hidden_size]
+            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, nullptr, true, true, false);  // [N, n_token, hidden_size]

            x = out_proj->forward(ctx, x);  // [N, n_token, hidden_size]
            return x;
@ -714,8 +710,7 @@ namespace Qwen {
            blocks["post_attention_layernorm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* x,
                                    struct ggml_tensor* input_pos) {
            // x: [N, n_token, hidden_size]
@ -726,13 +721,13 @@ namespace Qwen {

            auto residual = x;
            x             = input_layernorm->forward(ctx, x);
-            x             = self_attn->forward(ctx, backend, x, input_pos);
-            x             = ggml_add_inplace(ctx, x, residual);
+            x             = self_attn->forward(ctx, x, input_pos);
+            x             = ggml_add_inplace(ctx->ggml_ctx, x, residual);

            residual = x;
            x        = post_attention_layernorm->forward(ctx, x);
            x        = mlp->forward(ctx, x);
-            x        = ggml_add_inplace(ctx, x, residual);
+            x        = ggml_add_inplace(ctx->ggml_ctx, x, residual);

            return x;
        }
@ -761,8 +756,7 @@ namespace Qwen {
            blocks["norm"] = std::shared_ptr<GGMLBlock>(new RMSNorm(hidden_size, eps));
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* input_ids,
                                    struct ggml_tensor* input_pos,
                                    std::vector<std::pair<int, ggml_tensor*>> image_embeds) {
@ -777,7 +771,7 @@ namespace Qwen {
            if (image_embeds.size() > 0) {
                GGML_ASSERT(x->ne[2] == 1);  // N == 1

-                auto raw_x              = ggml_cast(ctx, x, image_embeds[0].second->type);
+                auto raw_x              = ggml_cast(ctx->ggml_ctx, x, image_embeds[0].second->type);
                int64_t txt_token_start = 0;
                int64_t txt_token_end   = 0;

@ -791,23 +785,23 @@ namespace Qwen {
                    }
                    txt_token_end = image_embeds[i].first;

-                    auto txt_embed = ggml_ext_slice(ctx, raw_x, 1, txt_token_start, txt_token_end);
+                    auto txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end);
                    if (input_embed == nullptr) {
                        input_embed = txt_embed;
                    } else {
-                        input_embed = ggml_concat(ctx, input_embed, txt_embed, 1);
+                        input_embed = ggml_concat(ctx->ggml_ctx, input_embed, txt_embed, 1);
                    }

                    auto image_embed = image_embeds[i].second;
-                    input_embed      = ggml_concat(ctx, input_embed, image_embed, 1);
+                    input_embed      = ggml_concat(ctx->ggml_ctx, input_embed, image_embed, 1);
                }

                txt_token_start = image_embeds[image_embeds.size() - 1].first + image_embeds[image_embeds.size() - 1].second->ne[1];
                txt_token_end   = raw_x->ne[1];

-                auto final_txt_embed = ggml_ext_slice(ctx, raw_x, 1, txt_token_start, txt_token_end);
+                auto final_txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end);

-                input_embed = ggml_concat(ctx, input_embed, final_txt_embed, 1);
+                input_embed = ggml_concat(ctx->ggml_ctx, input_embed, final_txt_embed, 1);
                GGML_ASSERT(raw_x->ne[1] == input_embed->ne[1]);

                x = input_embed;
@ -816,7 +810,7 @@ namespace Qwen {
            for (int i = 0; i < num_layers; i++) {
                auto block = std::dynamic_pointer_cast<Qwen2_5_VLBlock>(blocks["layers." + std::to_string(i)]);

-                x = block->forward(ctx, backend, x, input_pos);
+                x = block->forward(ctx, x, input_pos);
            }

            x = norm->forward(ctx, x);
@ -880,20 +874,18 @@ namespace Qwen {
            }
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* input_ids,
                                    struct ggml_tensor* input_pos,
                                    std::vector<std::pair<int, ggml_tensor*>> image_embeds) {
            // input_ids: [N, n_token]
            auto model = std::dynamic_pointer_cast<Qwen2_5_VLTextModel>(blocks["model"]);

-            auto x = model->forward(ctx, backend, input_ids, input_pos, image_embeds);
+            auto x = model->forward(ctx, input_ids, input_pos, image_embeds);
            return x;
        }

-        struct ggml_tensor* vision_forward(struct ggml_context* ctx,
-                                           ggml_backend_t backend,
+        struct ggml_tensor* vision_forward(GGMLRunnerContext* ctx,
                                           struct ggml_tensor* pixel_values,
                                           struct ggml_tensor* pe,
                                           struct ggml_tensor* window_index,
@ -901,7 +893,7 @@ namespace Qwen {
                                           struct ggml_tensor* window_mask) {
            GGML_ASSERT(enable_vision);
            auto vision_model = std::dynamic_pointer_cast<Qwen2_5_VLVisionModel>(blocks["visual"]);
-            return vision_model->forward(ctx, backend, pixel_values, pe, window_index, window_inverse_index, window_mask);
+            return vision_model->forward(ctx, pixel_values, pe, window_index, window_inverse_index, window_mask);
        }
    };

@ -918,13 +910,13 @@ namespace Qwen {

        Qwen2_5_VLRunner(ggml_backend_t backend,
                         bool offload_params_to_cpu,
-                         const String2GGMLType& tensor_types,
+                         const String2TensorStorage& tensor_storage_map,
                         const std::string prefix,
                         bool enable_vision_ = false)
            : GGMLRunner(backend, offload_params_to_cpu), enable_vision(enable_vision_) {
            bool have_vision_weight = false;
            bool llama_cpp_style    = false;
-            for (auto pair : tensor_types) {
+            for (auto pair : tensor_storage_map) {
                std::string tensor_name = pair.first;
                if (tensor_name.find(prefix) == std::string::npos)
                    continue;
@ -948,7 +940,7 @@ namespace Qwen {
                }
            }
            model = Qwen2_5_VL(params, enable_vision, llama_cpp_style);
-            model.init(params_ctx, tensor_types, prefix);
+            model.init(params_ctx, tensor_storage_map, prefix);
        }

        std::string get_desc() override {
@ -959,23 +951,21 @@ namespace Qwen {
            model.get_param_tensors(tensors, prefix);
        }

-        struct ggml_tensor* forward(struct ggml_context* ctx,
-                                    ggml_backend_t backend,
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                    struct ggml_tensor* input_ids,
                                    struct ggml_tensor* input_pos,
                                    std::vector<std::pair<int, ggml_tensor*>> image_embeds) {
-            auto hidden_states = model.forward(ctx, backend, input_ids, input_pos, image_embeds);  // [N, n_token, hidden_size]
+            auto hidden_states = model.forward(ctx, input_ids, input_pos, image_embeds);  // [N, n_token, hidden_size]
            return hidden_states;
        }

-        struct ggml_tensor* vision_forward(struct ggml_context* ctx,
-                                           ggml_backend_t backend,
+        struct ggml_tensor* vision_forward(GGMLRunnerContext* ctx,
                                           struct ggml_tensor* pixel_values,
                                           struct ggml_tensor* input_pos,
                                           struct ggml_tensor* window_index,
                                           struct ggml_tensor* window_inverse_index,
                                           struct ggml_tensor* window_mask) {
-            auto hidden_states = model.vision_forward(ctx, backend, pixel_values, input_pos, window_index, window_inverse_index, window_mask);
+            auto hidden_states = model.vision_forward(ctx, pixel_values, input_pos, window_index, window_inverse_index, window_mask);
            return hidden_states;
        }

@ -1002,7 +992,9 @@ namespace Qwen {
                                                n_tokens * 4);
            set_backend_tensor_data(input_pos, input_pos_vec.data());

-            struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, input_pos, image_embeds);
+            auto runner_ctx = get_context();
+
+            struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, image_embeds);

            ggml_build_forward_expand(gf, hidden_states);

@ -1167,8 +1159,8 @@ namespace Qwen {
            // pe->data = nullptr;
            set_backend_tensor_data(pe, pe_vec.data());

-            struct ggml_tensor* hidden_states = vision_forward(compute_ctx,
-                                                               runtime_backend,
+            auto runner_ctx                   = get_context();
+            struct ggml_tensor* hidden_states = vision_forward(&runner_ctx,
                                                               pixel_values,
                                                               pe,
                                                               window_index,
@ -1196,10 +1188,10 @@ namespace Qwen {

        Qwen2_5_VLEmbedder(ggml_backend_t backend,
                           bool offload_params_to_cpu,
-                           const String2GGMLType& tensor_types = {},
-                           const std::string prefix            = "",
-                           bool enable_vision                  = false)
-            : model(backend, offload_params_to_cpu, tensor_types, prefix, enable_vision) {
+                           const String2TensorStorage& tensor_storage_map = {},
+                           const std::string prefix                       = "",
+                           bool enable_vision                             = false)
+            : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, enable_vision) {
        }

        void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@ -1350,22 +1342,21 @@ namespace Qwen {
            ggml_type model_data_type = GGML_TYPE_F16;

            ModelLoader model_loader;
-            if (!model_loader.init_from_file(file_path, "qwen2vl.")) {
+            if (!model_loader.init_from_file_and_convert_name(file_path, "qwen2vl.")) {
                LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
                return;
            }

-            auto tensor_types = model_loader.tensor_storages_types;
-            for (auto& item : tensor_types) {
-                // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
-                if (ends_with(item.first, "weight")) {
-                    item.second = model_data_type;
+            auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+            for (auto& [name, tensor_storage] : tensor_storage_map) {
+                if (ends_with(name, "weight")) {
+                    tensor_storage.expected_type = model_data_type;
                }
            }

            std::shared_ptr<Qwen2_5_VLEmbedder> qwenvl = std::make_shared<Qwen2_5_VLEmbedder>(backend,
                                                                                              false,
-                                                                                              tensor_types,
+                                                                                              tensor_storage_map,
                                                                                              "qwen2vl",
                                                                                              true);

--- a/otherarch/sdcpp/rope.hpp
+++ b/otherarch/sdcpp/rope.hpp
@ -386,23 +386,21 @@ namespace Rope {
        return x_out;
    }

-    __STATIC_INLINE__ struct ggml_tensor* attention(struct ggml_context* ctx,
-                                                    ggml_backend_t backend,
+    __STATIC_INLINE__ struct ggml_tensor* attention(GGMLRunnerContext* ctx,
                                                    struct ggml_tensor* q,
                                                    struct ggml_tensor* k,
                                                    struct ggml_tensor* v,
                                                    struct ggml_tensor* pe,
                                                    struct ggml_tensor* mask,
-                                                    bool flash_attn,
                                                    float kv_scale        = 1.0f,
                                                    bool rope_interleaved = true) {
        // q,k,v: [N, L, n_head, d_head]
        // pe: [L, d_head/2, 2, 2]
        // return: [N, L, n_head*d_head]
-        q = apply_rope(ctx, q, pe, rope_interleaved);  // [N*n_head, L, d_head]
-        k = apply_rope(ctx, k, pe, rope_interleaved);  // [N*n_head, L, d_head]
+        q = apply_rope(ctx->ggml_ctx, q, pe, rope_interleaved);  // [N*n_head, L, d_head]
+        k = apply_rope(ctx->ggml_ctx, k, pe, rope_interleaved);  // [N*n_head, L, d_head]

-        auto x = ggml_ext_attention_ext(ctx, backend, q, k, v, v->ne[1], mask, false, true, flash_attn, kv_scale);  // [N, L, n_head*d_head]
+        auto x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, v->ne[1], mask, false, true, ctx->flash_attn_enabled, kv_scale);  // [N, L, n_head*d_head]
        return x;
    }
 };  // namespace Rope
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@ -25,6 +25,7 @@ std::string sd_load_qwen2_merges();
 #include "flux.hpp"
 #include "stable-diffusion.cpp"
 #include "util.cpp"
+#include "name_conversion.cpp"
 #include "upscaler.cpp"
 #include "model.cpp"
 #include "tokenize_util.cpp"
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@ -16,6 +16,8 @@
 #include "tae.hpp"
 #include "vae.hpp"

+#include "latent-preview.h"
+
 const char* model_version_to_str[] = {
    "SD 1.x",
    "SD 1.x Inpaint",
@ -23,6 +25,7 @@ const char* model_version_to_str[] = {
    "SD 1.x Tiny UNet",
    "SD 2.x",
    "SD 2.x Inpaint",
+    "SD 2.x Tiny UNet",
    "SDXL",
    "SDXL Inpaint",
    "SDXL Instruct-Pix2Pix",
@ -73,6 +76,14 @@ void calculate_alphas_cumprod(float* alphas_cumprod,
    }
 }

+void suppress_pp(int step, int steps, float time, void* data) {
+    (void)step;
+    (void)steps;
+    (void)time;
+    (void)data;
+    return;
+}
+
 /*=============================================== StableDiffusionGGML ================================================*/

 class StableDiffusionGGML {
@ -229,7 +240,7 @@ public:
            }
        }

-        bool is_unet = model_loader.model_is_unet();
+        bool is_unet = sd_version_is_unet(model_loader.get_sd_version());
        int tempver = model_loader.get_sd_version();
        bool iswan = (tempver==VERSION_WAN2 || tempver==VERSION_WAN2_2_I2V || tempver==VERSION_WAN2_2_TI2V);
        bool isqwenimg = (tempver==VERSION_QWEN_IMAGE);
@ -331,15 +342,18 @@ public:
            }
        }

+        model_loader.convert_tensors_name();
+
        version = model_loader.get_sd_version();

        // kcpp fallback to separate diffusion model passed as model
        if (version == VERSION_COUNT &&
            strlen(SAFE_STR(sd_ctx_params->model_path)) > 0 &&
-            strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) == 0)
+            strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) == 0 &&
+            t5_path_fixed!="" )
        {
-            // bool endswithsafetensors = ends_with(sd_ctx_params->model_path, ".safetensors");
-            if(!model_loader.has_diffusion_model_tensors())
+            bool endswithsafetensors = ends_with(sd_ctx_params->model_path, ".safetensors");
+            if(endswithsafetensors && !model_loader.has_diffusion_model_tensors())
            {
                LOG_INFO("SD Diffusion Model tensors missing! Fallback trying alternative tensor names...\n");
                if (!model_loader.init_from_file(sd_ctx_params->model_path, "model.diffusion_model.")) {
@ -354,12 +368,12 @@ public:
            return false;
        }

-        auto& tensor_types = model_loader.tensor_storages_types;
-        for (auto& item : tensor_types) {
-            // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
-            if (contains(item.first, "qwen2vl") && ends_with(item.first, "weight") && (item.second == GGML_TYPE_F32 || item.second == GGML_TYPE_BF16)) {
-                item.second = GGML_TYPE_F16;
-                // LOG_DEBUG(" change %s %u", item.first.c_str(), item.second);
+        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+        for (auto& [name, tensor_storage] : tensor_storage_map) {
+            if (contains(name, "qwen2vl") &&
+                ends_with(name, "weight") &&
+                (tensor_storage.type == GGML_TYPE_F32 || tensor_storage.type == GGML_TYPE_BF16)) {
+                tensor_storage.expected_type = GGML_TYPE_F16;
            }
        }

@ -459,20 +473,16 @@ public:
                LOG_INFO("CLIP: Using CPU backend");
                clip_backend = ggml_backend_cpu_init();
            }
-            if (sd_ctx_params->diffusion_flash_attn) {
-                LOG_INFO("Using flash attention in the diffusion model");
-            }
            if (sd_version_is_sd3(version)) {
                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
-                                                                     model_loader.tensor_storages_types);
+                                                                     tensor_storage_map);
                diffusion_model  = std::make_shared<MMDiTModel>(backend,
                                                               offload_params_to_cpu,
-                                                               sd_ctx_params->diffusion_flash_attn,
-                                                               model_loader.tensor_storages_types);
+                                                               tensor_storage_map);
            } else if (sd_version_is_flux(version)) {
                bool is_chroma = false;
-                for (auto pair : model_loader.tensor_storages_types) {
+                for (auto pair : tensor_storage_map) {
                    if (pair.first.find("distilled_guidance_layer.in_proj.weight") != std::string::npos) {
                        is_chroma = true;
                        break;
@ -490,45 +500,42 @@ public:

                    cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                        offload_params_to_cpu,
-                                                                        model_loader.tensor_storages_types,
+                                                                        tensor_storage_map,
                                                                        sd_ctx_params->chroma_use_t5_mask,
                                                                        sd_ctx_params->chroma_t5_mask_pad);
                } else {
                    cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend,
                                                                          offload_params_to_cpu,
-                                                                          model_loader.tensor_storages_types);
+                                                                          tensor_storage_map);
                }
                diffusion_model = std::make_shared<FluxModel>(backend,
                                                              offload_params_to_cpu,
-                                                              model_loader.tensor_storages_types,
+                                                              tensor_storage_map,
                                                              version,
-                                                              sd_ctx_params->diffusion_flash_attn,
                                                              sd_ctx_params->chroma_use_dit_mask);
            } else if (sd_version_is_wan(version)) {
                cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                    offload_params_to_cpu,
-                                                                    model_loader.tensor_storages_types,
+                                                                    tensor_storage_map,
                                                                    true,
                                                                    1,
                                                                    true);
                diffusion_model  = std::make_shared<WanModel>(backend,
                                                             offload_params_to_cpu,
-                                                             model_loader.tensor_storages_types,
+                                                             tensor_storage_map,
                                                             "model.diffusion_model",
-                                                             version,
-                                                             sd_ctx_params->diffusion_flash_attn);
+                                                             version);
                if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
                    high_noise_diffusion_model = std::make_shared<WanModel>(backend,
                                                                            offload_params_to_cpu,
-                                                                            model_loader.tensor_storages_types,
+                                                                            tensor_storage_map,
                                                                            "model.high_noise_diffusion_model",
-                                                                            version,
-                                                                            sd_ctx_params->diffusion_flash_attn);
+                                                                            version);
                }
                if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B") {
                    clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend,
                                                                             offload_params_to_cpu,
-                                                                             model_loader.tensor_storages_types);
+                                                                             tensor_storage_map);
                    clip_vision->alloc_params_buffer();
                    clip_vision->get_param_tensors(tensors);
                }
@ -539,41 +546,44 @@ public:
                }
                cond_stage_model = std::make_shared<Qwen2_5_VLCLIPEmbedder>(clip_backend,
                                                                            offload_params_to_cpu,
-                                                                            model_loader.tensor_storages_types,
+                                                                            tensor_storage_map,
                                                                            "",
                                                                            enable_vision);
                diffusion_model  = std::make_shared<QwenImageModel>(backend,
                                                                   offload_params_to_cpu,
-                                                                   model_loader.tensor_storages_types,
+                                                                   tensor_storage_map,
                                                                   "model.diffusion_model",
-                                                                   version,
-                                                                   sd_ctx_params->diffusion_flash_attn);
+                                                                   version);
            } else {  // SD1.x SD2.x SDXL
                if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
                                                                                           offload_params_to_cpu,
-                                                                                           model_loader.tensor_storages_types,
+                                                                                           tensor_storage_map,
                                                                                           SAFE_STR(sd_ctx_params->embedding_dir),
                                                                                           version,
                                                                                           PM_VERSION_2);
                } else {
                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
                                                                                           offload_params_to_cpu,
-                                                                                           model_loader.tensor_storages_types,
+                                                                                           tensor_storage_map,
                                                                                           SAFE_STR(sd_ctx_params->embedding_dir),
                                                                                           version);
                }
                diffusion_model = std::make_shared<UNetModel>(backend,
                                                              offload_params_to_cpu,
-                                                              model_loader.tensor_storages_types,
-                                                              version,
-                                                              sd_ctx_params->diffusion_flash_attn);
+                                                              tensor_storage_map,
+                                                              version);
                if (sd_ctx_params->diffusion_conv_direct) {
                    LOG_INFO("Using Conv2d direct in the diffusion model");
-                    std::dynamic_pointer_cast<UNetModel>(diffusion_model)->unet.enable_conv2d_direct();
+                    std::dynamic_pointer_cast<UNetModel>(diffusion_model)->unet.set_conv2d_direct_enabled(true);
                }
            }

+            if (sd_ctx_params->diffusion_flash_attn) {
+                LOG_INFO("Using flash attention in the diffusion model");
+                diffusion_model->set_flash_attn_enabled(true);
+            }
+
            cond_stage_model->alloc_params_buffer();
            cond_stage_model->get_param_tensors(tensors);

@ -599,7 +609,7 @@ public:
            if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
                first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
                                                                        offload_params_to_cpu,
-                                                                        model_loader.tensor_storages_types,
+                                                                        tensor_storage_map,
                                                                        "first_stage_model",
                                                                        vae_decode_only,
                                                                        version);
@ -608,17 +618,17 @@ public:
            } else if (version == VERSION_CHROMA_RADIANCE) {
                first_stage_model = std::make_shared<FakeVAE>(vae_backend,
                                                              offload_params_to_cpu);
-            } else if (!use_tiny_autoencoder) {
+            } else if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
                first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
                                                                    offload_params_to_cpu,
-                                                                    model_loader.tensor_storages_types,
+                                                                    tensor_storage_map,
                                                                    "first_stage_model",
                                                                    vae_decode_only,
                                                                    false,
                                                                    version);
                if (sd_ctx_params->vae_conv_direct) {
                    LOG_INFO("Using Conv2d direct in the vae model");
-                    first_stage_model->enable_conv2d_direct();
+                    first_stage_model->set_conv2d_direct_enabled(true);
                }
                if (version == VERSION_SDXL &&
                    (strlen(SAFE_STR(sd_ctx_params->vae_path)) == 0 || sd_ctx_params->force_sdxl_vae_conv_scale)) {
@ -631,16 +641,17 @@ public:
                }
                first_stage_model->alloc_params_buffer();
                first_stage_model->get_param_tensors(tensors, "first_stage_model");
-            } else {
+            }
+            if (use_tiny_autoencoder) {
                tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend,
                                                                    offload_params_to_cpu,
-                                                                    model_loader.tensor_storages_types,
+                                                                    tensor_storage_map,
                                                                    "decoder.layers",
                                                                    vae_decode_only,
                                                                    version);
                if (sd_ctx_params->vae_conv_direct) {
                    LOG_INFO("Using Conv2d direct in the tae model");
-                    tae_first_stage->enable_conv2d_direct();
+                    tae_first_stage->set_conv2d_direct_enabled(true);
                }
            }
            // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
@ -655,18 +666,18 @@ public:
                }
                control_net = std::make_shared<ControlNet>(controlnet_backend,
                                                           offload_params_to_cpu,
-                                                           model_loader.tensor_storages_types,
+                                                           tensor_storage_map,
                                                           version);
                if (sd_ctx_params->diffusion_conv_direct) {
                    LOG_INFO("Using Conv2d direct in the control net");
-                    control_net->enable_conv2d_direct();
+                    control_net->set_conv2d_direct_enabled(true);
                }
            }

            if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
                pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
                                                                   offload_params_to_cpu,
-                                                                   model_loader.tensor_storages_types,
+                                                                   tensor_storage_map,
                                                                   "pmid",
                                                                   version,
                                                                   PM_VERSION_2);
@ -674,7 +685,7 @@ public:
            } else {
                pmid_model = std::make_shared<PhotoMakerIDEncoder>(backend,
                                                                   offload_params_to_cpu,
-                                                                   model_loader.tensor_storages_types,
+                                                                   tensor_storage_map,
                                                                   "pmid",
                                                                   version);
            }
@ -682,13 +693,13 @@ public:
              if (version != VERSION_SDXL) { // kcpp
                printf("\n!!!!\nWARNING: PhotoMaker is only compatible with SDXL models. PhotoMaker will be disabled!\n!!!!\n");
              } else {
-                pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");
+                pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "", version);
                if (!pmid_lora->load_from_file(true, n_threads)) {
                    LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path);
                    return false;
                }
                LOG_INFO("loading stacked ID embedding (PHOTOMAKER) model file from '%s'", sd_ctx_params->photo_maker_path);
-                if (!model_loader.init_from_file(sd_ctx_params->photo_maker_path, "pmid.")) {
+                if (!model_loader.init_from_file_and_convert_name(sd_ctx_params->photo_maker_path, "pmid.")) {
                    LOG_WARN("loading stacked ID embedding from '%s' failed", sd_ctx_params->photo_maker_path);
                } else {
                    stacked_id = true;
@ -723,7 +734,7 @@ public:
            ignore_tensors.insert("first_stage_model.");
        }
        if (stacked_id) {
-            ignore_tensors.insert("lora.");
+            ignore_tensors.insert("pmid.unet.");
        }

        if (vae_decode_only) {
@ -751,9 +762,10 @@ public:
                unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
            }
            size_t vae_params_mem_size = 0;
-            if (!use_tiny_autoencoder) {
+            if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
                vae_params_mem_size = first_stage_model->get_params_buffer_size();
-            } else {
+            }
+            if (use_tiny_autoencoder) {
                if (!tae_first_stage->load_from_file(taesd_path_fixed, n_threads)) {
                    return false;
                }
@ -859,12 +871,12 @@ public:
                    is_using_v_parameterization = true;
                }
            } else if (sd_version_is_sdxl(version)) {
-                if (model_loader.tensor_storages_types.find("edm_vpred.sigma_max") != model_loader.tensor_storages_types.end()) {
+                if (tensor_storage_map.find("edm_vpred.sigma_max") != tensor_storage_map.end()) {
                    // CosXL models
                    // TODO: get sigma_min and sigma_max values from file
                    is_using_edm_v_parameterization = true;
                }
-                if (model_loader.tensor_storages_types.find("v_pred") != model_loader.tensor_storages_types.end()) {
+                if (tensor_storage_map.find("v_pred") != tensor_storage_map.end()) {
                    is_using_v_parameterization = true;
                }
            } else if (version == VERSION_SVD) {
@ -884,10 +896,9 @@ public:
                float shift = sd_ctx_params->flow_shift;
                if (shift == INFINITY) {
                    shift = 1.0f;  // TODO: validate
-                    for (auto pair : model_loader.tensor_storages_types) {
-                        if (pair.first.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
+                    for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                        if (starts_with(name, "model.diffusion_model.guidance_in.in_layer.weight")) {
                            shift = 1.15f;
-                            break;
                        }
                    }
                }
@ -927,6 +938,7 @@ public:

        LOG_DEBUG("finished loaded file");
        ggml_free(ctx);
+        use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only;
        return true;
    }

@ -1031,7 +1043,7 @@ public:
            LOG_WARN("can not find %s for lora %s", st_file_path.c_str(), lora_path.c_str());
            return;
        }
-        LoraModel lora(backend, file_path);
+        LoraModel lora(backend, file_path, "", version);
        if (!lora.load_from_file(false, n_threads)) {
            LOG_WARN("load lora tensors from %s failed", file_path.c_str());
            return;
@ -1068,7 +1080,7 @@ public:
            LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
            return;
        }
-        LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "");
+        LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "", version);
        if (!lora.load_from_file(false, n_threads)) {
            LOG_WARN("load lora tensors from %s failed", file_path.c_str());
            return;
@ -1270,6 +1282,156 @@ public:
        }
    }

+    // Runs sd_tiling with progress reporting suppressed: the progress callback is
+    // swapped for suppress_pp for the duration of the call, then the previously
+    // installed callback and its user data are put back.
+    void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
+        // Scope guard: reinstates the caller's callback on every exit path, including
+        // an exception propagating out of sd_tiling / on_processing.
+        struct ProgressCallbackRestorer {
+            sd_progress_cb_t cb;
+            void* data;
+            ~ProgressCallbackRestorer() { sd_set_progress_callback(cb, data); }
+        } restore_on_exit{sd_get_progress_callback(), sd_get_progress_callback_data()};
+
+        sd_set_progress_callback((sd_progress_cb_t)suppress_pp, nullptr);
+        sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing);
+    }
+
+    /**
+     * Builds preview frame(s) from `latents` and hands them to `step_callback`.
+     *
+     * - PREVIEW_PROJ: converts the latents to RGB with preview_latent_video, using the
+     *   projection/bias tables selected from the latent channel count and `version`.
+     *   Returns without calling the callback when no table is known for the model.
+     * - PREVIEW_VAE: decodes with first_stage_model (tiled when vae_tiling_params.enabled).
+     *   `latents` is passed through process_latent_out before decoding and through
+     *   process_latent_in afterwards.
+     * - PREVIEW_TAE: decodes with tae_first_stage (tiled when vae_tiling_params.enabled).
+     * - Any other mode: returns without calling the callback.
+     *
+     * @param work_ctx       context passed to compute() for the non-tiled decode
+     * @param step           forwarded unchanged to step_callback
+     * @param latents        latents to preview
+     * @param version        model version used to pick the projection tables
+     * @param preview_mode   which preview path to take
+     * @param result         decode output tensor (not used by PREVIEW_PROJ); it is clamped
+     *                       to [0, 1] and scaled by 0 after the callback returns
+     * @param step_callback  called as (step, frame count, frames, is_noisy); the frame
+     *                       buffers are released as soon as it returns
+     * @param is_noisy       forwarded unchanged to step_callback
+     */
+    void preview_image(ggml_context* work_ctx,
+                       int step,
+                       struct ggml_tensor* latents,
+                       enum SDVersion version,
+                       preview_t preview_mode,
+                       ggml_tensor* result,
+                       std::function<void(int, int, sd_image_t*, bool)> step_callback,
+                       bool is_noisy) {
+        const uint32_t channel = 3;
+        uint32_t width         = latents->ne[0];
+        uint32_t height        = latents->ne[1];
+        uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];
+
+        if (preview_mode == PREVIEW_PROJ) {
+            const float(*latent_rgb_proj)[channel] = nullptr;
+            float* latent_rgb_bias                 = nullptr;
+
+            if (dim == 48) {
+                if (sd_version_is_wan(version)) {
+                    latent_rgb_proj = wan_22_latent_rgb_proj;
+                    latent_rgb_bias = wan_22_latent_rgb_bias;
+                } else {
+                    LOG_WARN("No latent to RGB projection known for this model");
+                    // unknown model
+                    return;
+                }
+            } else if (dim == 16) {
+                // 16 channels VAE -> Flux or SD3
+
+                if (sd_version_is_sd3(version)) {
+                    latent_rgb_proj = sd3_latent_rgb_proj;
+                    latent_rgb_bias = sd3_latent_rgb_bias;
+                } else if (sd_version_is_flux(version)) {
+                    latent_rgb_proj = flux_latent_rgb_proj;
+                    latent_rgb_bias = flux_latent_rgb_bias;
+                } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
+                    latent_rgb_proj = wan_21_latent_rgb_proj;
+                    latent_rgb_bias = wan_21_latent_rgb_bias;
+                } else {
+                    LOG_WARN("No latent to RGB projection known for this model");
+                    // unknown model
+                    return;
+                }
+
+            } else if (dim == 4) {
+                // 4 channels VAE
+                if (sd_version_is_sdxl(version)) {
+                    latent_rgb_proj = sdxl_latent_rgb_proj;
+                    latent_rgb_bias = sdxl_latent_rgb_bias;
+                } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
+                    latent_rgb_proj = sd_latent_rgb_proj;
+                    latent_rgb_bias = sd_latent_rgb_bias;
+                } else {
+                    // unknown model
+                    LOG_WARN("No latent to RGB projection known for this model");
+                    return;
+                }
+            } else if (dim == 3) {
+                // Do nothing, assuming already RGB latents
+            } else {
+                LOG_WARN("No latent to RGB projection known for this model");
+                // unknown latent space
+                return;
+            }
+
+            uint32_t frames = 1;
+            if (ggml_n_dims(latents) == 4) {
+                frames = latents->ne[2];
+            }
+
+            // Sizes and offsets are computed in size_t so large previews cannot wrap
+            // 32-bit arithmetic; the vectors release the buffers on every exit path,
+            // including an exception thrown by step_callback.
+            const size_t frame_bytes = static_cast<size_t>(width) * height * channel;
+            std::vector<uint8_t> data(frame_bytes * frames);
+
+            preview_latent_video(data.data(), latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
+            std::vector<sd_image_t> images(frames);
+            for (uint32_t i = 0; i < frames; i++) {
+                images[i] = {width, height, channel, data.data() + i * frame_bytes};
+            }
+            step_callback(step, frames, images.data(), is_noisy);
+        } else {
+            if (preview_mode == PREVIEW_VAE) {
+                // first_stage_model is not created for every configuration (e.g. when
+                // only the tiny autoencoder is loaded), so guard it the same way the
+                // TAE path guards tae_first_stage. Checked before touching `latents`.
+                if (first_stage_model == nullptr) {
+                    LOG_WARN("VAE not found for preview");
+                    return;
+                }
+                process_latent_out(latents);
+                if (vae_tiling_params.enabled) {
+                    // split latent in 32x32 tiles and compute in several steps
+                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                        first_stage_model->compute(n_threads, in, true, &out, nullptr);
+                    };
+                    silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);
+
+                } else {
+                    first_stage_model->compute(n_threads, latents, true, &result, work_ctx);
+                }
+
+                first_stage_model->free_compute_buffer();
+                process_vae_output_tensor(result);
+                // undo process_latent_out so the caller's latents are left as they were passed in
+                process_latent_in(latents);
+            } else if (preview_mode == PREVIEW_TAE) {
+                if (tae_first_stage == nullptr) {
+                    LOG_WARN("TAE not found for preview");
+                    return;
+                }
+                if (vae_tiling_params.enabled) {
+                    // split latent in 64x64 tiles and compute in several steps
+                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                        tae_first_stage->compute(n_threads, in, true, &out, nullptr);
+                    };
+                    silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
+                } else {
+                    tae_first_stage->compute(n_threads, latents, true, &result, work_ctx);
+                }
+                tae_first_stage->free_compute_buffer();
+            } else {
+                return;
+            }
+
+            ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
+            uint32_t frames = 1;
+            if (ggml_n_dims(latents) == 4) {
+                frames = result->ne[2];
+            }
+
+            std::vector<sd_image_t> images(frames);
+            // print_ggml_tensor(result,true);
+            for (size_t i = 0; i < frames; i++) {
+                images[i].width   = result->ne[0];
+                images[i].height  = result->ne[1];
+                images[i].channel = 3;
+                images[i].data    = ggml_tensor_to_sd_image(result, i, ggml_n_dims(latents) == 4);
+            }
+
+            step_callback(step, frames, images.data(), is_noisy);
+
+            // zero the shared preview tensor so the next preview starts from a clean buffer
+            ggml_ext_tensor_scale_inplace(result, 0);
+            for (sd_image_t& image : images) {
+                free(image.data);
+            }
+        }
+    }
+
    ggml_tensor* sample(ggml_context* work_ctx,
                        std::shared_ptr<DiffusionModel> work_diffusion_model,
                        bool inverse_noise_scaling,
@ -1345,7 +1507,34 @@ public:

        int64_t t0 = ggml_time_us();

+        struct ggml_tensor* preview_tensor = nullptr;
+        auto sd_preview_mode               = sd_get_preview_mode();
+        if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
+            int64_t W = x->ne[0] * get_vae_scale_factor();
+            int64_t H = x->ne[1] * get_vae_scale_factor();
+            if (ggml_n_dims(x) == 4) {
+                // assuming video mode (if batch processing gets implemented this will break)
+                int T = x->ne[2];
+                if (sd_version_is_wan(version)) {
+                    T = ((T - 1) * 4) + 1;
+                }
+                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
+                                                    W,
+                                                    H,
+                                                    T,
+                                                    3);
+            } else {
+                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
+                                                    W,
+                                                    H,
+                                                    3,
+                                                    x->ne[3]);
+            }
+        }
+
        auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
+            auto sd_preview_cb   = sd_get_preview_callback();
+            auto sd_preview_mode = sd_get_preview_mode();
            if (step == 1 || step == -1) {
                pretty_progress(0, (int)steps, 0);
            }
@ -1380,6 +1569,11 @@ public:
            if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) {
                apply_mask(noised_input, init_latent, denoise_mask);
            }
+            if (sd_preview_cb != nullptr && sd_should_preview_noisy()) {
+                if (step % sd_get_preview_interval() == 0) {
+                    preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, true);
+                }
+            }

            std::vector<struct ggml_tensor*> controls;

@ -1501,16 +1695,22 @@ public:
                vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
            }

+            if (denoise_mask != nullptr) {
+                apply_mask(denoised, init_latent, denoise_mask);
+            }
+
+            if (sd_preview_cb != nullptr && sd_should_preview_denoised()) {
+                if (step % sd_get_preview_interval() == 0) {
+                    preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, false);
+                }
+            }
+
            int64_t t1 = ggml_time_us();
            if (step > 0 || step == -(int)steps) {
                int showstep = std::abs(step);
                pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
            }
-            if (denoise_mask != nullptr) {
-                apply_mask(denoised, init_latent, denoise_mask);
-            }
-
            return denoised;
        };

@ -2016,6 +2216,29 @@ enum prediction_t str_to_prediction(const char* str) {
    return PREDICTION_COUNT;
 }

+// Display names for the preview modes, indexed by preview_t value.
+const char* preview_to_str[] = {
+    "none",
+    "proj",
+    "tae",
+    "vae",
+};
+
+// Both lookups below index this table with values in [0, PREVIEW_COUNT), so the
+// table must grow whenever a new preview_t enumerator is added.
+static_assert(sizeof(preview_to_str) / sizeof(preview_to_str[0]) == static_cast<size_t>(PREVIEW_COUNT),
+              "preview_to_str must have exactly one entry per preview_t value");
+
+// Returns the display name of `preview`, or NONE_STR when the value is not below PREVIEW_COUNT.
+const char* sd_preview_name(enum preview_t preview) {
+    if (preview < PREVIEW_COUNT) {
+        return preview_to_str[preview];
+    }
+    return NONE_STR;
+}
+
+// Parses a preview mode name (exact, case-sensitive match against preview_to_str).
+// Returns PREVIEW_COUNT when `str` is null or matches no known name.
+enum preview_t str_to_preview(const char* str) {
+    if (str == nullptr) {
+        // strcmp on a null pointer is undefined behaviour; report "unknown" instead
+        return PREVIEW_COUNT;
+    }
+    for (int i = 0; i < PREVIEW_COUNT; i++) {
+        if (!strcmp(str, preview_to_str[i])) {
+            return (enum preview_t)i;
+        }
+    }
+    return PREVIEW_COUNT;
+}
+
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    *sd_ctx_params                         = {};
    sd_ctx_params->vae_decode_only         = true;
@ -2384,18 +2607,24 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                id_embeds = load_tensor_from_file(work_ctx, pm_params.id_embed_path);
                // print_ggml_tensor(id_embeds, true, "id_embeds:");
            }
-            id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask);
-            int64_t t1          = ggml_time_ms();
-            LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
-            if (sd_ctx->sd->free_params_immediately) {
-                sd_ctx->sd->pmid_model->free_params_buffer();
-            }
-            // Encode input prompt without the trigger word for delayed conditioning
-            prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt);
-            // printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str());
-            prompt = prompt_text_only;  //
-            if (sample_steps < 50) {
-                LOG_WARN("It's recommended to use >= 50 steps for photo maker!");
+            if (pmv2 && id_embeds == nullptr) {
+                LOG_WARN("Provided PhotoMaker images, but NO valid ID embeds file for PM v2");
+                LOG_WARN("Turn off PhotoMaker");
+                sd_ctx->sd->stacked_id = false;
+            } else {
+                id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask);
+                int64_t t1          = ggml_time_ms();
+                LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0);
+                if (sd_ctx->sd->free_params_immediately) {
+                    sd_ctx->sd->pmid_model->free_params_buffer();
+                }
+                // Encode input prompt without the trigger word for delayed conditioning
+                prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt);
+                // printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str());
+                prompt = prompt_text_only;  //
+                if (sample_steps < 50) {
+                    LOG_WARN("It's recommended to use >= 50 steps for photo maker!");
+                }
            }
        } else {
            LOG_WARN("Provided PhotoMaker model file, but NO input ID images");
@ -2752,7 +2981,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                    } else {
                        float m = ggml_ext_tensor_get_f32(mask_img, mx, my);
                        ggml_ext_tensor_set_f32(concat_latent, m, ix, iy, 0);
-                        for (int k = 0; k < masked_latent->ne[2];k++) {
+                        for (int k = 0; k < masked_latent->ne[2]; k++) {
                            float v = ggml_ext_tensor_get_f32(masked_latent, ix, iy, k);
                            ggml_ext_tensor_set_f32(concat_latent, v, ix, iy, k + mask_channels);
                        }
--- a/otherarch/sdcpp/stable-diffusion.h
+++ b/otherarch/sdcpp/stable-diffusion.h
@ -126,6 +126,14 @@ enum sd_log_level_t {
    SD_LOG_ERROR
 };

+// Preview source passed to sd_set_preview_callback as its `mode` argument.
+// The enumerator values are used as indices into the preview name table
+// (see sd_preview_name / str_to_preview).
+enum preview_t {
+    PREVIEW_NONE,  // "none"
+    PREVIEW_PROJ,  // "proj": latents are projected to RGB, no decoder model is run
+    PREVIEW_TAE,   // "tae": latents are decoded with the tiny autoencoder
+    PREVIEW_VAE,   // "vae": latents are decoded with the first-stage (VAE) model
+    PREVIEW_COUNT  // number of modes; also returned by str_to_preview for an unknown name
+};
+
 typedef struct {
    bool enabled;
    int tile_size_x;
@ -162,6 +170,7 @@ typedef struct {
    bool keep_control_net_on_cpu;
    bool keep_vae_on_cpu;
    bool diffusion_flash_attn;
+    bool tae_preview_only;
    bool diffusion_conv_direct;
    bool vae_conv_direct;
    bool force_sdxl_vae_conv_scale;
@ -254,9 +263,11 @@ typedef struct sd_ctx_t sd_ctx_t;

 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
+typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy);

 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
+// Registers the preview callback. `mode` is spelled `enum preview_t` (as the other
+// declarations in this header do) so the header also compiles as C, where a bare
+// enum tag is not a type name.
+SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy);
 SD_API int32_t sd_get_num_physical_cores();
 SD_API const char* sd_get_system_info();

@ -270,6 +281,8 @@ SD_API const char* sd_schedule_name(enum scheduler_t scheduler);
 SD_API enum scheduler_t str_to_schedule(const char* str);
 SD_API const char* sd_prediction_name(enum prediction_t prediction);
 SD_API enum prediction_t str_to_prediction(const char* str);
+SD_API const char* sd_preview_name(enum preview_t preview);
+SD_API enum preview_t str_to_preview(const char* str);

 SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
 SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
--- a/otherarch/sdcpp/t5.hpp
+++ b/otherarch/sdcpp/t5.hpp
@ -461,7 +461,7 @@ protected:
    int64_t hidden_size;
    float eps;

-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
        enum ggml_type wtype = GGML_TYPE_F32;
        params["weight"]     = ggml_new_tensor_1d(ctx, wtype, hidden_size);
    }
@ -472,10 +472,10 @@ public:
        : hidden_size(hidden_size),
          eps(eps) {}

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        struct ggml_tensor* w = params["weight"];
-        x                     = ggml_rms_norm(ctx, x, eps);
-        x                     = ggml_mul(ctx, x, w);
+        x                     = ggml_rms_norm(ctx->ggml_ctx, x, eps);
+        x                     = ggml_mul(ctx->ggml_ctx, x, w);
        return x;
    }
 };
@ -487,13 +487,13 @@ public:
        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [N, n_token, model_dim]
        auto wi = std::dynamic_pointer_cast<Linear>(blocks["wi"]);
        auto wo = std::dynamic_pointer_cast<Linear>(blocks["wo"]);

        x = wi->forward(ctx, x);
-        x = ggml_relu_inplace(ctx, x);
+        x = ggml_relu_inplace(ctx->ggml_ctx, x);
        x = wo->forward(ctx, x);
        return x;
    }
@ -509,15 +509,15 @@ public:
        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [N, n_token, model_dim]
        auto wi_0 = std::dynamic_pointer_cast<Linear>(blocks["wi_0"]);
        auto wi_1 = std::dynamic_pointer_cast<Linear>(blocks["wi_1"]);
        auto wo   = std::dynamic_pointer_cast<Linear>(blocks["wo"]);

-        auto hidden_gelu   = ggml_gelu_inplace(ctx, wi_0->forward(ctx, x));
+        auto hidden_gelu   = ggml_gelu_inplace(ctx->ggml_ctx, wi_0->forward(ctx, x));
        auto hidden_linear = wi_1->forward(ctx, x);
-        x                  = ggml_mul_inplace(ctx, hidden_gelu, hidden_linear);
+        x                  = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear);
        x                  = wo->forward(ctx, x);
        return x;
    }
@ -530,14 +530,14 @@ public:
        blocks["layer_norm"]     = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [N, n_token, model_dim]
        auto DenseReluDense = std::dynamic_pointer_cast<T5DenseGatedActDense>(blocks["DenseReluDense"]);
        auto layer_norm     = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);

        auto forwarded_states = layer_norm->forward(ctx, x);
        forwarded_states      = DenseReluDense->forward(ctx, forwarded_states);
-        x                     = ggml_add_inplace(ctx, forwarded_states, x);
+        x                     = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x);
        return x;
    }
 };
@ -569,18 +569,17 @@ public:
        }
    }

-    struct ggml_tensor* compute_bias(struct ggml_context* ctx,
+    struct ggml_tensor* compute_bias(GGMLRunnerContext* ctx,
                                     struct ggml_tensor* relative_position_bucket) {
        auto relative_attention_bias = std::dynamic_pointer_cast<Embedding>(blocks["relative_attention_bias"]);

-        auto values = relative_attention_bias->forward(ctx, relative_position_bucket);  // shape (query_length, key_length, num_heads)
-        values      = ggml_cont(ctx, ggml_permute(ctx, values, 2, 0, 1, 3));            // shape (1, num_heads, query_length, key_length)
+        auto values = relative_attention_bias->forward(ctx, relative_position_bucket);            // shape (query_length, key_length, num_heads)
+        values      = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3));  // shape (1, num_heads, query_length, key_length)
        return values;
    }

    // x: [N, n_token, model_dim]
-    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
-                                                                ggml_backend_t backend,
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                                struct ggml_tensor* x,
                                                                struct ggml_tensor* past_bias                = nullptr,
                                                                struct ggml_tensor* mask                     = nullptr,
@ -602,16 +601,16 @@ public:
        }
        if (past_bias != nullptr) {
            if (mask != nullptr) {
-                mask = ggml_repeat(ctx, mask, past_bias);
-                mask = ggml_add(ctx, mask, past_bias);
+                mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias);
+                mask = ggml_add(ctx->ggml_ctx, mask, past_bias);
            } else {
                mask = past_bias;
            }
        }

-        k = ggml_scale_inplace(ctx, k, sqrt(d_head));
+        k = ggml_scale_inplace(ctx->ggml_ctx, k, sqrt(d_head));

-        x = ggml_ext_attention_ext(ctx, backend, q, k, v, num_heads, mask);  // [N, n_token, d_head * n_head]
+        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask);  // [N, n_token, d_head * n_head]

        x = out_proj->forward(ctx, x);  // [N, n_token, model_dim]
        return {x, past_bias};
@ -629,8 +628,7 @@ public:
        blocks["layer_norm"]    = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
    }

-    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
-                                                                ggml_backend_t backend,
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                                struct ggml_tensor* x,
                                                                struct ggml_tensor* past_bias                = nullptr,
                                                                struct ggml_tensor* mask                     = nullptr,
@ -640,11 +638,11 @@ public:
        auto layer_norm    = std::dynamic_pointer_cast<T5LayerNorm>(blocks["layer_norm"]);

        auto normed_hidden_state = layer_norm->forward(ctx, x);
-        auto ret                 = SelfAttention->forward(ctx, backend, normed_hidden_state, past_bias, mask, relative_position_bucket);
+        auto ret                 = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket);
        auto output              = ret.first;
        past_bias                = ret.second;

-        x = ggml_add_inplace(ctx, output, x);
+        x = ggml_add_inplace(ctx->ggml_ctx, output, x);
        return {x, past_bias};
    }
 };
@ -656,8 +654,7 @@ public:
        blocks["layer.1"] = std::shared_ptr<GGMLBlock>(new T5LayerFF(model_dim, ff_dim));
    }

-    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(struct ggml_context* ctx,
-                                                                ggml_backend_t backend,
+    std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
                                                                struct ggml_tensor* x,
                                                                struct ggml_tensor* past_bias                = nullptr,
                                                                struct ggml_tensor* mask                     = nullptr,
@ -666,7 +663,7 @@ public:
        auto layer_0 = std::dynamic_pointer_cast<T5LayerSelfAttention>(blocks["layer.0"]);
        auto layer_1 = std::dynamic_pointer_cast<T5LayerFF>(blocks["layer.1"]);

-        auto ret  = layer_0->forward(ctx, backend, x, past_bias, mask, relative_position_bucket);
+        auto ret  = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket);
        x         = ret.first;
        past_bias = ret.second;
        x         = layer_1->forward(ctx, x);
@ -692,8 +689,7 @@ public:
        blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new T5LayerNorm(model_dim));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* past_bias                = nullptr,
                                struct ggml_tensor* attention_mask           = nullptr,
@ -702,7 +698,7 @@ public:
        for (int i = 0; i < num_layers; i++) {
            auto block = std::dynamic_pointer_cast<T5Block>(blocks["block." + std::to_string(i)]);

-            auto ret  = block->forward(ctx, backend, x, past_bias, attention_mask, relative_position_bucket);
+            auto ret  = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
            x         = ret.first;
            past_bias = ret.second;
        }
@ -740,8 +736,7 @@ public:
                                                                     params.model_dim));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* past_bias                = nullptr,
                                struct ggml_tensor* attention_mask           = nullptr,
@ -752,7 +747,7 @@ public:
        auto encoder = std::dynamic_pointer_cast<T5Stack>(blocks["encoder"]);

        auto x = shared->forward(ctx, input_ids);
-        x      = encoder->forward(ctx, backend, x, past_bias, attention_mask, relative_position_bucket);
+        x      = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket);
        return x;
    }
 };
@ -764,7 +759,7 @@ struct T5Runner : public GGMLRunner {

    T5Runner(ggml_backend_t backend,
             bool offload_params_to_cpu,
-             const String2GGMLType& tensor_types,
+             const String2TensorStorage& tensor_storage_map,
             const std::string prefix,
             bool is_umt5 = false)
        : GGMLRunner(backend, offload_params_to_cpu) {
@ -773,7 +768,7 @@ struct T5Runner : public GGMLRunner {
            params.relative_attention = false;
        }
        model = T5(params);
-        model.init(params_ctx, tensor_types, prefix);
+        model.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
@ -784,15 +779,14 @@ struct T5Runner : public GGMLRunner {
        model.get_param_tensors(tensors, prefix);
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* input_ids,
                                struct ggml_tensor* relative_position_bucket,
                                struct ggml_tensor* attention_mask = nullptr) {
        size_t N       = input_ids->ne[1];
        size_t n_token = input_ids->ne[0];

-        auto hidden_states = model.forward(ctx, backend, input_ids, nullptr, attention_mask, relative_position_bucket);  // [N, n_token, model_dim]
+        auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket);  // [N, n_token, model_dim]
        return hidden_states;
    }

@ -818,7 +812,8 @@ struct T5Runner : public GGMLRunner {
                                                           input_ids->ne[0]);
        set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data());

-        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, relative_position_bucket, attention_mask);
+        auto runner_ctx                   = get_context();
+        struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask);

        ggml_build_forward_expand(gf, hidden_states);

@ -910,10 +905,10 @@ struct T5Embedder {

    T5Embedder(ggml_backend_t backend,
               bool offload_params_to_cpu,
-               const String2GGMLType& tensor_types = {},
-               const std::string prefix            = "",
-               bool is_umt5                        = false)
-        : model(backend, offload_params_to_cpu, tensor_types, prefix, is_umt5), tokenizer(is_umt5) {
+               const String2TensorStorage& tensor_storage_map = {},
+               const std::string prefix                       = "",
+               bool is_umt5                                   = false)
+        : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) {
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@ -1009,20 +1004,19 @@ struct T5Embedder {
        ggml_type model_data_type = GGML_TYPE_F16;

        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str());
            return;
        }

-        auto tensor_types = model_loader.tensor_storages_types;
-        for (auto& item : tensor_types) {
-            // LOG_DEBUG("%s %u", item.first.c_str(), item.second);
-            if (ends_with(item.first, "weight")) {
-                item.second = model_data_type;
+        auto& tensor_storage_map = model_loader.get_tensor_storage_map();
+        for (auto& [name, tensor_storage] : tensor_storage_map) {
+            if (ends_with(name, "weight")) {
+                tensor_storage.expected_type = model_data_type;
            }
        }

-        std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, false, tensor_types, "", true);
+        std::shared_ptr<T5Embedder> t5 = std::make_shared<T5Embedder>(backend, false, tensor_storage_map, "", true);

        t5->alloc_params_buffer();
        std::map<std::string, ggml_tensor*> tensors;
--- a/otherarch/sdcpp/tae.hpp
+++ b/otherarch/sdcpp/tae.hpp
@ -29,7 +29,7 @@ public:
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [n, n_in, h, w]
        // return: [n, n_out, h, w]

@ -38,9 +38,9 @@ public:
        auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);

        auto h = conv_0->forward(ctx, x);
-        h      = ggml_relu_inplace(ctx, h);
+        h      = ggml_relu_inplace(ctx->ggml_ctx, h);
        h      = conv_2->forward(ctx, h);
-        h      = ggml_relu_inplace(ctx, h);
+        h      = ggml_relu_inplace(ctx->ggml_ctx, h);
        h      = conv_4->forward(ctx, h);

        if (n_in != n_out) {
@ -49,8 +49,8 @@ public:
            x = skip->forward(ctx, x);
        }

-        h = ggml_add(ctx, h, x);
-        h = ggml_relu_inplace(ctx, h);
+        h = ggml_add(ctx->ggml_ctx, h, x);
+        h = ggml_relu_inplace(ctx->ggml_ctx, h);
        return h;
    }
 };
@ -86,7 +86,7 @@ public:
        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [n, in_channels, h, w]
        // return: [n, z_channels, h/8, w/8]

@ -136,20 +136,20 @@ public:
        blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) override {
        // z: [n, z_channels, h, w]
        // return: [n, out_channels, h*8, w*8]

-        auto h = ggml_scale(ctx, z, 1.0f / 3.0f);
-        h      = ggml_tanh_inplace(ctx, h);
-        h      = ggml_scale(ctx, h, 3.0f);
+        auto h = ggml_scale(ctx->ggml_ctx, z, 1.0f / 3.0f);
+        h      = ggml_tanh_inplace(ctx->ggml_ctx, h);
+        h      = ggml_scale(ctx->ggml_ctx, h, 3.0f);

        for (int i = 0; i < num_blocks * 3 + 10; i++) {
            if (blocks.find(std::to_string(i)) == blocks.end()) {
                if (i == 1) {
-                    h = ggml_relu_inplace(ctx, h);
+                    h = ggml_relu_inplace(ctx->ggml_ctx, h);
                } else {
-                    h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
+                    h = ggml_upscale(ctx->ggml_ctx, h, 2, GGML_SCALE_MODE_NEAREST);
                }
                continue;
            }
@ -180,12 +180,12 @@ public:
        }
    }

-    struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
+    struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
        auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
        return decoder->forward(ctx, z);
    }

-    struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
        return encoder->forward(ctx, x);
    }
@ -197,25 +197,14 @@ struct TinyAutoEncoder : public GGMLRunner {

    TinyAutoEncoder(ggml_backend_t backend,
                    bool offload_params_to_cpu,
-                    const String2GGMLType& tensor_types,
+                    const String2TensorStorage& tensor_storage_map,
                    const std::string prefix,
                    bool decoder_only = true,
                    SDVersion version = VERSION_SD1)
        : decode_only(decoder_only),
          taesd(decoder_only, version),
          GGMLRunner(backend, offload_params_to_cpu) {
-        taesd.init(params_ctx, tensor_types, prefix);
-    }
-
-    void enable_conv2d_direct() {
-        std::vector<GGMLBlock*> blocks;
-        taesd.get_all_blocks(blocks);
-        for (auto block : blocks) {
-            if (block->get_desc() == "Conv2d") {
-                auto conv_block = (Conv2d*)block;
-                conv_block->enable_direct();
-            }
-        }
+        taesd.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
@ -233,7 +222,7 @@ struct TinyAutoEncoder : public GGMLRunner {
        }

        ModelLoader model_loader;
-        if (!model_loader.init_from_file(file_path)) {
+        if (!model_loader.init_from_file_and_convert_name(file_path)) {
            LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
            return false;
        }
@ -252,7 +241,8 @@ struct TinyAutoEncoder : public GGMLRunner {
    struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
        struct ggml_cgraph* gf  = ggml_new_graph(compute_ctx);
        z                       = to_backend(z);
-        struct ggml_tensor* out = decode_graph ? taesd.decode(compute_ctx, z) : taesd.encode(compute_ctx, z);
+        auto runner_ctx         = get_context();
+        struct ggml_tensor* out = decode_graph ? taesd.decode(&runner_ctx, z) : taesd.encode(&runner_ctx, z);
        ggml_build_forward_expand(gf, out);
        return gf;
    }
--- a/otherarch/sdcpp/unet.hpp
+++ b/otherarch/sdcpp/unet.hpp
@ -20,9 +20,10 @@ public:
                            int64_t d_head,
                            int64_t depth,
                            int64_t context_dim,
+                            bool use_linear,
                            int64_t time_depth            = 1,
                            int64_t max_time_embed_period = 10000)
-        : SpatialTransformer(in_channels, n_head, d_head, depth, context_dim),
+        : SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear),
          max_time_embed_period(max_time_embed_period) {
        // use_linear is now a constructor argument forwarded to SpatialTransformer, so it is no longer always false (NOTE(review): confirm whether the linear -> conv2d 1x1 weight conversion at load time still applies)
        // use_spatial_context is always True
@ -60,8 +61,7 @@ public:
        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* context,
                                int timesteps) {
@ -92,7 +92,7 @@ public:
        auto time_context    = context;  // [b*t, n_context, context_dim]
        auto spatial_context = context;
        // time_context_first_timestep = time_context[::timesteps]
-        auto time_context_first_timestep = ggml_view_3d(ctx,
+        auto time_context_first_timestep = ggml_view_3d(ctx->ggml_ctx,
                                                        time_context,
                                                        time_context->ne[0],
                                                        time_context->ne[1],
@ -100,26 +100,26 @@ public:
                                                        time_context->nb[1],
                                                        time_context->nb[2],
                                                        0);  // [b, n_context, context_dim]
-        time_context                     = ggml_new_tensor_3d(ctx, GGML_TYPE_F32,
+        time_context                     = ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32,
                                                              time_context_first_timestep->ne[0],
                                                              time_context_first_timestep->ne[1],
                                                              time_context_first_timestep->ne[2] * h * w);
-        time_context                     = ggml_repeat(ctx, time_context_first_timestep, time_context);  // [b*h*w, n_context, context_dim]
+        time_context                     = ggml_repeat(ctx->ggml_ctx, time_context_first_timestep, time_context);  // [b*h*w, n_context, context_dim]

        x = norm->forward(ctx, x);
        x = proj_in->forward(ctx, x);  // [N, inner_dim, h, w]

-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
-        x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n);      // [N, h * w, inner_dim]
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
+        x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, inner_dim]

-        auto num_frames = ggml_arange(ctx, 0, timesteps, 1);
+        auto num_frames = ggml_arange(ctx->ggml_ctx, 0, timesteps, 1);
        // since b is 1, no need to do repeat
-        auto t_emb = ggml_ext_timestep_embedding(ctx, num_frames, in_channels, max_time_embed_period);  // [N, in_channels]
+        auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, num_frames, in_channels, max_time_embed_period);  // [N, in_channels]

        auto emb = time_pos_embed_0->forward(ctx, t_emb);
-        emb      = ggml_silu_inplace(ctx, emb);
-        emb      = time_pos_embed_2->forward(ctx, emb);                   // [N, in_channels]
-        emb      = ggml_reshape_3d(ctx, emb, emb->ne[0], 1, emb->ne[1]);  // [N, 1, in_channels]
+        emb      = ggml_silu_inplace(ctx->ggml_ctx, emb);
+        emb      = time_pos_embed_2->forward(ctx, emb);                             // [N, in_channels]
+        emb      = ggml_reshape_3d(ctx->ggml_ctx, emb, emb->ne[0], 1, emb->ne[1]);  // [N, 1, in_channels]

        for (int i = 0; i < depth; i++) {
            std::string transformer_name = "transformer_blocks." + std::to_string(i);
@ -128,11 +128,11 @@ public:
            auto block     = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[transformer_name]);
            auto mix_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[time_stack_name]);

-            x = block->forward(ctx, backend, x, spatial_context);  // [N, h * w, inner_dim]
+            x = block->forward(ctx, x, spatial_context);  // [N, h * w, inner_dim]

            // in_channels == inner_dim
            auto x_mix = x;
-            x_mix      = ggml_add(ctx, x_mix, emb);  // [N, h * w, inner_dim]
+            x_mix      = ggml_add(ctx->ggml_ctx, x_mix, emb);  // [N, h * w, inner_dim]

            int64_t N = x_mix->ne[2];
            int64_t T = timesteps;
@ -140,26 +140,26 @@ public:
            int64_t S = x_mix->ne[1];
            int64_t C = x_mix->ne[0];

-            x_mix = ggml_reshape_4d(ctx, x_mix, C, S, T, B);               // (b t) s c -> b t s c
-            x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3));  // b t s c -> b s t c
-            x_mix = ggml_reshape_3d(ctx, x_mix, C, T, S * B);              // b s t c -> (b s) t c
+            x_mix = ggml_reshape_4d(ctx->ggml_ctx, x_mix, C, S, T, B);                         // (b t) s c -> b t s c
+            x_mix = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x_mix, 0, 2, 1, 3));  // b t s c -> b s t c
+            x_mix = ggml_reshape_3d(ctx->ggml_ctx, x_mix, C, T, S * B);                        // b s t c -> (b s) t c

-            x_mix = mix_block->forward(ctx, backend, x_mix, time_context);  // [B * h * w, T, inner_dim]
+            x_mix = mix_block->forward(ctx, x_mix, time_context);  // [B * h * w, T, inner_dim]

-            x_mix = ggml_reshape_4d(ctx, x_mix, C, T, S, B);               // (b s) t c -> b s t c
-            x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3));  // b s t c -> b t s c
-            x_mix = ggml_reshape_3d(ctx, x_mix, C, S, T * B);              // b t s c -> (b t) s c
+            x_mix = ggml_reshape_4d(ctx->ggml_ctx, x_mix, C, T, S, B);                         // (b s) t c -> b s t c
+            x_mix = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x_mix, 0, 2, 1, 3));  // b s t c -> b t s c
+            x_mix = ggml_reshape_3d(ctx->ggml_ctx, x_mix, C, S, T * B);                        // b t s c -> (b t) s c

            x = time_mixer->forward(ctx, x, x_mix);  // [N, h * w, inner_dim]
        }

-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
-        x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n);       // [N, inner_dim, h, w]
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
+        x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, inner_dim, h, w]

        // proj_out
        x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]

-        x = ggml_add(ctx, x, x_in);
+        x = ggml_add(ctx->ggml_ctx, x, x_in);
        return x;
    }
 };
@ -179,17 +179,20 @@ protected:
    int num_heads                          = 8;
    int num_head_channels                  = -1;   // channels // num_heads
    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+    bool use_linear_projection             = false;
+    bool tiny_unet                         = false;

 public:
    int model_channels  = 320;
    int adm_in_channels = 2816;  // only for VERSION_SDXL/SVD

-    UnetModelBlock(SDVersion version = VERSION_SD1, const String2GGMLType& tensor_types = {}, bool flash_attn = false)
+    UnetModelBlock(SDVersion version = VERSION_SD1, const String2TensorStorage& tensor_storage_map = {})
        : version(version) {
        if (sd_version_is_sd2(version)) {
-            context_dim       = 1024;
-            num_head_channels = 64;
-            num_heads         = -1;
+            context_dim           = 1024;
+            num_head_channels     = 64;
+            num_heads             = -1;
+            use_linear_projection = true;
        } else if (sd_version_is_sdxl(version)) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
@ -197,22 +200,26 @@ public:
            transformer_depth     = {1, 2, 10};
            num_head_channels     = 64;
            num_heads             = -1;
+            use_linear_projection = true;
        } else if (version == VERSION_SVD) {
-            in_channels       = 8;
-            out_channels      = 4;
-            context_dim       = 1024;
-            adm_in_channels   = 768;
-            num_head_channels = 64;
-            num_heads         = -1;
-        } else if (version == VERSION_SD1_TINY_UNET) {
-            num_res_blocks = 1;
-            channel_mult   = {1, 2, 4};
+            in_channels           = 8;
+            out_channels          = 4;
+            context_dim           = 1024;
+            adm_in_channels       = 768;
+            num_head_channels     = 64;
+            num_heads             = -1;
+            use_linear_projection = true;
        }
        if (sd_version_is_inpaint(version)) {
            in_channels = 9;
        } else if (sd_version_is_unet_edit(version)) {
            in_channels = 8;
        }
+        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET) {
+            num_res_blocks = 1;
+            channel_mult   = {1, 2, 4};
+            tiny_unet      = true;
+        }

        // dims is always 2
        // use_temporal_attention is always True for SVD
@ -250,9 +257,9 @@ public:
                                       int64_t depth,
                                       int64_t context_dim) -> SpatialTransformer* {
            if (version == VERSION_SVD) {
-                return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim);
+                return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
            } else {
-                return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn);
+                return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
            }
        };

@ -286,7 +293,7 @@ public:
                                                                                  context_dim));
                }
                input_block_chans.push_back(ch);
-                if (version == VERSION_SD1_TINY_UNET) {
+                if (tiny_unet) {
                    input_block_idx++;
                }
            }
@ -307,7 +314,7 @@ public:
            d_head = num_head_channels;
            n_head = ch / d_head;
        }
-        if (version != VERSION_SD1_TINY_UNET) {
+        if (!tiny_unet) {
            blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
            if (version != VERSION_SDXL_SSD1B) {
                blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
@ -354,7 +361,7 @@ public:
                }

                if (i > 0 && j == num_res_blocks) {
-                    if (version == VERSION_SD1_TINY_UNET) {
+                    if (tiny_unet) {
                        output_block_idx++;
                        if (output_block_idx == 2) {
                            up_sample_idx = 1;
@ -377,7 +384,7 @@ public:
    }

    struct ggml_tensor* resblock_forward(std::string name,
-                                         struct ggml_context* ctx,
+                                         GGMLRunnerContext* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb,
                                         int num_video_frames) {
@ -393,24 +400,22 @@ public:
    }

    struct ggml_tensor* attention_layer_forward(std::string name,
-                                                struct ggml_context* ctx,
-                                                ggml_backend_t backend,
+                                                GGMLRunnerContext* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context,
                                                int timesteps) {
        if (version == VERSION_SVD) {
            auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]);

-            return block->forward(ctx, backend, x, context, timesteps);
+            return block->forward(ctx, x, context, timesteps);
        } else {
            auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);

-            return block->forward(ctx, backend, x, context);
+            return block->forward(ctx, x, context);
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
-                                ggml_backend_t backend,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* timesteps,
                                struct ggml_tensor* context,
@ -427,20 +432,20 @@ public:
        // return: [N, out_channels, h, w]
        if (context != nullptr) {
            if (context->ne[2] != x->ne[3]) {
-                context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
+                context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }

        if (c_concat != nullptr) {
            if (c_concat->ne[3] != x->ne[3]) {
-                c_concat = ggml_repeat(ctx, c_concat, x);
+                c_concat = ggml_repeat(ctx->ggml_ctx, c_concat, x);
            }
-            x = ggml_concat(ctx, x, c_concat, 2);
+            x = ggml_concat(ctx->ggml_ctx, x, c_concat, 2);
        }

        if (y != nullptr) {
            if (y->ne[1] != x->ne[3]) {
-                y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
+                y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }

@ -451,10 +456,10 @@ public:
        auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out.0"]);
        auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);

-        auto t_emb = ggml_ext_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]
+        auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels);  // [N, model_channels]

        auto emb = time_embed_0->forward(ctx, t_emb);
-        emb      = ggml_silu_inplace(ctx, emb);
+        emb      = ggml_silu_inplace(ctx->ggml_ctx, emb);
        emb      = time_embed_2->forward(ctx, emb);  // [N, time_embed_dim]

        // SDXL/SVD
@ -463,10 +468,10 @@ public:
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);

            auto label_emb = label_embed_0->forward(ctx, y);
-            label_emb      = ggml_silu_inplace(ctx, label_emb);
+            label_emb      = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
            label_emb      = label_embed_2->forward(ctx, label_emb);  // [N, time_embed_dim]

-            emb = ggml_add(ctx, emb, label_emb);  // [N, time_embed_dim]
+            emb = ggml_add(ctx->ggml_ctx, emb, label_emb);  // [N, time_embed_dim]
        }

        // input_blocks
@ -489,11 +494,11 @@ public:
                h                = resblock_forward(name, ctx, h, emb, num_video_frames);  // [N, mult*model_channels, h, w]
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                    h                = attention_layer_forward(name, ctx, backend, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
+                    h                = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
                }
                hs.push_back(h);
            }
-            if (version == VERSION_SD1_TINY_UNET) {
+            if (tiny_unet) {
                input_block_idx++;
            }
            if (i != len_mults - 1) {
@ -510,16 +515,16 @@ public:
        // [N, 4*model_channels, h/8, w/8]

        // middle_block
-        if (version != VERSION_SD1_TINY_UNET) {
+        if (!tiny_unet) {
            h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
            if (version != VERSION_SDXL_SSD1B) {
-                h = attention_layer_forward("middle_block.1", ctx, backend, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
-                h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);                      // [N, 4*model_channels, h/8, w/8]
+                h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
+                h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
            }
        }
        if (controls.size() > 0) {
-            auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
-            h       = ggml_add(ctx, h, cs);  // middle control
+            auto cs = ggml_scale_inplace(ctx->ggml_ctx, controls[controls.size() - 1], control_strength);
+            h       = ggml_add(ctx->ggml_ctx, h, cs);  // middle control
        }
        int control_offset = controls.size() - 2;

@ -531,12 +536,12 @@ public:
                hs.pop_back();

                if (controls.size() > 0) {
-                    auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
-                    h_skip  = ggml_add(ctx, h_skip, cs);  // control net condition
+                    auto cs = ggml_scale_inplace(ctx->ggml_ctx, controls[control_offset], control_strength);
+                    h_skip  = ggml_add(ctx->ggml_ctx, h_skip, cs);  // control net condition
                    control_offset--;
                }

-                h = ggml_concat(ctx, h, h_skip, 2);
+                h = ggml_concat(ctx->ggml_ctx, h, h_skip, 2);

                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";

@ -546,13 +551,13 @@ public:
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";

-                    h = attention_layer_forward(name, ctx, backend, h, context, num_video_frames);
+                    h = attention_layer_forward(name, ctx, h, context, num_video_frames);

                    up_sample_idx++;
                }

                if (i > 0 && j == num_res_blocks) {
-                    if (version == VERSION_SD1_TINY_UNET) {
+                    if (tiny_unet) {
                        output_block_idx++;
                        if (output_block_idx == 2) {
                            up_sample_idx = 1;
@ -572,7 +577,7 @@ public:

        // out
        h = out_0->forward(ctx, h);
-        h = ggml_silu_inplace(ctx, h);
+        h = ggml_silu_inplace(ctx->ggml_ctx, h);
        h = out_2->forward(ctx, h);
        ggml_set_name(h, "bench-end");
        return h;  // [N, out_channels, h, w]
@ -584,24 +589,11 @@ struct UNetModelRunner : public GGMLRunner {

    UNetModelRunner(ggml_backend_t backend,
                    bool offload_params_to_cpu,
-                    const String2GGMLType& tensor_types,
+                    const String2TensorStorage& tensor_storage_map,
                    const std::string prefix,
-                    SDVersion version = VERSION_SD1,
-                    bool flash_attn   = false)
-        : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_types, flash_attn) {
-        unet.init(params_ctx, tensor_types, prefix);
-    }
-
-    void enable_conv2d_direct() {
-        std::vector<GGMLBlock*> blocks;
-        unet.get_all_blocks(blocks);
-        for (auto block : blocks) {
-            if (block->get_desc() == "Conv2d") {
-                LOG_DEBUG("block %s", block->get_desc().c_str());
-                auto conv_block = (Conv2d*)block;
-                conv_block->enable_direct();
-            }
-        }
+                    SDVersion version = VERSION_SD1)
+        : GGMLRunner(backend, offload_params_to_cpu), unet(version, tensor_storage_map) {
+        unet.init(params_ctx, tensor_storage_map, prefix);
    }

    std::string get_desc() override {
@ -636,8 +628,9 @@ struct UNetModelRunner : public GGMLRunner {
            controls[i] = to_backend(controls[i]);
        }

-        struct ggml_tensor* out = unet.forward(compute_ctx,
-                                               runtime_backend,
+        auto runner_ctx = get_context();
+
+        struct ggml_tensor* out = unet.forward(&runner_ctx,
                                               x,
                                               timesteps,
                                               context,
--- a/otherarch/sdcpp/upscaler.cpp
+++ b/otherarch/sdcpp/upscaler.cpp
@ -42,7 +42,7 @@ struct UpscalerGGML {
        backend = ggml_backend_sycl_init(0);
 #endif
        ModelLoader model_loader;
-        if (!model_loader.init_from_file(esrgan_path)) {
+        if (!model_loader.init_from_file_and_convert_name(esrgan_path)) {
            LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str());
        }
        model_loader.set_wtype_override(model_data_type);
@ -51,9 +51,9 @@ struct UpscalerGGML {
            backend = ggml_backend_cpu_init();
        }
        LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
-        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.tensor_storages_types);
+        esrgan_upscaler = std::make_shared<ESRGAN>(backend, offload_params_to_cpu, model_loader.get_tensor_storage_map());
        if (direct) {
-            esrgan_upscaler->enable_conv2d_direct();
+            esrgan_upscaler->set_conv2d_direct_enabled(true);
        }
        if (!esrgan_upscaler->load_from_file(esrgan_path, n_threads)) {
            return false;
--- a/otherarch/sdcpp/util.cpp
+++ b/otherarch/sdcpp/util.cpp
@ -175,6 +175,12 @@ int32_t sd_get_num_physical_cores() {
 static sd_progress_cb_t sd_progress_cb = nullptr;
 void* sd_progress_cb_data              = nullptr;

+static sd_preview_cb_t sd_preview_cb = nullptr;
+preview_t sd_preview_mode            = PREVIEW_NONE;
+int sd_preview_interval              = 1;
+bool sd_preview_denoised             = true;
+bool sd_preview_noisy                = false;
+
 std::u32string utf8_to_utf32(const std::string& utf8_str) {
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return converter.from_bytes(utf8_str);
@ -348,6 +354,37 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
    sd_progress_cb      = cb;
    sd_progress_cb_data = data;
 }
+void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1, bool denoised = true, bool noisy = false) {  // NOTE(review): defaults written on a definition only apply to calls later in this translation unit — confirm what the header declares
+    sd_preview_cb       = cb;        // read back through sd_get_preview_callback()
+    sd_preview_mode     = mode;      // read back through sd_get_preview_mode()
+    sd_preview_interval = interval;  // read back through sd_get_preview_interval(); stored as given, no range check here
+    sd_preview_denoised = denoised;  // read back through sd_should_preview_denoised()
+    sd_preview_noisy    = noisy;     // read back through sd_should_preview_noisy()
+}
+
+sd_preview_cb_t sd_get_preview_callback() {
+    return sd_preview_cb;  // starts as nullptr; assigned by sd_set_preview_callback()
+}
+
+preview_t sd_get_preview_mode() {
+    return sd_preview_mode;  // starts as PREVIEW_NONE
+}
+int sd_get_preview_interval() {
+    return sd_preview_interval;  // starts as 1
+}
+bool sd_should_preview_denoised() {
+    return sd_preview_denoised;  // starts as true
+}
+bool sd_should_preview_noisy() {
+    return sd_preview_noisy;  // starts as false
+}
+
+sd_progress_cb_t sd_get_progress_callback() {
+    return sd_progress_cb;  // starts as nullptr; assigned by sd_set_progress_callback()
+}
+void* sd_get_progress_callback_data() {
+    return sd_progress_cb_data;  // the `data` pointer stored alongside the progress callback
+}
 const char* sd_get_system_info() {
    static char buffer[1024];
    std::stringstream ss;
--- a/otherarch/sdcpp/util.h
+++ b/otherarch/sdcpp/util.h
@ -52,6 +52,15 @@ std::string trim(const std::string& s);

 std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);

+sd_progress_cb_t sd_get_progress_callback();
+void* sd_get_progress_callback_data();
+
+sd_preview_cb_t sd_get_preview_callback();
+preview_t sd_get_preview_mode();
+int sd_get_preview_interval();
+bool sd_should_preview_denoised();
+bool sd_should_preview_noisy();
+
 void log_message(const char* format, ...);
 void set_sd_log_level(int log);
 bool get_sd_log_level();
--- a/otherarch/sdcpp/vae.hpp
+++ b/otherarch/sdcpp/vae.hpp
@ -30,7 +30,7 @@ public:
        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [N, in_channels, h, w]
        // t_emb is always None
        auto norm1 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm1"]);
@ -40,12 +40,12 @@ public:

        auto h = x;
        h      = norm1->forward(ctx, h);
-        h      = ggml_silu_inplace(ctx, h);  // swish
+        h      = ggml_silu_inplace(ctx->ggml_ctx, h);  // swish
        h      = conv1->forward(ctx, h);
        // return h;

        h = norm2->forward(ctx, h);
-        h = ggml_silu_inplace(ctx, h);  // swish
+        h = ggml_silu_inplace(ctx->ggml_ctx, h);  // swish
        // dropout, skip for inference
        h = conv2->forward(ctx, h);

@ -56,7 +56,7 @@ public:
            x = nin_shortcut->forward(ctx, x);  // [N, out_channels, h, w]
        }

-        h = ggml_add(ctx, h, x);
+        h = ggml_add(ctx->ggml_ctx, h, x);
        return h;  // [N, out_channels, h, w]
    }
 };
@ -64,25 +64,51 @@ public:
 class AttnBlock : public UnaryBlock {
 protected:
    int64_t in_channels;
+    bool use_linear;

-public:
-    AttnBlock(int64_t in_channels)
-        : in_channels(in_channels) {
-        blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["q"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
-        blocks["k"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
-        blocks["v"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
-
-        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {  // marked `override` like VideoResnetBlock::init_params; swaps q/k/v/proj_out when the stored weight rank disagrees with use_linear
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");  // look up this block's own output-projection weight
+        if (iter != tensor_storage_map.end()) {  // no entry: keep whatever the constructor built
+            if (iter->second.n_dims == 4 && use_linear) {  // 4-D weight but built as Linear -> rebuild as 1x1 Conv2d
+                use_linear         = false;
+                blocks["q"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["k"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["v"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {  // 2-D weight but built as Conv2d -> rebuild as Linear
+                use_linear         = true;
+                blocks["q"]        = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["k"]        = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["v"]        = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["proj_out"] = std::make_shared<Linear>(in_channels, in_channels);
+            }  // any other rank, or a rank that already matches: leave the blocks untouched
+        }
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+public:
+    AttnBlock(int64_t in_channels, bool use_linear)
+        : in_channels(in_channels), use_linear(use_linear) {
+        blocks["norm"] = std::make_shared<GroupNorm32>(in_channels);  // normalisation is the same in both modes
+        if (!use_linear) {  // convolutional projections (1x1 kernels)
+            blocks["q"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+            blocks["k"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+            blocks["v"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+            blocks["proj_out"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+        } else {  // linear projections
+            blocks["q"]        = std::make_shared<Linear>(in_channels, in_channels);
+            blocks["k"]        = std::make_shared<Linear>(in_channels, in_channels);
+            blocks["v"]        = std::make_shared<Linear>(in_channels, in_channels);
+            blocks["proj_out"] = std::make_shared<Linear>(in_channels, in_channels);
+        }
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [N, in_channels, h, w]
        auto norm     = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
-        auto q_proj   = std::dynamic_pointer_cast<Conv2d>(blocks["q"]);
-        auto k_proj   = std::dynamic_pointer_cast<Conv2d>(blocks["k"]);
-        auto v_proj   = std::dynamic_pointer_cast<Conv2d>(blocks["v"]);
-        auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
+        auto q_proj   = std::dynamic_pointer_cast<UnaryBlock>(blocks["q"]);
+        auto k_proj   = std::dynamic_pointer_cast<UnaryBlock>(blocks["k"]);
+        auto v_proj   = std::dynamic_pointer_cast<UnaryBlock>(blocks["v"]);
+        auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);

        auto h_ = norm->forward(ctx, x);

@ -91,25 +117,46 @@ public:
        const int64_t h = h_->ne[1];
        const int64_t w = h_->ne[0];

-        auto q = q_proj->forward(ctx, h_);                          // [N, in_channels, h, w]
-        q      = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3));  // [N, h, w, in_channels]
-        q      = ggml_reshape_3d(ctx, q, c, h * w, n);              // [N, h * w, in_channels]
+        ggml_tensor* q;
+        ggml_tensor* k;
+        ggml_tensor* v;
+        if (use_linear) {
+            h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 2, 0, 3));  // [N, h, w, in_channels]
+            h_ = ggml_reshape_3d(ctx->ggml_ctx, h_, c, h * w, n);                        // [N, h * w, in_channels]

-        auto k = k_proj->forward(ctx, h_);                          // [N, in_channels, h, w]
-        k      = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3));  // [N, h, w, in_channels]
-        k      = ggml_reshape_3d(ctx, k, c, h * w, n);              // [N, h * w, in_channels]
+            q = q_proj->forward(ctx, h_);  // [N, h * w, in_channels]
+            k = k_proj->forward(ctx, h_);  // [N, h * w, in_channels]
+            v = v_proj->forward(ctx, h_);  // [N, h * w, in_channels]

-        auto v = v_proj->forward(ctx, h_);              // [N, in_channels, h, w]
-        v      = ggml_reshape_3d(ctx, v, h * w, c, n);  // [N, in_channels, h * w]
+            v = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, v, 1, 0, 2, 3));  // [N, in_channels, h * w]
+        } else {
+            q = q_proj->forward(ctx, h_);                                              // [N, in_channels, h, w]
+            q = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, q, 1, 2, 0, 3));  // [N, h, w, in_channels]
+            q = ggml_reshape_3d(ctx->ggml_ctx, q, c, h * w, n);                        // [N, h * w, in_channels]

-        h_ = ggml_ext_attention(ctx, q, k, v, false);  // [N, h * w, in_channels]
+            k = k_proj->forward(ctx, h_);                                              // [N, in_channels, h, w]
+            k = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, k, 1, 2, 0, 3));  // [N, h, w, in_channels]
+            k = ggml_reshape_3d(ctx->ggml_ctx, k, c, h * w, n);                        // [N, h * w, in_channels]

-        h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3));  // [N, in_channels, h * w]
-        h_ = ggml_reshape_4d(ctx, h_, w, h, c, n);               // [N, in_channels, h, w]
+            v = v_proj->forward(ctx, h_);                        // [N, in_channels, h, w]
+            v = ggml_reshape_3d(ctx->ggml_ctx, v, h * w, c, n);  // [N, in_channels, h * w]
+        }

-        h_ = proj_out->forward(ctx, h_);  // [N, in_channels, h, w]
+        h_ = ggml_ext_attention(ctx->ggml_ctx, q, k, v, false);  // [N, h * w, in_channels]

-        h_ = ggml_add(ctx, h_, x);
+        if (use_linear) {
+            h_ = proj_out->forward(ctx, h_);  // [N, h * w, in_channels]
+
+            h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3));  // [N, in_channels, h * w]
+            h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n);                         // [N, in_channels, h, w]
+        } else {
+            h_ = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, h_, 1, 0, 2, 3));  // [N, in_channels, h * w]
+            h_ = ggml_reshape_4d(ctx->ggml_ctx, h_, w, h, c, n);                         // [N, in_channels, h, w]
+
+            h_ = proj_out->forward(ctx, h_);  // [N, in_channels, h, w]
+        }
+
+        h_ = ggml_add(ctx->ggml_ctx, h_, x);
        return h_;
    }
 };
@ -133,7 +180,7 @@ public:
                                                                             kernel_padding));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx,
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
                                struct ggml_tensor* x) override {
        // timesteps always None
        // skip_video always False
@ -152,19 +199,19 @@ public:
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

-        x = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
-        x = time_mix_conv->forward(ctx, x);                    // [B, OC, T, OH * OW]
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
-        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w
-        return x;                                              // [B*T, OC, OH, OW]
+        x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B);                     // (b t) c h w -> b t c (h w)
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
+        x = time_mix_conv->forward(ctx, x);                                        // [B, OC, T, OH * OW]
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
+        x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B);                     // b t c (h w) -> (b t) c h w
+        return x;                                                                  // [B*T, OC, OH, OW]
    }
 };

 class VideoResnetBlock : public ResnetBlock {
 protected:
-    void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override {
-        enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_types, GGML_TYPE_F32);
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        enum ggml_type wtype = get_type(prefix + "mix_factor", tensor_storage_map, GGML_TYPE_F32);
        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
    }

@ -182,7 +229,7 @@ public:
        blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
    }

-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override {
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
        // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w]
        // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w]
        // t_emb is always None
@ -199,19 +246,19 @@ public:
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

-        x          = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
-        x          = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
+        x          = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B);                     // (b t) c h w -> b t c (h w)
+        x          = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        auto x_mix = x;

        x = time_stack->forward(ctx, x);  // b t c (h w)

        float alpha = get_alpha();
-        x           = ggml_add(ctx,
-                               ggml_scale(ctx, x, alpha),
-                               ggml_scale(ctx, x_mix, 1.0f - alpha));
+        x           = ggml_add(ctx->ggml_ctx,
+                               ggml_scale(ctx->ggml_ctx, x, alpha),
+                               ggml_scale(ctx->ggml_ctx, x_mix, 1.0f - alpha));

-        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
-        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
+        x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B);                     // b t c (h w) -> (b t) c h w

        return x;
    }
@ -233,7 +280,8 @@ public:
            int num_res_blocks,
            int in_channels,
            int z_channels,
-            bool double_z = true)
+            bool double_z              = true,
+            bool use_linear_projection = false)
        : ch(ch),
          ch_mult(ch_mult),
          num_res_blocks(num_res_blocks),
@ -264,14 +312,14 @@ public:
        }

        blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
-        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
+        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, use_linear_projection));
        blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));

        blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
        blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
    }

-    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]

        auto conv_in     = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
@ -307,8 +355,8 @@ public:

        // end
        h = norm_out->forward(ctx, h);
-        h = ggml_silu_inplace(ctx, h);  // nonlinearity/swish
-        h = conv_out->forward(ctx, h);  // [N, z_channels*2, h, w]
+        h = ggml_silu_inplace(ctx->ggml_ctx, h);  // nonlinearity/swish
+        h = conv_out->forward(ctx, h);            // [N, z_channels*2, h, w]
        return h;
    }
 };
@ -351,8 +399,9 @@ public:
            std::vector<int> ch_mult,
            int num_res_blocks,
            int z_channels,
-            bool video_decoder    = false,
-            int video_kernel_size = 3)
+            bool use_linear_projection = false,
+            bool video_decoder         = false,
+            int video_kernel_size      = 3)
        : ch(ch),
          out_ch(out_ch),
          ch_mult(ch_mult),
@ -366,7 +415,7 @@ public:
        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));

        blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
-        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
+        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, use_linear_projection));
        blocks["mid.block_2"] = get_resnet_block(block_in, block_in);

        for (int i = num_resolutions - 1; i >= 0; i--) {
@ -388,7 +437,7 @@ public:
        blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
    }

-    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
        // z: [N, z_channels, h, w]
        // alpha is always 0
        // merge_strategy is always learned
@ -429,8 +478,8 @@ public:
        }

        h = norm_out->forward(ctx, h);
-        h = ggml_silu_inplace(ctx, h);  // nonlinearity/swish
-        h = conv_out->forward(ctx, h);  // [N, out_ch, h*8, w*8]
+        h = ggml_silu_inplace(ctx->ggml_ctx, h);  // nonlinearity/swish
+        h = conv_out->forward(ctx, h);            // [N, out_ch, h*8, w*8]
        return h;
    }
 };
@ -454,9 +503,10 @@ protected:
    } dd_config;

 public:
-    AutoencodingEngine(bool decode_only       = true,
-                       bool use_video_decoder = false,
-                       SDVersion version      = VERSION_SD1)
+    AutoencodingEngine(SDVersion version          = VERSION_SD1,
+                       bool decode_only           = true,
+                       bool use_linear_projection = false,
+                       bool use_video_decoder     = false)
        : decode_only(decode_only), use_video_decoder(use_video_decoder) {
        if (sd_version_is_dit(version)) {
            dd_config.z_channels = 16;
@ -470,6 +520,7 @@ public:
                                                                   dd_config.ch_mult,
                                                                   dd_config.num_res_blocks,
                                                                   dd_config.z_channels,
+                                                                   use_linear_projection,
                                                                   use_video_decoder));
        if (use_quant) {
            blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
@ -482,7 +533,8 @@ public:
                                                                       dd_config.num_res_blocks,
                                                                       dd_config.in_channels,
                                                                       dd_config.z_channels,
-                                                                       dd_config.double_z));
+                                                                       dd_config.double_z,
+                                                                       use_linear_projection));
            if (use_quant) {
                int factor = dd_config.double_z ? 2 : 1;

@ -493,7 +545,7 @@ public:
        }
    }

-    struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
+    struct ggml_tensor* decode(GGMLRunnerContext* ctx, struct ggml_tensor* z) {
        // z: [N, z_channels, h, w]
        if (use_quant) {
            auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
@ -507,7 +559,7 @@ public:
        return h;
    }

-    struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
+    struct ggml_tensor* encode(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]
        auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);

@ -529,7 +581,6 @@ struct VAE : public GGMLRunner {
                         struct ggml_tensor** output,
                         struct ggml_context* output_ctx)                                                         = 0;
    virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
-    virtual void enable_conv2d_direct(){};
    virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
 };

@ -563,24 +614,26 @@ struct AutoEncoderKL : public VAE {

    AutoEncoderKL(ggml_backend_t backend,
                  bool offload_params_to_cpu,
-                  const String2GGMLType& tensor_types,
+                  const String2TensorStorage& tensor_storage_map,
                  const std::string prefix,
                  bool decode_only       = false,
                  bool use_video_decoder = false,
                  SDVersion version      = VERSION_SD1)
-        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), VAE(backend, offload_params_to_cpu) {
-        ae.init(params_ctx, tensor_types, prefix);
-    }
-
-    void enable_conv2d_direct() override {
-        std::vector<GGMLBlock*> blocks;
-        ae.get_all_blocks(blocks);
-        for (auto block : blocks) {
-            if (block->get_desc() == "Conv2d") {
-                auto conv_block = (Conv2d*)block;
-                conv_block->enable_direct();
+        : decode_only(decode_only), VAE(backend, offload_params_to_cpu) {
+        bool use_linear_projection = false;
+        for (const auto& [name, tensor_storage] : tensor_storage_map) {
+            if (!starts_with(name, prefix)) {
+                continue;
+            }
+            if (ends_with(name, "attn_1.proj_out.weight")) {
+                if (tensor_storage.n_dims == 2) {
+                    use_linear_projection = true;
+                }
+                break;
            }
        }
+        ae = AutoencodingEngine(version, decode_only, use_linear_projection, use_video_decoder);
+        ae.init(params_ctx, tensor_storage_map, prefix);
    }

    void set_conv2d_scale(float scale) override {
@ -607,7 +660,9 @@ struct AutoEncoderKL : public VAE {

        z = to_backend(z);

-        struct ggml_tensor* out = decode_graph ? ae.decode(compute_ctx, z) : ae.encode(compute_ctx, z);
+        auto runner_ctx = get_context();
+
+        struct ggml_tensor* out = decode_graph ? ae.decode(&runner_ctx, z) : ae.encode(&runner_ctx, z);

        ggml_build_forward_expand(gf, out);

--- a/otherarch/sdcpp/wan.hpp
+++ b/otherarch/sdcpp/wan.hpp