initial files from sdcpp (not working)

Concedo 2024-02-28 15:45:13 +08:00
parent ad638285de
commit 26696970ce
33 changed files with 582497 additions and 4 deletions

Makefile

@@ -39,8 +39,8 @@ endif
 #
 # keep standard at C11 and C++11
-CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
-CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
+CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
+CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
 LDFLAGS =
 FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
@@ -496,12 +496,14 @@ gpttype_adapter_vulkan_noavx2.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) $(VULKAN_FLAGS) -c $< -o $@
 clean:
-	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
+	rm -vf *.o main sdmain quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
 # useful tools
 main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo '==== Run ./main -h for help. ===='
+sdmain: otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 imatrix: examples/imatrix/imatrix.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)

ggml.h

@@ -229,7 +229,7 @@
 #define GGML_MAX_CONTEXTS 64
 #define GGML_MAX_SRC 10
 #ifndef GGML_MAX_NAME
-#define GGML_MAX_NAME 64
+#define GGML_MAX_NAME 128
 #endif
 #define GGML_MAX_OP_PARAMS 64
 #define GGML_DEFAULT_N_THREADS 4
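A plausible reason for the bump (hedged, not stated in the diff): stable-diffusion checkpoint tensor names routinely exceed 64 characters. An illustrative name, typical of SD UNet checkpoints and not taken from this commit:

// e.g. a typical SD UNet tensor name no longer fits the old 64-byte limit:
const char* name = "model.diffusion_model.input_blocks.1.1.transformer_blocks.0.attn1.to_k.weight";
// strlen(name) == 77, which overflows the previous GGML_MAX_NAME of 64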

otherarch/sdcpp/CMakeLists.txt

@@ -0,0 +1,95 @@
cmake_minimum_required(VERSION 3.12)
project("stable-diffusion")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(SD_STANDALONE ON)
else()
set(SD_STANDALONE OFF)
endif()
#
# Option list
#
# general
option(SD_CUBLAS "sd: cuda backend" OFF)
option(SD_HIPBLAS "sd: rocm backend" OFF)
option(SD_METAL "sd: metal backend" OFF)
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, non-deterministic (the same seed may not generate the same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
endif()
if(SD_METAL)
message("Use Metal as backend stable-diffusion")
set(GGML_METAL ON)
add_definitions(-DSD_USE_METAL)
endif()
if (SD_HIPBLAS)
message("Use HIPBLAS as backend stable-diffusion")
set(GGML_HIPBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
if(SD_FAST_SOFTMAX)
set(GGML_CUDA_FAST_SOFTMAX ON)
endif()
endif ()
if(SD_FLASH_ATTN)
message("Use Flash Attention for memory optimization")
add_definitions(-DSD_USE_FLASH_ATTENTION)
endif()
set(SD_LIB stable-diffusion)
file(GLOB SD_LIB_SOURCES
"*.h"
"*.cpp"
"*.hpp"
)
# we can only build one shared lib
if(SD_BUILD_SHARED_LIBS)
message("Build shared library")
set(BUILD_SHARED_LIBS OFF)
message(${SD_LIB_SOURCES})
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
add_definitions(-DSD_BUILD_SHARED_LIB)
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else()
message("Build static library")
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
endif()
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
# deps
add_subdirectory(ggml)
add_subdirectory(thirdparty)
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
add_subdirectory(examples)

otherarch/sdcpp/LICENSE

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 leejet
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

otherarch/sdcpp/clip.hpp

File diff suppressed because it is too large

otherarch/sdcpp/common.hpp

@@ -0,0 +1,529 @@
#ifndef __COMMON_HPP__
#define __COMMON_HPP__
#include "ggml_extend.hpp"
class DownSampleBlock : public GGMLBlock {
protected:
int channels;
int out_channels;
bool vae_downsample;
public:
DownSampleBlock(int channels,
int out_channels,
bool vae_downsample = false)
: channels(channels),
out_channels(out_channels),
vae_downsample(vae_downsample) {
if (vae_downsample) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
} else {
blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
if (vae_downsample) {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_pad(ctx, x, 1, 1, 0, 0);
x = conv->forward(ctx, x);
} else {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
x = conv->forward(ctx, x);
}
return x; // [N, out_channels, h/2, w/2]
}
};
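// A hedged sketch of the conv output-size arithmetic behind the two branches
// above (standard convolution formula assumed; the helper name is illustrative):
// the "op" path uses symmetric padding 1, while the VAE path pads only the
// right/bottom edge by 1 via ggml_pad, yet both halve even spatial dims.
constexpr int conv_out(int in, int pad_total, int kernel, int stride) {
    return (in + pad_total - kernel) / stride + 1;
}
static_assert(conv_out(64, 2, 3, 2) == 32, "op: pad {1,1}, stride 2, kernel 3");
static_assert(conv_out(64, 1, 3, 2) == 32, "vae: ggml_pad right/bottom only");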
class UpSampleBlock : public GGMLBlock {
protected:
int channels;
int out_channels;
public:
UpSampleBlock(int channels,
int out_channels)
: channels(channels),
out_channels(out_channels) {
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2]
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
return x;
}
};
class ResBlock : public GGMLBlock {
protected:
// network hparams
int64_t channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
int64_t emb_channels; // time_embed_dim
int64_t out_channels; // mult * model_channels
std::pair<int, int> kernel_size;
int dims;
bool skip_t_emb;
bool exchange_temb_dims;
std::shared_ptr<GGMLBlock> conv_nd(int dims,
int64_t in_channels,
int64_t out_channels,
std::pair<int, int> kernel_size,
std::pair<int, int> padding) {
GGML_ASSERT(dims == 2 || dims == 3);
if (dims == 3) {
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
} else {
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
}
}
public:
ResBlock(int64_t channels,
int64_t emb_channels,
int64_t out_channels,
std::pair<int, int> kernel_size = {3, 3},
int dims = 2,
bool exchange_temb_dims = false,
bool skip_t_emb = false)
: channels(channels),
emb_channels(emb_channels),
out_channels(out_channels),
kernel_size(kernel_size),
dims(dims),
skip_t_emb(skip_t_emb),
exchange_temb_dims(exchange_temb_dims) {
std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
blocks["in_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
// in_layer_1 is nn.SILU()
blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
if (!skip_t_emb) {
// emb_layer_0 is nn.SILU()
blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
}
blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
// out_layer_1 is nn.SILU()
// out_layer_2 is nn.Dropout(), skip for inference
blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
if (out_channels != channels) {
blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
}
}
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
// For dims==3, we reduce the dimension from 5d to 4d by merging h and w, so that ggml itself does not need changes
// [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
// emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
auto in_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
auto in_layers_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
if (emb == NULL) {
GGML_ASSERT(skip_t_emb);
}
// in_layers
auto h = in_layers_0->forward(ctx, x);
h = ggml_silu_inplace(ctx, h);
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
// emb_layers
if (!skip_t_emb) {
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
auto emb_out = ggml_silu(ctx, emb);
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
if (dims == 2) {
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
} else {
emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
if (exchange_temb_dims) {
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
}
}
h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
// out_layers
h = out_layers_0->forward(ctx, h);
h = ggml_silu_inplace(ctx, h);
// dropout, skip for inference
h = out_layers_3->forward(ctx, h);
// skip connection
if (out_channels != channels) {
auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
h = ggml_add(ctx, h, x);
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
};
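// Descriptive note on the forward pass above: ignoring dropout, ResBlock
// computes h = conv(silu(norm(x))), injects the (optionally skipped) time
// embedding as a broadcast add, then returns skip(x) + conv(silu(norm(h))),
// i.e. the standard pre-norm residual block of the SD UNet.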
class GEGLU : public GGMLBlock {
protected:
int64_t dim_in;
int64_t dim_out;
void init_params(struct ggml_context* ctx, ggml_type wtype) {
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
params["proj.bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim_out * 2);
}
public:
GEGLU(int64_t dim_in, int64_t dim_out)
: dim_in(dim_in), dim_out(dim_out) {}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
struct ggml_tensor* w = params["proj.weight"];
struct ggml_tensor* b = params["proj.bias"];
auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out, ]
auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, dim_in]
auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
auto x_in = x;
x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
gate = ggml_gelu_inplace(ctx, gate);
x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
return x;
}
};
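// Hedged scalar sketch of what the GEGLU block above computes, per element of
// the two projection halves, assuming ggml_gelu is the usual tanh-approximate
// GELU (the constants below are the common approximation, not read from ggml):
//   GEGLU(x) = (x * W + b) * GELU(x * V + c), with [W; V] = halves of proj.weight
#include <cmath> // std::tanh for the sketch below
inline float gelu_approx(float v) {
    return 0.5f * v * (1.0f + std::tanh(0.7978845608f * (v + 0.044715f * v * v * v)));
}
inline float geglu_scalar(float value_half, float gate_half) {
    return value_half * gelu_approx(gate_half);
}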
class FeedForward : public GGMLBlock {
public:
FeedForward(int64_t dim,
int64_t dim_out,
int64_t mult = 4) {
int64_t inner_dim = dim * mult;
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
// net_1 is nn.Dropout(), skip for inference
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out]
auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
return x;
}
};
class CrossAttention : public GGMLBlock {
protected:
int64_t query_dim;
int64_t context_dim;
int64_t n_head;
int64_t d_head;
public:
CrossAttention(int64_t query_dim,
int64_t context_dim,
int64_t n_head,
int64_t d_head)
: n_head(n_head),
d_head(d_head),
query_dim(query_dim),
context_dim(context_dim) {
int64_t inner_dim = d_head * n_head;
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
// to_out_1 is nn.Dropout(), skip for inference
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
int64_t n = x->ne[2];
int64_t n_token = x->ne[1];
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
q = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, n); // [N, n_token, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
q = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * n); // [N * n_head, n_token, d_head]
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
k = ggml_reshape_4d(ctx, k, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_context, d_head]
k = ggml_reshape_3d(ctx, k, d_head, n_context, n_head * n); // [N * n_head, n_context, d_head]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
v = ggml_reshape_4d(ctx, v, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_context]
v = ggml_reshape_3d(ctx, v, n_context, d_head, n_head * n); // [N * n_head, d_head, n_context]
auto kqv = ggml_nn_attention(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]
x = ggml_reshape_3d(ctx, kqv, d_head * n_head, n_token, n); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x;
}
};
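// Hedged reference sketch of the scaled-dot-product attention that
// ggml_nn_attention (defined in ggml_extend.hpp) is assumed to implement:
// out = softmax(q @ k^T / sqrt(d_head)) @ v, single head, no mask.
#include <algorithm>
#include <vector>
inline std::vector<float> attention_ref(const std::vector<float>& q, // [n_token * d]
                                        const std::vector<float>& k, // [n_ctx * d]
                                        const std::vector<float>& v, // [n_ctx * d]
                                        int n_token, int n_ctx, int d) {
    std::vector<float> out(n_token * d, 0.0f);
    for (int i = 0; i < n_token; i++) {
        std::vector<float> w(n_ctx);
        float max_w = -1e30f, sum = 0.0f;
        for (int j = 0; j < n_ctx; j++) {
            float s = 0.0f;
            for (int c = 0; c < d; c++) s += q[i * d + c] * k[j * d + c];
            w[j] = s / std::sqrt((float)d);
            max_w = std::max(max_w, w[j]);
        }
        for (int j = 0; j < n_ctx; j++) { // numerically stable softmax
            w[j] = std::exp(w[j] - max_w);
            sum += w[j];
        }
        for (int j = 0; j < n_ctx; j++)
            for (int c = 0; c < d; c++) out[i * d + c] += (w[j] / sum) * v[j * d + c];
    }
    return out;
}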
class BasicTransformerBlock : public GGMLBlock {
protected:
int64_t n_head;
int64_t d_head;
bool ff_in;
public:
BasicTransformerBlock(int64_t dim,
int64_t n_head,
int64_t d_head,
int64_t context_dim,
bool ff_in = false)
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
// disable_self_attn is always False
// disable_temporal_crossattention is always False
// switch_temporal_ca_to_sa is always False
// inner_dim is always None or equal to dim
// gated_ff is always True
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
if (ff_in) {
blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
blocks["ff_in"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
// context: [N, n_context, context_dim]
// return: [N, n_token, query_dim]
auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
auto ff = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
if (ff_in) {
auto norm_in = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
auto ff_in = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);
auto x_skip = x;
x = norm_in->forward(ctx, x);
x = ff_in->forward(ctx, x);
// self.is_res is always True
x = ggml_add(ctx, x, x_skip);
}
auto r = x;
x = norm1->forward(ctx, x);
x = attn1->forward(ctx, x, x); // self-attention
x = ggml_add(ctx, x, r);
r = x;
x = norm2->forward(ctx, x);
x = attn2->forward(ctx, x, context); // cross-attention
x = ggml_add(ctx, x, r);
r = x;
x = norm3->forward(ctx, x);
x = ff->forward(ctx, x);
x = ggml_add(ctx, x, r);
return x;
}
};
class SpatialTransformer : public GGMLBlock {
protected:
int64_t in_channels; // mult * model_channels
int64_t n_head;
int64_t d_head;
int64_t depth = 1; // 1
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_2_x
public:
SpatialTransformer(int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim)
: in_channels(in_channels),
n_head(n_head),
d_head(d_head),
depth(depth),
context_dim(context_dim) {
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
// disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim));
}
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
}
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
auto x_in = x;
int64_t n = x->ne[3];
int64_t h = x->ne[1];
int64_t w = x->ne[0];
int64_t inner_dim = n_head * d_head;
x = norm->forward(ctx, x);
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
x = transformer_block->forward(ctx, x, context);
}
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
x = ggml_add(ctx, x, x_in);
return x;
}
};
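// Descriptive note: the permute/reshape pair above flattens [N, inner_dim, h, w]
// into token form [N, h*w, inner_dim] for the transformer blocks, then restores
// the spatial layout before proj_out; the final add makes the whole block residual.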
class AlphaBlender : public GGMLBlock {
protected:
void init_params(struct ggml_context* ctx, ggml_type wtype) {
params["mix_factor"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
}
float get_alpha() {
// image_only_indicator is always tensor([0.]) and mix_factor.shape is [1,],
// so learned_with_images behaves the same as learned
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha);
}
public:
AlphaBlender() {
// merge_strategy is always learned_with_images
// for inference, we don't need to set alpha
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x_spatial,
struct ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.])
float alpha = get_alpha();
auto x = ggml_add(ctx,
ggml_scale(ctx, x_spatial, alpha),
ggml_scale(ctx, x_temporal, 1.0f - alpha));
return x;
}
};
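// Hedged scalar view of the blend above: one learned logit (mix_factor) gates
// the spatial/temporal mix through a sigmoid (the sigmoid() helper used by
// get_alpha() is assumed to come from ggml_extend.hpp).
inline float alpha_blend_scalar(float x_spatial, float x_temporal, float mix_factor_logit) {
    float alpha = 1.0f / (1.0f + std::exp(-mix_factor_logit));
    return alpha * x_spatial + (1.0f - alpha) * x_temporal;
}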
class VideoResBlock : public ResBlock {
public:
VideoResBlock(int channels,
int emb_channels,
int out_channels,
std::pair<int, int> kernel_size = {3, 3},
int64_t video_kernel_size = 3,
int dims = 2) // always 2
: ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb,
int num_video_frames) {
// x: [N, channels, h, w] aka [b*t, channels, h, w]
// emb: [N, emb_channels] aka [b*t, emb_channels]
// image_only_indicator is always tensor([0.])
auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
x = ResBlock::forward(ctx, x, emb);
int64_t T = num_video_frames;
int64_t B = x->ne[3] / T;
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x;
emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
x = time_stack->forward(ctx, x, emb); // b t c (h w)
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x;
}
};
#endif // __COMMON_HPP__

otherarch/sdcpp/control.hpp

@@ -0,0 +1,466 @@
#ifndef __CONTROL_HPP__
#define __CONTROL_HPP__
#include "common.hpp"
#include "ggml_extend.hpp"
#include "model.h"
#define CONTROL_NET_GRAPH_SIZE 1536
/*
=================================== ControlNet ===================================
Reference: https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/cldm/cldm.py
*/
class ControlNetBlock : public GGMLBlock {
protected:
SDVersion version = VERSION_1_x;
// network hparams
int in_channels = 4;
int out_channels = 4;
int hint_channels = 3;
int num_res_blocks = 2;
std::vector<int> attention_resolutions = {4, 2, 1};
std::vector<int> channel_mult = {1, 2, 4, 4};
std::vector<int> transformer_depth = {1, 1, 1, 1};
int time_embed_dim = 1280; // model_channels*4
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_2_x, 2048 for VERSION_XL
public:
int model_channels = 320;
int adm_in_channels = 2816; // only for VERSION_XL
ControlNetBlock(SDVersion version = VERSION_1_x)
: version(version) {
if (version == VERSION_2_x) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_XL) {
context_dim = 2048;
attention_resolutions = {4, 2};
channel_mult = {1, 2, 4};
transformer_depth = {1, 2, 10};
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_SVD) {
in_channels = 8;
out_channels = 4;
context_dim = 1024;
adm_in_channels = 768;
num_head_channels = 64;
num_heads = -1;
}
blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
// time_embed_1 is nn.SiLU()
blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
if (version == VERSION_XL || version == VERSION_SVD) {
blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
// label_emb_1 is nn.SiLU()
blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
}
// input_blocks
blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
std::vector<int> input_block_chans;
input_block_chans.push_back(model_channels);
int ch = model_channels;
int input_block_idx = 0;
int ds = 1;
auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
return new ResBlock(channels, emb_channels, out_channels);
};
auto get_attention_layer = [&](int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim) -> SpatialTransformer* {
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
};
auto make_zero_conv = [&](int64_t channels) {
return new Conv2d(channels, channels, {1, 1});
};
blocks["zero_convs.0.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(model_channels));
blocks["input_hint_block.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.6"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.8"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.10"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.12"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}));
// nn.SiLU()
blocks["input_hint_block.14"] = std::shared_ptr<GGMLBlock>(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1}));
size_t len_mults = channel_mult.size();
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
ch = mult * model_channels;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[i],
context_dim));
}
blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
input_block_chans.push_back(ch);
}
if (i != len_mults - 1) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
input_block_chans.push_back(ch);
ds *= 2;
}
}
// middle blocks
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[transformer_depth.size() - 1],
context_dim));
blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
// middle_block_out
blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
}
struct ggml_tensor* resblock_forward(std::string name,
struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb) {
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
return block->forward(ctx, x, emb);
}
struct ggml_tensor* attention_layer_forward(std::string name,
struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
return block->forward(ctx, x, context);
}
struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
struct ggml_tensor* hint,
struct ggml_tensor* emb,
struct ggml_tensor* context) {
int num_input_blocks = 15;
auto h = hint;
for (int i = 0; i < num_input_blocks; i++) {
if (i % 2 == 0) {
auto block = std::dynamic_pointer_cast<Conv2d>(blocks["input_hint_block." + std::to_string(i)]);
h = block->forward(ctx, h);
} else {
h = ggml_silu_inplace(ctx, h);
}
}
return h;
}
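    // Descriptive note: indices 0,2,...,14 above are Conv2d blocks and the odd
    // steps are SiLU, so this loop alternates conv/activation; the three
    // stride-2 convs (input_hint_block.4/.8/.12) take the hint from image
    // resolution down to the latent's h/8 x w/8 before it is added to the
    // first UNet feature map.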
std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* guided_hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = NULL) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
// y: [N, adm_in_channels] or [1, adm_in_channels]
if (context != NULL) {
if (context->ne[2] != x->ne[3]) {
context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
}
}
if (y != NULL) {
if (y->ne[1] != x->ne[3]) {
y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
}
}
auto time_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
auto time_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
auto zero_convs_0 = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs.0.0"]);
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD
if (y != NULL) {
auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y);
label_emb = ggml_silu_inplace(ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim]
}
std::vector<struct ggml_tensor*> outs;
if (guided_hint == NULL) {
guided_hint = input_hint_block_forward(ctx, hint, emb, context);
}
outs.push_back(guided_hint);
// input_blocks
// input block 0
auto h = input_blocks_0_0->forward(ctx, x);
h = ggml_add(ctx, h, guided_hint);
outs.push_back(zero_convs_0->forward(ctx, h));
// input block 1-11
size_t len_mults = channel_mult.size();
int input_block_idx = 0;
int ds = 1;
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
h = attention_layer_forward(name, ctx, h, context); // [N, mult*model_channels, h, w]
}
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
outs.push_back(zero_conv->forward(ctx, h));
}
if (i != len_mults - 1) {
ds *= 2;
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
h = block->forward(ctx, h); // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
outs.push_back(zero_conv->forward(ctx, h));
}
}
// [N, 4*model_channels, h/8, w/8]
// middle_block
h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
// out
outs.push_back(middle_block_out->forward(ctx, h));
return outs;
}
};
struct ControlNet : public GGMLModule {
SDVersion version = VERSION_1_x;
ControlNetBlock control_net;
ggml_backend_buffer_t control_buffer = NULL; // keep control output tensors in backend memory
ggml_context* control_ctx = NULL;
std::vector<struct ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
struct ggml_tensor* guided_hint = NULL; // guided_hint cache, for faster inference
bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend,
ggml_type wtype,
SDVersion version = VERSION_1_x)
: GGMLModule(backend, wtype), control_net(version) {
control_net.init(params_ctx, wtype);
}
~ControlNet() {
free_control_ctx();
}
void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
params.mem_buffer = NULL;
params.no_alloc = true;
control_ctx = ggml_init(params);
controls.resize(outs.size() - 1);
size_t control_buffer_size = 0;
guided_hint = ggml_dup_tensor(control_ctx, outs[0]);
control_buffer_size += ggml_nbytes(guided_hint);
for (int i = 0; i < outs.size() - 1; i++) {
controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
control_buffer_size += ggml_nbytes(controls[i]);
}
control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);
LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
}
void free_control_ctx() {
if (control_buffer != NULL) {
ggml_backend_buffer_free(control_buffer);
control_buffer = NULL;
}
if (control_ctx != NULL) {
ggml_free(control_ctx);
control_ctx = NULL;
}
guided_hint = NULL;
guided_hint_cached = false;
controls.clear();
}
std::string get_desc() {
return "control_net";
}
size_t get_params_mem_size() {
return control_net.get_params_mem_size();
}
size_t get_params_num() {
return control_net.get_params_num();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
control_net.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y = NULL) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
x = to_backend(x);
if (guided_hint_cached) {
hint = NULL;
} else {
hint = to_backend(hint);
}
context = to_backend(context);
y = to_backend(y);
timesteps = to_backend(timesteps);
auto outs = control_net.forward(compute_ctx,
x,
hint,
guided_hint_cached ? guided_hint : NULL,
timesteps,
context,
y);
if (control_ctx == NULL) {
alloc_control_ctx(outs);
}
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint));
for (int i = 0; i < outs.size() - 1; i++) {
ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i]));
}
return gf;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
// y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, hint, timesteps, context, y);
};
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
guided_hint_cached = true;
}
bool load_from_file(const std::string& file_path) {
LOG_INFO("loading control net from '%s'", file_path.c_str());
alloc_params_buffer();
std::map<std::string, ggml_tensor*> tensors;
control_net.get_param_tensors(tensors);
std::set<std::string> ignore_tensors;
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);
if (!success) {
LOG_ERROR("load control net tensors from model loader failed");
return false;
}
LOG_INFO("control net model loaded");
return success;
}
};
#endif // __CONTROL_HPP__

otherarch/sdcpp/denoiser.hpp

@@ -0,0 +1,125 @@
#ifndef __DENOISER_HPP__
#define __DENOISER_HPP__
#include "ggml_extend.hpp"
/*================================================= CompVisDenoiser ==================================================*/
// Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
#define TIMESTEPS 1000
struct SigmaSchedule {
float alphas_cumprod[TIMESTEPS];
float sigmas[TIMESTEPS];
float log_sigmas[TIMESTEPS];
virtual std::vector<float> get_sigmas(uint32_t n) = 0;
float sigma_to_t(float sigma) {
float log_sigma = std::log(sigma);
std::vector<float> dists;
dists.reserve(TIMESTEPS);
for (float log_sigma_val : log_sigmas) {
dists.push_back(log_sigma - log_sigma_val);
}
int low_idx = 0;
for (size_t i = 0; i < TIMESTEPS; i++) {
if (dists[i] >= 0) {
low_idx++;
}
}
low_idx = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
int high_idx = low_idx + 1;
float low = log_sigmas[low_idx];
float high = log_sigmas[high_idx];
float w = (low - log_sigma) / (low - high);
w = std::max(0.f, std::min(1.f, w));
float t = (1.0f - w) * low_idx + w * high_idx;
return t;
}
float t_to_sigma(float t) {
int low_idx = static_cast<int>(std::floor(t));
int high_idx = static_cast<int>(std::ceil(t));
float w = t - static_cast<float>(low_idx);
float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
return std::exp(log_sigma);
}
};
struct DiscreteSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
std::vector<float> result;
int t_max = TIMESTEPS - 1;
if (n == 0) {
return result;
} else if (n == 1) {
result.push_back(t_to_sigma((float)t_max));
result.push_back(0);
return result;
}
float step = static_cast<float>(t_max) / static_cast<float>(n - 1);
for (uint32_t i = 0; i < n; ++i) {
float t = t_max - step * i;
result.push_back(t_to_sigma(t));
}
result.push_back(0);
return result;
}
};
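// Hedged usage sketch: t_to_sigma() interpolates log_sigmas linearly and
// sigma_to_t() inverts that interpolation, so for a monotonically increasing
// table the roundtrip recovers t up to float error. The ramp below is a toy
// stand-in; the real log_sigmas are filled from alphas_cumprod elsewhere.
inline void sigma_roundtrip_demo() {
    DiscreteSchedule sched;
    for (int i = 0; i < TIMESTEPS; i++) {
        sched.log_sigmas[i] = -4.0f + 6.0f * i / (TIMESTEPS - 1);
    }
    float t0 = 123.4f;
    float t1 = sched.sigma_to_t(sched.t_to_sigma(t0)); // t1 ~= 123.4
    (void)t1;
}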
struct KarrasSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
// These *COULD* be function arguments here,
// but does anybody ever bother to touch them?
float sigma_min = 0.1f;
float sigma_max = 10.f;
float rho = 7.f;
std::vector<float> result(n + 1);
float min_inv_rho = pow(sigma_min, (1.f / rho));
float max_inv_rho = pow(sigma_max, (1.f / rho));
for (uint32_t i = 0; i < n; i++) {
// Eq. (5) from Karras et al 2022
result[i] = pow(max_inv_rho + (float)i / ((float)n - 1.f) * (min_inv_rho - max_inv_rho), rho);
}
result[n] = 0.;
return result;
}
};
struct Denoiser {
std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
virtual std::vector<float> get_scalings(float sigma) = 0;
};
struct CompVisDenoiser : public Denoiser {
float sigma_data = 1.0f;
std::vector<float> get_scalings(float sigma) {
float c_out = -sigma;
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_out, c_in};
}
};
struct CompVisVDenoiser : public Denoiser {
float sigma_data = 1.0f;
std::vector<float> get_scalings(float sigma) {
float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_skip, c_out, c_in};
}
};
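// Hedged note on the scalings above, in the k-diffusion convention the sampler
// is assumed to follow: the model is fed c_in * x, and the denoised estimate is
// reconstructed as
//   eps-parameterization (CompVisDenoiser):  denoised = x + c_out * model_out
//   v-parameterization  (CompVisVDenoiser):  denoised = c_skip * x + c_out * model_out
// with c_in = 1 / sqrt(sigma^2 + sigma_data^2) in both cases.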
#endif // __DENOISER_HPP__

otherarch/sdcpp/esrgan.hpp

@@ -0,0 +1,206 @@
#ifndef __ESRGAN_HPP__
#define __ESRGAN_HPP__
#include "ggml_extend.hpp"
#include "model.h"
/*
=================================== ESRGAN ===================================
References:
https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
https://github.com/XPixelGroup/BasicSR/blob/v1.4.2/basicsr/archs/rrdbnet_arch.py
*/
class ResidualDenseBlock : public GGMLBlock {
protected:
int num_feat;
int num_grow_ch;
public:
ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32)
: num_feat(num_feat), num_grow_ch(num_grow_ch) {
blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv3"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv4"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true);
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_feat, h, w]
// return: [n, num_feat, h, w]
auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
auto conv3 = std::dynamic_pointer_cast<Conv2d>(blocks["conv3"]);
auto conv4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv4"]);
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
auto x1 = lrelu(ctx, conv1->forward(ctx, x));
auto x_cat = ggml_concat(ctx, x, x1);
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x2);
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x3);
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x4);
auto x5 = conv5->forward(ctx, x_cat);
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
return x5;
}
};
class RRDB : public GGMLBlock {
public:
RRDB(int num_feat, int num_grow_ch = 32) {
blocks["rdb1"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
blocks["rdb2"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_feat, h, w]
// return: [n, num_feat, h, w]
auto rdb1 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb1"]);
auto rdb2 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb2"]);
auto rdb3 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb3"]);
auto out = rdb1->forward(ctx, x);
out = rdb2->forward(ctx, out);
out = rdb3->forward(ctx, out);
out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
return out;
}
};
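// Descriptive note: both ResidualDenseBlock and RRDB scale their residual
// branch by 0.2 before adding the identity, the "residual scaling" trick from
// the ESRGAN paper that keeps deep residual-in-residual stacks stable.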
class RRDBNet : public GGMLBlock {
protected:
int scale = 4; // default RealESRGAN_x4plus_anime_6B
int num_block = 6; // default RealESRGAN_x4plus_anime_6B
int num_in_ch = 3;
int num_out_ch = 3;
int num_feat = 64; // default RealESRGAN_x4plus_anime_6B
int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B
public:
RRDBNet() {
blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
for (int i = 0; i < num_block; i++) {
std::string name = "body." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
}
blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
// upsample
blocks["conv_up1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
blocks["conv_up2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
blocks["conv_hr"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
return ggml_leaky_relu(ctx, x, 0.2f, true);
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, num_in_ch, h, w]
// return: [n, num_out_ch, h*4, w*4]
auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
auto conv_body = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
auto conv_up1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
auto conv_up2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
auto conv_hr = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
auto conv_last = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);
auto feat = conv_first->forward(ctx, x);
auto body_feat = feat;
for (int i = 0; i < num_block; i++) {
std::string name = "body." + std::to_string(i);
auto block = std::dynamic_pointer_cast<RRDB>(blocks[name]);
body_feat = block->forward(ctx, body_feat);
}
body_feat = conv_body->forward(ctx, body_feat);
feat = ggml_add(ctx, feat, body_feat);
// upsample
feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
return out;
}
};
struct ESRGAN : public GGMLModule {
RRDBNet rrdb_net;
int scale = 4;
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
ESRGAN(ggml_backend_t backend,
ggml_type wtype)
: GGMLModule(backend, wtype) {
rrdb_net.init(params_ctx, wtype);
}
std::string get_desc() {
return "esrgan";
}
size_t get_params_mem_size() {
return rrdb_net.get_params_mem_size();
}
size_t get_params_num() {
return rrdb_net.get_params_num();
}
bool load_from_file(const std::string& file_path) {
LOG_INFO("loading esrgan from '%s'", file_path.c_str());
alloc_params_buffer();
std::map<std::string, ggml_tensor*> esrgan_tensors;
rrdb_net.get_param_tensors(esrgan_tensors);
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(esrgan_tensors, backend);
if (!success) {
LOG_ERROR("load esrgan tensors from model loader failed");
return false;
}
LOG_INFO("esrgan model loaded");
return success;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
x = to_backend(x);
struct ggml_tensor* out = rrdb_net.forward(compute_ctx, x);
ggml_build_forward_expand(gf, out);
return gf;
}
void compute(const int n_threads,
struct ggml_tensor* x,
ggml_tensor** output,
ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x);
};
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
}
};
#endif // __ESRGAN_HPP__

File diff suppressed because it is too large

otherarch/sdcpp/lora.hpp

@@ -0,0 +1,164 @@
#ifndef __LORA_HPP__
#define __LORA_HPP__
#include "ggml_extend.hpp"
#define LORA_GRAPH_SIZE 10240
struct LoraModel : public GGMLModule {
float multiplier = 1.0f;
std::map<std::string, struct ggml_tensor*> lora_tensors;
std::string file_path;
ModelLoader model_loader;
bool load_failed = false;
LoraModel(ggml_backend_t backend,
ggml_type wtype,
const std::string file_path = "")
: file_path(file_path), GGMLModule(backend, wtype) {
if (!model_loader.init_from_file(file_path)) {
load_failed = true;
}
}
std::string get_desc() {
return "lora";
}
size_t get_params_num() {
return LORA_GRAPH_SIZE;
}
size_t get_params_mem_size() {
return model_loader.get_params_mem_size(NULL);
}
bool load_from_file() {
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
if (load_failed) {
LOG_ERROR("init lora model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool dry_run = true;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
if (dry_run) {
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
tensor_storage.type,
tensor_storage.n_dims,
tensor_storage.ne);
lora_tensors[name] = real;
} else {
auto real = lora_tensors[name];
*dst_tensor = real;
}
return true;
};
model_loader.load_tensors(on_new_tensor_cb, backend);
alloc_params_buffer();
dry_run = false;
model_loader.load_tensors(on_new_tensor_cb, backend);
LOG_DEBUG("finished loaded lora");
return true;
}
struct ggml_cgraph* build_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);
std::set<std::string> applied_lora_tensors;
for (auto it : model_tensors) {
std::string k_tensor = it.first;
struct ggml_tensor* weight = model_tensors[it.first];
size_t k_pos = k_tensor.find(".weight");
if (k_pos == std::string::npos) {
continue;
}
k_tensor = k_tensor.substr(0, k_pos);
replace_all_chars(k_tensor, '.', '_');
std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
std::string alpha_name = "lora." + k_tensor + ".alpha";
std::string scale_name = "lora." + k_tensor + ".scale";
ggml_tensor* lora_up = NULL;
ggml_tensor* lora_down = NULL;
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = lora_tensors[lora_up_name];
}
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
lora_down = lora_tensors[lora_down_name];
}
if (lora_up == NULL || lora_down == NULL) {
continue;
}
applied_lora_tensors.insert(lora_up_name);
applied_lora_tensors.insert(lora_down_name);
applied_lora_tensors.insert(alpha_name);
applied_lora_tensors.insert(scale_name);
// calc scale
int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1];
float scale_value = 1.0f;
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / dim;
}
scale_value *= multiplier;
// flatten lora tensors so we can multiply them
int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1];
lora_up = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1];
lora_down = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
// ggml_mul_mat requires tensor b transposed
lora_down = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down));
struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down);
updown = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown));
updown = ggml_reshape(compute_ctx, updown, weight);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
ggml_tensor* final_weight;
// if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
// final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, weight->n_dims, weight->ne);
// final_weight = ggml_cpy_inplace(compute_ctx, weight, final_weight);
// final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
// final_weight = ggml_cpy_inplace(compute_ctx, final_weight, weight);
// } else {
// final_weight = ggml_add_inplace(compute_ctx, weight, updown);
// }
final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
ggml_build_forward_expand(gf, final_weight);
}
for (auto& kv : lora_tensors) {
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
LOG_WARN("unused lora tensor %s", kv.first.c_str());
}
}
return gf;
}
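    // Hedged summary of the graph built above: for every model tensor named
    // "<name>.weight" with a matching lora pair, the backend computes in place
    //     W += multiplier * (scale, or alpha / rank) * reshape(lora_up @ lora_down)
    // which is the usual LoRA weight merge; rank is lora_down's outer dim.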
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(model_tensors);
};
GGMLModule::compute(get_graph, n_threads, true);
}
};
#endif // __LORA_HPP__

otherarch/sdcpp/main.cpp

@@ -0,0 +1,778 @@
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <iostream>
#include <random>
#include <string>
#include <vector>
// #include "preprocessing.hpp"
#include "stable-diffusion.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_IMAGE_WRITE_STATIC
#include "stb_image_write.h"
const char* rng_type_to_str[] = {
"std_default",
"cuda",
};
// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
"euler_a",
"euler",
"heun",
"dpm2",
"dpm++2s_a",
"dpm++2m",
"dpm++2mv2",
"lcm",
};
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
const char* schedule_str[] = {
"default",
"discrete",
"karras",
};
const char* modes_str[] = {
"txt2img",
"img2img",
"img2vid",
"convert",
};
enum SDMode {
TXT2IMG,
IMG2IMG,
IMG2VID,
CONVERT,
MODE_COUNT
};
struct SDParams {
int n_threads = -1;
SDMode mode = TXT2IMG;
std::string model_path;
std::string vae_path;
std::string taesd_path;
std::string esrgan_path;
std::string controlnet_path;
std::string embeddings_path;
sd_type_t wtype = SD_TYPE_COUNT;
std::string lora_model_dir;
std::string output_path = "output.png";
std::string input_path;
std::string control_image_path;
std::string prompt;
std::string negative_prompt;
float min_cfg = 1.0f;
float cfg_scale = 7.0f;
int clip_skip = -1; // <= 0 represents unspecified
int width = 512;
int height = 512;
int batch_count = 1;
int video_frames = 6;
int motion_bucket_id = 127;
int fps = 6;
float augmentation_level = 0.f;
sample_method_t sample_method = EULER_A;
schedule_t schedule = DEFAULT;
int sample_steps = 20;
float strength = 0.75f;
float control_strength = 0.9f;
rng_type_t rng_type = CUDA_RNG;
int64_t seed = 42;
bool verbose = false;
bool vae_tiling = false;
bool control_net_cpu = false;
bool canny_preprocess = false;
int upscale_repeats = 1;
};
void print_params(SDParams params) {
printf("Option: \n");
printf(" n_threads: %d\n", params.n_threads);
printf(" mode: %s\n", modes_str[params.mode]);
printf(" model_path: %s\n", params.model_path.c_str());
printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
printf(" vae_path: %s\n", params.vae_path.c_str());
printf(" taesd_path: %s\n", params.taesd_path.c_str());
printf(" esrgan_path: %s\n", params.esrgan_path.c_str());
printf(" controlnet_path: %s\n", params.controlnet_path.c_str());
printf(" embeddings_path: %s\n", params.embeddings_path.c_str());
printf(" output_path: %s\n", params.output_path.c_str());
printf(" init_img: %s\n", params.input_path.c_str());
printf(" control_image: %s\n", params.control_image_path.c_str());
printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
printf(" strength(control): %.2f\n", params.control_strength);
printf(" prompt: %s\n", params.prompt.c_str());
printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
printf(" min_cfg: %.2f\n", params.min_cfg);
printf(" cfg_scale: %.2f\n", params.cfg_scale);
printf(" clip_skip: %d\n", params.clip_skip);
printf(" width: %d\n", params.width);
printf(" height: %d\n", params.height);
printf(" sample_method: %s\n", sample_method_str[params.sample_method]);
printf(" schedule: %s\n", schedule_str[params.schedule]);
printf(" sample_steps: %d\n", params.sample_steps);
printf(" strength(img2img): %.2f\n", params.strength);
printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
printf(" seed: %ld\n", params.seed);
printf(" batch_count: %d\n", params.batch_count);
printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
printf(" upscale_repeats: %d\n", params.upscale_repeats);
}
void print_usage(int argc, const char* argv[]) {
printf("usage: %s [arguments]\n", argv[0]);
printf("\n");
printf("arguments:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n");
printf(" -t, --threads N number of threads to use during computation (default: -1).\n");
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
printf(" -m, --model [MODEL] path to model\n");
printf(" --vae [VAE] path to vae\n");
printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
printf(" --control-net [CONTROL_PATH] path to control net model\n");
printf(" --embd-dir [EMBEDDING_PATH] path to embeddings.\n");
printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n");
printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n");
printf(" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n");
printf(" If not specified, the default is the type of the weight file.\n");
printf(" --lora-model-dir [DIR] lora model directory\n");
printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
printf(" --control-image [IMAGE] path to image condition, control net\n");
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n");
printf(" 1.0 corresponds to full destruction of information in init image\n");
printf(" -H, --height H image height, in pixel space (default: 512)\n");
printf(" -W, --width W image width, in pixel space (default: 512)\n");
printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n");
printf(" sampling method (default: \"euler_a\")\n");
printf(" --steps STEPS number of sample steps (default: 20)\n");
printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
printf(" -b, --batch-count COUNT number of images to generate.\n");
printf(" --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n");
printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
printf(" --vae-tiling process vae in tiles to reduce memory usage\n");
printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
printf(" --canny apply canny preprocessor (edge detection)\n");
printf(" -v, --verbose print extra info\n");
}
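// Illustrative invocations of the sdmain tool (model and image paths are placeholders):
//   txt2img:  ./sdmain -m sd-v1-4.ckpt -p "a lovely cat" --steps 20
//   img2img:  ./sdmain -M img2img -m sd-v1-4.ckpt -i input.png -p "a lovely cat" --strength 0.6
//   convert:  ./sdmain -M convert -m sd-v1-4.ckpt --type q8_0 -o sd-v1-4-q8_0.gguf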
void parse_args(int argc, const char** argv, SDParams& params) {
bool invalid_arg = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.n_threads = std::stoi(argv[i]);
} else if (arg == "-M" || arg == "--mode") {
if (++i >= argc) {
invalid_arg = true;
break;
}
const char* mode_selected = argv[i];
int mode_found = -1;
for (int d = 0; d < MODE_COUNT; d++) {
if (!strcmp(mode_selected, modes_str[d])) {
mode_found = d;
}
}
if (mode_found == -1) {
fprintf(stderr,
"error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n",
mode_selected);
exit(1);
}
params.mode = (SDMode)mode_found;
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.model_path = argv[i];
} else if (arg == "--vae") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.vae_path = argv[i];
} else if (arg == "--taesd") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.taesd_path = argv[i];
} else if (arg == "--control-net") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.controlnet_path = argv[i];
} else if (arg == "--upscale-model") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.esrgan_path = argv[i];
} else if (arg == "--embd-dir") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.embeddings_path = argv[i];
} else if (arg == "--type") {
if (++i >= argc) {
invalid_arg = true;
break;
}
std::string type = argv[i];
if (type == "f32") {
params.wtype = SD_TYPE_F32;
} else if (type == "f16") {
params.wtype = SD_TYPE_F16;
} else if (type == "q4_0") {
params.wtype = SD_TYPE_Q4_0;
} else if (type == "q4_1") {
params.wtype = SD_TYPE_Q4_1;
} else if (type == "q5_0") {
params.wtype = SD_TYPE_Q5_0;
} else if (type == "q5_1") {
params.wtype = SD_TYPE_Q5_1;
} else if (type == "q8_0") {
params.wtype = SD_TYPE_Q8_0;
} else {
fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
type.c_str());
exit(1);
}
} else if (arg == "--lora-model-dir") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.lora_model_dir = argv[i];
} else if (arg == "-i" || arg == "--init-img") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.input_path = argv[i];
} else if (arg == "--control-image") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.control_image_path = argv[i];
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.output_path = argv[i];
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.prompt = argv[i];
} else if (arg == "--upscale-repeats") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.upscale_repeats = std::stoi(argv[i]);
if (params.upscale_repeats < 1) {
fprintf(stderr, "error: upscale multiplier must be at least 1\n");
exit(1);
}
} else if (arg == "-n" || arg == "--negative-prompt") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.negative_prompt = argv[i];
} else if (arg == "--cfg-scale") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.cfg_scale = std::stof(argv[i]);
} else if (arg == "--strength") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.strength = std::stof(argv[i]);
} else if (arg == "--control-strength") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.control_strength = std::stof(argv[i]);
} else if (arg == "-H" || arg == "--height") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.height = std::stoi(argv[i]);
} else if (arg == "-W" || arg == "--width") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.width = std::stoi(argv[i]);
} else if (arg == "--steps") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.sample_steps = std::stoi(argv[i]);
} else if (arg == "--clip-skip") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.clip_skip = std::stoi(argv[i]);
} else if (arg == "--vae-tiling") {
params.vae_tiling = true;
} else if (arg == "--control-net-cpu") {
params.control_net_cpu = true;
} else if (arg == "--canny") {
params.canny_preprocess = true;
} else if (arg == "-b" || arg == "--batch-count") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.batch_count = std::stoi(argv[i]);
} else if (arg == "--rng") {
if (++i >= argc) {
invalid_arg = true;
break;
}
std::string rng_type_str = argv[i];
if (rng_type_str == "std_default") {
params.rng_type = STD_DEFAULT_RNG;
} else if (rng_type_str == "cuda") {
params.rng_type = CUDA_RNG;
} else {
invalid_arg = true;
break;
}
} else if (arg == "--schedule") {
if (++i >= argc) {
invalid_arg = true;
break;
}
const char* schedule_selected = argv[i];
int schedule_found = -1;
for (int d = 0; d < N_SCHEDULES; d++) {
if (!strcmp(schedule_selected, schedule_str[d])) {
schedule_found = d;
}
}
if (schedule_found == -1) {
invalid_arg = true;
break;
}
params.schedule = (schedule_t)schedule_found;
} else if (arg == "-s" || arg == "--seed") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.seed = std::stoll(argv[i]);
} else if (arg == "--sampling-method") {
if (++i >= argc) {
invalid_arg = true;
break;
}
const char* sample_method_selected = argv[i];
int sample_method_found = -1;
for (int m = 0; m < N_SAMPLE_METHODS; m++) {
if (!strcmp(sample_method_selected, sample_method_str[m])) {
sample_method_found = m;
}
}
if (sample_method_found == -1) {
invalid_arg = true;
break;
}
params.sample_method = (sample_method_t)sample_method_found;
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv);
exit(0);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
}
}
if (invalid_arg) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
}
if (params.n_threads <= 0) {
params.n_threads = get_num_physical_cores();
}
if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) {
fprintf(stderr, "error: the following arguments are required: prompt\n");
print_usage(argc, argv);
exit(1);
}
if (params.model_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: model_path\n");
print_usage(argc, argv);
exit(1);
}
if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
print_usage(argc, argv);
exit(1);
}
if (params.output_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: output_path\n");
print_usage(argc, argv);
exit(1);
}
if (params.width <= 0 || params.width % 64 != 0) {
fprintf(stderr, "error: the width must be a multiple of 64\n");
exit(1);
}
if (params.height <= 0 || params.height % 64 != 0) {
fprintf(stderr, "error: the height must be a multiple of 64\n");
exit(1);
}
if (params.sample_steps <= 0) {
fprintf(stderr, "error: the sample_steps must be greater than 0\n");
exit(1);
}
if (params.strength < 0.f || params.strength > 1.f) {
fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
exit(1);
}
if (params.seed < 0) {
srand((int)time(NULL));
params.seed = rand();
}
if (params.mode == CONVERT) {
if (params.output_path == "output.png") {
params.output_path = "output.gguf";
}
}
}
static std::string sd_basename(const std::string& path) {
size_t pos = path.find_last_of('/');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
pos = path.find_last_of('\\');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
return path;
}
std::string get_image_params(SDParams params, int64_t seed) {
std::string parameter_string = params.prompt + "\n";
if (params.negative_prompt.size() != 0) {
parameter_string += "Negative prompt: " + params.negative_prompt + "\n";
}
parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
parameter_string += "Seed: " + std::to_string(seed) + ", ";
parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
if (params.schedule == KARRAS) {
parameter_string += " karras";
}
parameter_string += ", ";
parameter_string += "Version: stable-diffusion.cpp";
return parameter_string;
}
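// Example of the resulting metadata string for the defaults above (illustrative):
//   a lovely cat
//   Steps: 20, CFG scale: 7.000000, Seed: 42, Size: 512x512, Model: sd-v1-4.ckpt, RNG: cuda, Sampler: euler_a, Version: stable-diffusion.cpp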
void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
SDParams* params = (SDParams*)data;
if (!params->verbose && level <= SD_LOG_DEBUG) {
return;
}
if (level <= SD_LOG_INFO) {
fputs(log, stdout);
fflush(stdout);
} else {
fputs(log, stderr);
fflush(stderr);
}
}
int main(int argc, const char* argv[]) {
SDParams params;
parse_args(argc, argv, params);
sd_set_log_callback(sd_log_cb, (void*)&params);
if (params.verbose) {
print_params(params);
printf("%s", sd_get_system_info());
}
if (params.mode == CONVERT) {
bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype);
if (!success) {
fprintf(stderr,
"convert '%s'/'%s' to '%s' failed\n",
params.model_path.c_str(),
params.vae_path.c_str(),
params.output_path.c_str());
return 1;
} else {
printf("convert '%s'/'%s' to '%s' success\n",
params.model_path.c_str(),
params.vae_path.c_str(),
params.output_path.c_str());
return 0;
}
}
if (params.mode == IMG2VID) {
fprintf(stderr, "SVD support is broken, do not use it!!!\n");
return 1;
}
bool vae_decode_only = true;
uint8_t* input_image_buffer = NULL;
if (params.mode == IMG2IMG || params.mode == IMG2VID) {
vae_decode_only = false;
int c = 0;
input_image_buffer = stbi_load(params.input_path.c_str(), &params.width, &params.height, &c, 3);
if (input_image_buffer == NULL) {
fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str());
return 1;
}
if (c != 3) {
fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c);
free(input_image_buffer);
return 1;
}
if (params.width <= 0 || params.width % 64 != 0) {
fprintf(stderr, "error: the width of image must be a multiple of 64\n");
free(input_image_buffer);
return 1;
}
if (params.height <= 0 || params.height % 64 != 0) {
fprintf(stderr, "error: the height of image must be a multiple of 64\n");
free(input_image_buffer);
return 1;
}
}
sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
params.vae_path.c_str(),
params.taesd_path.c_str(),
params.controlnet_path.c_str(),
params.lora_model_dir.c_str(),
params.embeddings_path.c_str(),
vae_decode_only,
params.vae_tiling,
true,
params.n_threads,
params.wtype,
params.rng_type,
params.schedule,
params.control_net_cpu);
if (sd_ctx == NULL) {
printf("new_sd_ctx_t failed\n");
return 1;
}
sd_image_t* results;
if (params.mode == TXT2IMG) {
sd_image_t* control_image = NULL;
if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) {
int c = 0;
input_image_buffer = stbi_load(params.control_image_path.c_str(), &params.width, &params.height, &c, 3);
if (input_image_buffer == NULL) {
fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str());
return 1;
}
control_image = new sd_image_t{(uint32_t)params.width,
(uint32_t)params.height,
3,
input_image_buffer};
if (params.canny_preprocess) { // apply preprocessor
control_image->data = preprocess_canny(control_image->data,
control_image->width,
control_image->height,
0.08f,
0.08f,
0.8f,
1.0f,
false);
}
}
results = txt2img(sd_ctx,
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.seed,
params.batch_count,
control_image,
params.control_strength);
} else {
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,
3,
input_image_buffer};
if (params.mode == IMG2VID) {
results = img2vid(sd_ctx,
input_image,
params.width,
params.height,
params.video_frames,
params.motion_bucket_id,
params.fps,
params.augmentation_level,
params.min_cfg,
params.cfg_scale,
params.sample_method,
params.sample_steps,
params.strength,
params.seed);
if (results == NULL) {
printf("generate failed\n");
free_sd_ctx(sd_ctx);
return 1;
}
size_t last = params.output_path.find_last_of(".");
std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
for (int i = 0; i < params.video_frames; i++) {
if (results[i].data == NULL) {
continue;
}
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 0, get_image_params(params, params.seed + i).c_str());
printf("save result image to '%s'\n", final_image_path.c_str());
free(results[i].data);
results[i].data = NULL;
}
free(results);
free_sd_ctx(sd_ctx);
return 0;
} else {
results = img2img(sd_ctx,
input_image,
params.prompt.c_str(),
params.negative_prompt.c_str(),
params.clip_skip,
params.cfg_scale,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.strength,
params.seed,
params.batch_count);
}
}
if (results == NULL) {
printf("generate failed\n");
free_sd_ctx(sd_ctx);
return 1;
}
int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth
if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
params.n_threads,
params.wtype);
if (upscaler_ctx == NULL) {
printf("new_upscaler_ctx failed\n");
} else {
for (int i = 0; i < params.batch_count; i++) {
if (results[i].data == NULL) {
continue;
}
sd_image_t current_image = results[i];
for (int u = 0; u < params.upscale_repeats; ++u) {
sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor);
if (upscaled_image.data == NULL) {
printf("upscale failed\n");
break;
}
free(current_image.data);
current_image = upscaled_image;
}
results[i] = current_image; // Set the final upscaled image as the result
}
}
}
size_t last = params.output_path.find_last_of(".");
std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
for (int i = 0; i < params.batch_count; i++) {
if (results[i].data == NULL) {
continue;
}
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
results[i].data, 0, get_image_params(params, params.seed + i).c_str());
printf("save result image to '%s'\n", final_image_path.c_str());
free(results[i].data);
results[i].data = NULL;
}
free(results);
free_sd_ctx(sd_ctx);
return 0;
}

1609
otherarch/sdcpp/model.cpp Normal file

File diff suppressed because it is too large

154
otherarch/sdcpp/model.h Normal file

@ -0,0 +1,154 @@
#ifndef __MODEL_H__
#define __MODEL_H__
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <vector>
#include "ggml-backend.h"
#include "ggml.h"
#include "json.hpp"
#include "zip.h"
#define SD_MAX_DIMS 5
enum SDVersion {
VERSION_1_x,
VERSION_2_x,
VERSION_XL,
VERSION_SVD,
VERSION_COUNT,
};
struct TensorStorage {
std::string name;
ggml_type type = GGML_TYPE_F32;
bool is_bf16 = false;
int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
int n_dims = 0;
size_t file_index = 0;
int index_in_zip = -1; // >= 0 means stored in a zip file
size_t offset = 0; // offset in file
TensorStorage() = default;
TensorStorage(const std::string& name, ggml_type type, int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
: name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
for (int i = 0; i < n_dims; i++) {
this->ne[i] = ne[i];
}
}
int64_t nelements() const {
int64_t n = 1;
for (int i = 0; i < SD_MAX_DIMS; i++) {
n *= ne[i];
}
return n;
}
int64_t nbytes() const {
return nelements() * ggml_type_size(type) / ggml_blck_size(type);
}
int64_t nbytes_to_read() const {
if (is_bf16) {
return nbytes() / 2;
} else {
return nbytes();
}
}
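// unsqueeze: promote a 2D shape ne = {a, b} to the 4D shape ne = {1, 1, a, b},
// so 2D weights can go through the 4D tensor code paths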
void unsqueeze() {
if (n_dims == 2) {
n_dims = 4;
ne[3] = ne[1];
ne[2] = ne[0];
ne[1] = 1;
ne[0] = 1;
}
}
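// chunk: split this tensor into n equal slices along its outermost logical axis;
// ne is reversed before the split (so ne[0] is that axis) and restored afterwards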
std::vector<TensorStorage> chunk(size_t n) {
std::vector<TensorStorage> chunks;
size_t chunk_size = nbytes_to_read() / n;
// printf("%d/%d\n", chunk_size, nbytes_to_read());
reverse_ne();
for (int i = 0; i < n; i++) {
TensorStorage chunk_i = *this;
chunk_i.ne[0] = ne[0] / n;
chunk_i.offset = offset + i * chunk_size;
chunk_i.reverse_ne();
chunks.push_back(chunk_i);
}
reverse_ne();
return chunks;
}
void reverse_ne() {
int64_t new_ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
for (int i = 0; i < n_dims; i++) {
new_ne[i] = ne[n_dims - 1 - i];
}
for (int i = 0; i < n_dims; i++) {
ne[i] = new_ne[i];
}
}
std::string to_string() const {
std::stringstream ss;
const char* type_name = ggml_type_name(type);
if (is_bf16) {
type_name = "bf16";
}
ss << name << " | " << type_name << " | ";
ss << n_dims << " [";
for (int i = 0; i < SD_MAX_DIMS; i++) {
ss << ne[i];
if (i != SD_MAX_DIMS - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
};
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
class ModelLoader {
protected:
std::vector<std::string> file_paths_;
std::vector<TensorStorage> tensor_storages;
bool parse_data_pkl(uint8_t* buffer,
size_t buffer_size,
zip_t* zip,
std::string dir,
size_t file_index,
const std::string& prefix);
bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_safetensors_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_ckpt_file(const std::string& file_path, const std::string& prefix = "");
bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");
public:
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
SDVersion get_sd_version();
ggml_type get_sd_wtype();
std::string load_merges();
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
ggml_backend_t backend,
std::set<std::string> ignore_tensors = {});
bool save_to_gguf_file(const std::string& file_path, ggml_type type);
int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default;
};
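// Illustrative usage sketch (tensor map and backend are supplied by the caller):
//   ModelLoader loader;
//   if (loader.init_from_file("sd-v1-4.safetensors")) {
//       loader.load_tensors(tensors, backend);  // or the callback overload
//   }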
#endif // __MODEL_H__

227
otherarch/sdcpp/preprocessing.hpp Normal file

@ -0,0 +1,227 @@
#ifndef __PREPROCESSING_HPP__
#define __PREPROCESSING_HPP__
#include "ggml_extend.hpp"
#define M_PI_ 3.14159265358979323846
void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
struct ggml_init_params params;
params.mem_size = 20 * 1024 * 1024; // 20 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel));
ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1);
ggml_cgraph* gf = ggml_new_graph(ctx0);
ggml_build_forward_expand(gf, ggml_cpy(ctx0, h, output));
ggml_graph_compute_with_ctx(ctx0, gf, 1);
ggml_free(ctx0);
}
void gaussian_kernel(struct ggml_tensor* kernel) {
int ks_mid = kernel->ne[0] / 2;
float sigma = 1.4f;
float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f));
for (int y = 0; y < kernel->ne[0]; y++) {
float gx = -ks_mid + y;
for (int x = 0; x < kernel->ne[1]; x++) {
float gy = -ks_mid + x;
float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
ggml_tensor_set_f32(kernel, k_, x, y);
}
}
}
void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
float r = ggml_tensor_get_f32(rgb_img, ix, iy);
float g = ggml_tensor_get_f32(rgb_img, ix, iy, 1);
float b = ggml_tensor_get_f32(rgb_img, ix, iy, 2);
float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
ggml_tensor_set_f32(grayscale, gray, ix, iy);
}
}
}
void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
int n_elements = ggml_nelements(h);
float* dx = (float*)x->data;
float* dy = (float*)y->data;
float* dh = (float*)h->data;
for (int i = 0; i < n_elements; i++) {
dh[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]);
}
}
void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
int n_elements = ggml_nelements(h);
float* dx = (float*)x->data;
float* dy = (float*)y->data;
float* dh = (float*)h->data;
for (int i = 0; i < n_elements; i++) {
dh[i] = atan2f(dy[i], dx[i]);
}
}
void normalize_tensor(struct ggml_tensor* g) {
int n_elements = ggml_nelements(g);
float* dg = (float*)g->data;
float max = -INFINITY;
for (int i = 0; i < n_elements; i++) {
max = dg[i] > max ? dg[i] : max;
}
max = 1.0f / max;
for (int i = 0; i < n_elements; i++) {
dg[i] *= max;
}
}
void non_max_suppression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) {
for (int iy = 1; iy < result->ne[1] - 1; iy++) {
for (int ix = 1; ix < result->ne[0] - 1; ix++) {
float angle = ggml_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
if (angle < 0.0f) {
angle += 180.0f;
}
float q = 1.0f;
float r = 1.0f;
// angle 0
if ((angle >= 0.0f && angle < 22.5f) || (angle >= 157.5f && angle <= 180.0f)) {
q = ggml_tensor_get_f32(G, ix, iy + 1);
r = ggml_tensor_get_f32(G, ix, iy - 1);
}
// angle 45
else if (angle >= 22.5f && angle < 67.5f) {
q = ggml_tensor_get_f32(G, ix + 1, iy - 1);
r = ggml_tensor_get_f32(G, ix - 1, iy + 1);
}
// angle 90
else if (angle >= 67.5f && angle < 112.5f) {
q = ggml_tensor_get_f32(G, ix + 1, iy);
r = ggml_tensor_get_f32(G, ix - 1, iy);
}
// angle 135
else if (angle >= 112.5f && angle < 157.5f) {
q = ggml_tensor_get_f32(G, ix - 1, iy - 1);
r = ggml_tensor_get_f32(G, ix + 1, iy + 1);
}
float cur = ggml_tensor_get_f32(G, ix, iy);
if ((cur >= q) && (cur >= r)) {
ggml_tensor_set_f32(result, cur, ix, iy);
} else {
ggml_tensor_set_f32(result, 0.0f, ix, iy);
}
}
}
}
void threshold_hysteresis(struct ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) {
int n_elements = ggml_nelements(img);
float* imd = (float*)img->data;
float max = -INFINITY;
for (int i = 0; i < n_elements; i++) {
max = imd[i] > max ? imd[i] : max;
}
float ht = max * high_threshold;
float lt = ht * low_threshold;
for (int i = 0; i < n_elements; i++) {
float img_v = imd[i];
if (img_v >= ht) { // strong pixel
imd[i] = strong;
} else if (img_v >= lt) { // weak pixel
imd[i] = weak;
}
}
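// zero out a ~3-pixel border around the image; interior pixels are left unchanged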
for (int iy = 0; iy < img->ne[1]; iy++) {
for (int ix = 0; ix < img->ne[0]; ix++) {
if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
ggml_tensor_set_f32(img, ggml_tensor_get_f32(img, ix, iy), ix, iy);
} else {
ggml_tensor_set_f32(img, 0.0f, ix, iy);
}
}
}
// hysteresis
for (int iy = 1; iy < img->ne[1] - 1; iy++) {
for (int ix = 1; ix < img->ne[0] - 1; ix++) {
float imd_v = ggml_tensor_get_f32(img, ix, iy);
if (imd_v == weak) {
if (ggml_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix + 1, iy) == strong ||
ggml_tensor_get_f32(img, ix, iy - 1) == strong || ggml_tensor_get_f32(img, ix, iy + 1) == strong ||
ggml_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix - 1, iy) == strong) {
ggml_tensor_set_f32(img, strong, ix, iy);
} else {
ggml_tensor_set_f32(img, 0.0f, ix, iy);
}
}
}
}
}
uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
if (!work_ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
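// 3x3 Sobel kernels: kX responds to horizontal gradients, kY to vertical ones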
float kX[9] = {
-1, 0, 1,
-2, 0, 2,
-1, 0, 1};
float kY[9] = {
1, 2, 1,
0, 0, 0,
-1, -2, -1};
// generate kernel
int kernel_size = 5;
struct ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1);
struct ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx));
struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
gaussian_kernel(gkernel);
struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray);
struct ggml_tensor* theta = ggml_dup_tensor(work_ctx, image_gray);
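// classic Canny pipeline: grayscale -> Gaussian blur -> Sobel gradients ->
// magnitude + direction -> non-maximum suppression -> hysteresis thresholding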
sd_image_to_tensor(img, image);
grayscale(image, image_gray);
convolve(image_gray, image_gray, gkernel, 2);
convolve(image_gray, iX, sf_kx, 1);
convolve(image_gray, iY, sf_ky, 1);
prop_hypot(iX, iY, G);
normalize_tensor(G);
prop_arctan2(iX, iY, theta);
non_max_suppression(image_gray, G, theta);
threshold_hysteresis(image_gray, high_threshold, low_threshold, weak, strong);
// to RGB channels
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
float gray = ggml_tensor_get_f32(image_gray, ix, iy);
gray = inverse ? 1.0f - gray : gray;
ggml_tensor_set_f32(image, gray, ix, iy);
ggml_tensor_set_f32(image, gray, ix, iy, 1);
ggml_tensor_set_f32(image, gray, ix, iy, 2);
}
}
free(img);
uint8_t* output = sd_tensor_to_image(image);
ggml_free(work_ctx);
return output;
}
#endif // __PREPROCESSING_HPP__

35
otherarch/sdcpp/rng.hpp Normal file

@ -0,0 +1,35 @@
#ifndef __RNG_H__
#define __RNG_H__
#include <random>
#include <vector>
class RNG {
public:
virtual ~RNG() = default; // virtual destructor: subclasses are used polymorphically
virtual void manual_seed(uint64_t seed) = 0;
virtual std::vector<float> randn(uint32_t n) = 0;
};
class STDDefaultRNG : public RNG {
private:
std::default_random_engine generator;
public:
void manual_seed(uint64_t seed) {
generator.seed((unsigned int)seed);
}
std::vector<float> randn(uint32_t n) {
std::vector<float> result;
float mean = 0.0;
float stddev = 1.0;
std::normal_distribution<float> distribution(mean, stddev);
for (uint32_t i = 0; i < n; i++) {
float random_number = distribution(generator);
result.push_back(random_number);
}
return result;
}
};
#endif // __RNG_H__

125
otherarch/sdcpp/rng_philox.hpp Normal file

@ -0,0 +1,125 @@
#ifndef __RNG_PHILOX_H__
#define __RNG_PHILOX_H__
#include <cmath>
#include <vector>
#include "rng.hpp"
// RNG imitating torch CUDA randn on the CPU.
// Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
class PhiloxRNG : public RNG {
private:
uint64_t seed;
uint32_t offset;
private:
std::vector<uint32_t> philox_m = {0xD2511F53, 0xCD9E8D57};
std::vector<uint32_t> philox_w = {0x9E3779B9, 0xBB67AE85};
float two_pow32_inv = 2.3283064e-10f;
float two_pow32_inv_2pi = 2.3283064e-10f * 6.2831855f;
std::vector<uint32_t> uint32(uint64_t x) {
std::vector<uint32_t> result(2);
result[0] = static_cast<uint32_t>(x & 0xFFFFFFFF);
result[1] = static_cast<uint32_t>(x >> 32);
return result;
}
std::vector<std::vector<uint32_t>> uint32(const std::vector<uint64_t>& x) {
uint32_t N = (uint32_t)x.size();
std::vector<std::vector<uint32_t>> result(2, std::vector<uint32_t>(N));
for (uint32_t i = 0; i < N; ++i) {
result[0][i] = static_cast<uint32_t>(x[i] & 0xFFFFFFFF);
result[1][i] = static_cast<uint32_t>(x[i] >> 32);
}
return result;
}
// A single round of the Philox 4x32 random number generator.
void philox4_round(std::vector<std::vector<uint32_t>>& counter,
const std::vector<std::vector<uint32_t>>& key) {
uint32_t N = (uint32_t)counter[0].size();
for (uint32_t i = 0; i < N; i++) {
std::vector<uint32_t> v1 = uint32(static_cast<uint64_t>(counter[0][i]) * static_cast<uint64_t>(philox_m[0]));
std::vector<uint32_t> v2 = uint32(static_cast<uint64_t>(counter[2][i]) * static_cast<uint64_t>(philox_m[1]));
counter[0][i] = v2[1] ^ counter[1][i] ^ key[0][i];
counter[1][i] = v2[0];
counter[2][i] = v1[1] ^ counter[3][i] ^ key[1][i];
counter[3][i] = v1[0];
}
}
// Generates 32-bit random numbers using the Philox 4x32 random number generator.
// Parameters:
// counter : A 4xN array of 32-bit integers representing the counter values (offset into generation).
// key : A 2xN array of 32-bit integers representing the key values (seed).
// rounds : The number of rounds to perform.
// Returns:
// std::vector<std::vector<uint32_t>>: A 4xN array of 32-bit integers containing the generated random numbers.
std::vector<std::vector<uint32_t>> philox4_32(std::vector<std::vector<uint32_t>>& counter,
std::vector<std::vector<uint32_t>>& key,
int rounds = 10) {
uint32_t N = (uint32_t)counter[0].size();
for (int i = 0; i < rounds - 1; ++i) {
philox4_round(counter, key);
for (uint32_t j = 0; j < N; ++j) {
key[0][j] += philox_w[0];
key[1][j] += philox_w[1];
}
}
philox4_round(counter, key);
return counter;
}
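// Box-Muller transform: maps two uniform 32-bit draws to one standard normal
// sample; u is scaled into (0, 1) and v into (0, 2*pi) first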
float box_muller(float x, float y) {
float u = x * two_pow32_inv + two_pow32_inv / 2;
float v = y * two_pow32_inv_2pi + two_pow32_inv_2pi / 2;
float s = sqrt(-2.0f * log(u));
float r1 = s * sin(v);
return r1;
}
public:
PhiloxRNG(uint64_t seed = 0) {
this->seed = seed;
this->offset = 0;
}
void manual_seed(uint64_t seed) {
this->seed = seed;
this->offset = 0;
}
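// randn: counter[0] carries the per-call offset, counter[2] the lane index, so
// each call produces a fresh batch of n samples (layout follows the referenced
// rng_philox.py port)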
std::vector<float> randn(uint32_t n) {
std::vector<std::vector<uint32_t>> counter(4, std::vector<uint32_t>(n, 0));
for (uint32_t i = 0; i < n; i++) {
counter[0][i] = this->offset;
}
for (uint32_t i = 0; i < n; i++) {
counter[2][i] = i;
}
this->offset += 1;
std::vector<uint64_t> key(n, this->seed);
std::vector<std::vector<uint32_t>> key_uint32 = uint32(key);
std::vector<std::vector<uint32_t>> g = philox4_32(counter, key_uint32);
std::vector<float> result;
for (uint32_t i = 0; i < n; ++i) {
result.push_back(box_muller((float)g[0][i], (float)g[1][i]));
}
return result;
}
};
#endif // __RNG_PHILOX_H__

File diff suppressed because it is too large

193
otherarch/sdcpp/stable-diffusion.h Normal file

@ -0,0 +1,193 @@
#ifndef __STABLE_DIFFUSION_H__
#define __STABLE_DIFFUSION_H__
#if defined(_WIN32) || defined(__CYGWIN__)
#ifndef SD_BUILD_SHARED_LIB
#define SD_API
#else
#ifdef SD_BUILD_DLL
#define SD_API __declspec(dllexport)
#else
#define SD_API __declspec(dllimport)
#endif
#endif
#else
#if __GNUC__ >= 4
#define SD_API __attribute__((visibility("default")))
#else
#define SD_API
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
enum rng_type_t {
STD_DEFAULT_RNG,
CUDA_RNG
};
enum sample_method_t {
EULER_A,
EULER,
HEUN,
DPM2,
DPMPP2S_A,
DPMPP2M,
DPMPP2Mv2,
LCM,
N_SAMPLE_METHODS
};
enum schedule_t {
DEFAULT,
DISCRETE,
KARRAS,
N_SCHEDULES
};
// same as enum ggml_type
enum sd_type_t {
SD_TYPE_F32 = 0,
SD_TYPE_F16 = 1,
SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3,
// SD_TYPE_Q4_2 = 4, support has been removed
// SD_TYPE_Q4_3 = 5, support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
// k-quantizations
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
SD_TYPE_Q5_K = 13,
SD_TYPE_Q6_K = 14,
SD_TYPE_Q8_K = 15,
SD_TYPE_IQ2_XXS = 16,
SD_TYPE_IQ2_XS = 17,
SD_TYPE_IQ3_XXS = 18,
SD_TYPE_IQ1_S = 19,
SD_TYPE_IQ4_NL = 20,
SD_TYPE_I8,
SD_TYPE_I16,
SD_TYPE_I32,
SD_TYPE_COUNT,
};
SD_API const char* sd_type_name(enum sd_type_t type);
enum sd_log_level_t {
SD_LOG_DEBUG,
SD_LOG_INFO,
SD_LOG_WARN,
SD_LOG_ERROR
};
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();
typedef struct {
uint32_t width;
uint32_t height;
uint32_t channel;
uint8_t* data;
} sd_image_t;
typedef struct sd_ctx_t sd_ctx_t;
SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
const char* vae_path,
const char* taesd_path,
const char* control_net_path_c_str,
const char* lora_model_dir,
const char* embed_dir_c_str,
bool vae_decode_only,
bool vae_tiling,
bool free_params_immediately,
int n_threads,
enum sd_type_t wtype,
enum rng_type_t rng_type,
enum schedule_t s,
bool keep_control_net_cpu);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
int64_t seed,
int batch_count,
const sd_image_t* control_cond,
float control_strength);
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed,
int batch_count);
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sd_image_t init_image,
int width,
int height,
int video_frames,
int motion_bucket_id,
int fps,
float augmentation_level,
float min_cfg,
float cfg_scale,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed);
typedef struct upscaler_ctx_t upscaler_ctx_t;
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
int n_threads,
enum sd_type_t wtype);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type);
SD_API uint8_t* preprocess_canny(uint8_t* img,
int width,
int height,
float high_threshold,
float low_threshold,
float weak,
float strong,
bool inverse);
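// Minimal txt2img flow, mirroring otherarch/sdcpp/main.cpp from this commit
// (argument values are illustrative):
//   sd_ctx_t* ctx = new_sd_ctx("model.ckpt", "", "", "", "", "",
//                              true /*vae_decode_only*/, false /*vae_tiling*/,
//                              true /*free_params_immediately*/, n_threads,
//                              SD_TYPE_COUNT, CUDA_RNG, DEFAULT, false);
//   sd_image_t* imgs = txt2img(ctx, "a lovely cat", "", -1, 7.0f, 512, 512,
//                              EULER_A, 20, 42, 1, NULL, 0.9f);
//   // ... write each imgs[i], then free(imgs[i].data), free(imgs), free_sd_ctx(ctx);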
#ifdef __cplusplus
}
#endif
#endif // __STABLE_DIFFUSION_H__

259
otherarch/sdcpp/tae.hpp Normal file

@ -0,0 +1,259 @@
#ifndef __TAE_HPP__
#define __TAE_HPP__
#include "ggml_extend.hpp"
#include "model.h"
/*
=================================== TinyAutoEncoder ===================================
References:
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
https://github.com/madebyollin/taesd/blob/main/taesd.py
*/
class TAEBlock : public UnaryBlock {
protected:
int n_in;
int n_out;
public:
TAEBlock(int n_in, int n_out)
: n_in(n_in), n_out(n_out) {
blocks["conv.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
blocks["conv.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
if (n_in != n_out) {
blocks["skip"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false));
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, n_in, h, w]
// return: [n, n_out, h, w]
auto conv_0 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.0"]);
auto conv_2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.2"]);
auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
auto h = conv_0->forward(ctx, x);
h = ggml_relu_inplace(ctx, h);
h = conv_2->forward(ctx, h);
h = ggml_relu_inplace(ctx, h);
h = conv_4->forward(ctx, h);
if (n_in != n_out) {
auto skip = std::dynamic_pointer_cast<Conv2d>(blocks["skip"]);
LOG_DEBUG("skip");
x = skip->forward(ctx, x);
}
h = ggml_add(ctx, h, x);
h = ggml_relu_inplace(ctx, h);
return h;
}
};
class TinyEncoder : public UnaryBlock {
int in_channels = 3;
int channels = 64;
int z_channels = 4;
int num_blocks = 3;
public:
TinyEncoder() {
int index = 0;
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [n, in_channels, h, w]
// return: [n, z_channels, h/8, w/8]
for (int i = 0; i < num_blocks * 3 + 6; i++) {
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
x = block->forward(ctx, x);
}
return x;
}
};
class TinyDecoder : public UnaryBlock {
int z_channels = 4;
int channels = 64;
int out_channels = 3;
int num_blocks = 3;
public:
TinyDecoder(int index = 0) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}));
index++; // nn.ReLU()
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
for (int i = 0; i < num_blocks; i++) {
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
}
index++; // nn.Upsample()
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [n, z_channels, h, w]
// return: [n, out_channels, h*8, w*8]
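// 3 * tanh(z / 3) softly clamps the latent to (-3, 3) before decoding,
// as in the referenced taesd.py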
auto h = ggml_scale(ctx, z, 1.0f / 3.0f);
h = ggml_tanh_inplace(ctx, h);
h = ggml_scale(ctx, h, 3.0f);
for (int i = 0; i < num_blocks * 3 + 10; i++) {
if (blocks.find(std::to_string(i)) == blocks.end()) {
if (i == 1) {
h = ggml_relu_inplace(ctx, h);
} else {
h = ggml_upscale(ctx, h, 2);
}
continue;
}
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
h = block->forward(ctx, h);
}
return h;
}
};
class TAESD : public GGMLBlock {
protected:
bool decode_only;
public:
TAESD(bool decode_only = true)
: decode_only(decode_only) {
blocks["decoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyDecoder());
if (!decode_only) {
blocks["encoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyEncoder());
}
}
struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
return decoder->forward(ctx, z);
}
struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
return encoder->forward(ctx, x);
}
};
struct TinyAutoEncoder : public GGMLModule {
TAESD taesd;
bool decode_only = false;
TinyAutoEncoder(ggml_backend_t backend,
ggml_type wtype,
bool decoder_only = true)
: GGMLModule(backend, wtype),
taesd(decoder_only),
decode_only(decoder_only) {
// note: taesd is declared before decode_only, so it must be constructed from
// the ctor argument directly (decode_only would still be uninitialized here)
taesd.init(params_ctx, wtype);
}
std::string get_desc() {
return "taesd";
}
size_t get_params_mem_size() {
return taesd.get_params_mem_size();
}
size_t get_params_num() {
return taesd.get_params_num();
}
bool load_from_file(const std::string& file_path) {
LOG_INFO("loading taesd from '%s'", file_path.c_str());
alloc_params_buffer();
std::map<std::string, ggml_tensor*> taesd_tensors;
taesd.get_param_tensors(taesd_tensors);
std::set<std::string> ignore_tensors;
if (decode_only) {
ignore_tensors.insert("encoder.");
}
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
return false;
}
bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors);
if (!success) {
LOG_ERROR("load tae tensors from model loader failed");
return false;
}
LOG_INFO("taesd model loaded");
return success;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z);
struct ggml_tensor* out = decode_graph ? taesd.decode(compute_ctx, z) : taesd.encode(compute_ctx, z);
ggml_build_forward_expand(gf, out);
return gf;
}
void compute(const int n_threads,
struct ggml_tensor* z,
bool decode_graph,
struct ggml_tensor** output,
struct ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(z, decode_graph);
};
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
}
};
#endif // __TAE_HPP__

2
otherarch/sdcpp/thirdparty/README.md vendored Normal file

@ -0,0 +1,2 @@
- json.hpp library from: https://github.com/nlohmann/json
- ZIP Library from: https://github.com/kuba--/zip

24596
otherarch/sdcpp/thirdparty/json.hpp vendored Normal file

File diff suppressed because it is too large

10130
otherarch/sdcpp/thirdparty/miniz.h vendored Normal file

File diff suppressed because it is too large

7987
otherarch/sdcpp/thirdparty/stb_image.h vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

1836
otherarch/sdcpp/thirdparty/zip.c vendored Normal file

File diff suppressed because it is too large

509
otherarch/sdcpp/thirdparty/zip.h vendored Normal file

@ -0,0 +1,509 @@
/*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#ifndef ZIP_H
#define ZIP_H
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#ifndef ZIP_SHARED
#define ZIP_EXPORT
#else
#ifdef _WIN32
#ifdef ZIP_BUILD_SHARED
#define ZIP_EXPORT __declspec(dllexport)
#else
#define ZIP_EXPORT __declspec(dllimport)
#endif
#else
#define ZIP_EXPORT __attribute__((visibility("default")))
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
#if !defined(_POSIX_C_SOURCE) && defined(_MSC_VER)
// 64-bit Windows is the only mainstream platform
// where sizeof(long) != sizeof(void*)
#ifdef _WIN64
typedef long long ssize_t; /* byte count or error */
#else
typedef long ssize_t; /* byte count or error */
#endif
#endif
/**
* @mainpage
*
* Documentation for @ref zip.
*/
/**
* @addtogroup zip
* @{
*/
/**
* Default zip compression level.
*/
#define ZIP_DEFAULT_COMPRESSION_LEVEL 6
/**
* Error codes
*/
#define ZIP_ENOINIT -1 // not initialized
#define ZIP_EINVENTNAME -2 // invalid entry name
#define ZIP_ENOENT -3 // entry not found
#define ZIP_EINVMODE -4 // invalid zip mode
#define ZIP_EINVLVL -5 // invalid compression level
#define ZIP_ENOSUP64 -6 // no zip 64 support
#define ZIP_EMEMSET -7 // memset error
#define ZIP_EWRTENT -8 // cannot write data to entry
#define ZIP_ETDEFLINIT -9 // cannot initialize tdefl compressor
#define ZIP_EINVIDX -10 // invalid index
#define ZIP_ENOHDR -11 // header not found
#define ZIP_ETDEFLBUF -12 // cannot flush tdefl buffer
#define ZIP_ECRTHDR -13 // cannot create entry header
#define ZIP_EWRTHDR -14 // cannot write entry header
#define ZIP_EWRTDIR -15 // cannot write to central dir
#define ZIP_EOPNFILE -16 // cannot open file
#define ZIP_EINVENTTYPE -17 // invalid entry type
#define ZIP_EMEMNOALLOC -18 // extracting data using no memory allocation
#define ZIP_ENOFILE -19 // file not found
#define ZIP_ENOPERM -20 // no permission
#define ZIP_EOOMEM -21 // out of memory
#define ZIP_EINVZIPNAME -22 // invalid zip archive name
#define ZIP_EMKDIR -23 // make dir error
#define ZIP_ESYMLINK -24 // symlink error
#define ZIP_ECLSZIP -25 // close archive error
#define ZIP_ECAPSIZE -26 // capacity size too small
#define ZIP_EFSEEK -27 // fseek error
#define ZIP_EFREAD -28 // fread error
#define ZIP_EFWRITE -29 // fwrite error
#define ZIP_ERINIT -30 // cannot initialize reader
#define ZIP_EWINIT -31 // cannot initialize writer
#define ZIP_EWRINIT -32 // cannot initialize writer from reader
/**
* Looks up the error message string corresponding to an error number.
* @param errnum error number
* @return error message string corresponding to errnum or NULL if error is not
* found.
*/
extern ZIP_EXPORT const char *zip_strerror(int errnum);
/**
* @struct zip_t
*
* This data structure is used throughout the library to represent zip archive -
* forward declaration.
*/
struct zip_t;
/**
* Opens zip archive with compression level using the given mode.
*
* @param zipname zip archive file name.
* @param level compression level (0-9 are the standard zlib-style levels).
* @param mode file access mode.
* - 'r': opens a file for reading/extracting (the file must exist).
* - 'w': creates an empty file for writing.
* - 'a': appends to an existing archive.
*
* @return the zip archive handler or NULL on error
*/
extern ZIP_EXPORT struct zip_t *zip_open(const char *zipname, int level,
char mode);
/**
* Opens zip archive with compression level using the given mode.
* The function additionally reports the error code through @param errnum.
*
* @param zipname zip archive file name.
* @param level compression level (0-9 are the standard zlib-style levels).
* @param mode file access mode.
* - 'r': opens a file for reading/extracting (the file must exist).
* - 'w': creates an empty file for writing.
* - 'a': appends to an existing archive.
* @param errnum 0 on success, negative number (< 0) on error.
*
* @return the zip archive handler or NULL on error
*/
extern ZIP_EXPORT struct zip_t *
zip_openwitherror(const char *zipname, int level, char mode, int *errnum);
/**
* Closes the zip archive, releases resources - always finalize.
*
* @param zip zip archive handler.
*/
extern ZIP_EXPORT void zip_close(struct zip_t *zip);
/**
* Determines if the archive has a zip64 end of central directory headers.
*
* @param zip zip archive handler.
*
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
* error.
*/
extern ZIP_EXPORT int zip_is64(struct zip_t *zip);
/**
* Opens an entry by name in the zip archive.
*
* For zip archive opened in 'w' or 'a' mode the function will append
* a new entry. In readonly mode the function tries to locate the entry
* in global dictionary.
*
* @param zip zip archive handler.
* @param entryname an entry name in local dictionary.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_open(struct zip_t *zip, const char *entryname);
/**
* Opens an entry by name in the zip archive.
*
* For zip archive opened in 'w' or 'a' mode the function will append
* a new entry. In readonly mode the function tries to locate the entry
* in global dictionary (case sensitive).
*
* @param zip zip archive handler.
* @param entryname an entry name in local dictionary (case sensitive).
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_opencasesensitive(struct zip_t *zip,
const char *entryname);
/**
* Opens a new entry by index in the zip archive.
*
* This function is only valid if zip archive was opened in 'r' (readonly) mode.
*
* @param zip zip archive handler.
* @param index index in local dictionary.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_openbyindex(struct zip_t *zip, size_t index);
/**
* Closes a zip entry, flushes buffer and releases resources.
*
* @param zip zip archive handler.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_close(struct zip_t *zip);
/**
* Returns a local name of the current zip entry.
*
* The main difference between user's entry name and local entry name
* is optional relative path.
* Following .ZIP File Format Specification - the path stored MUST not contain
* a drive or device letter, or a leading slash.
* All slashes MUST be forward slashes '/' as opposed to backwards slashes '\'
* for compatibility with Amiga and UNIX file systems etc.
*
* @param zip: zip archive handler.
*
* @return the pointer to the current zip entry name, or NULL on error.
*/
extern ZIP_EXPORT const char *zip_entry_name(struct zip_t *zip);
/**
* Returns an index of the current zip entry.
*
* @param zip zip archive handler.
*
* @return the index on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT ssize_t zip_entry_index(struct zip_t *zip);
/**
* Determines if the current zip entry is a directory entry.
*
* @param zip zip archive handler.
*
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
* error.
*/
extern ZIP_EXPORT int zip_entry_isdir(struct zip_t *zip);
/**
* Returns the uncompressed size of the current zip entry.
* Alias for zip_entry_uncomp_size (for backward compatibility).
*
* @param zip zip archive handler.
*
* @return the uncompressed size in bytes.
*/
extern ZIP_EXPORT unsigned long long zip_entry_size(struct zip_t *zip);
/**
* Returns the uncompressed size of the current zip entry.
*
* @param zip zip archive handler.
*
* @return the uncompressed size in bytes.
*/
extern ZIP_EXPORT unsigned long long zip_entry_uncomp_size(struct zip_t *zip);
/**
* Returns the compressed size of the current zip entry.
*
* @param zip zip archive handler.
*
* @return the compressed size in bytes.
*/
extern ZIP_EXPORT unsigned long long zip_entry_comp_size(struct zip_t *zip);
/**
* Returns CRC-32 checksum of the current zip entry.
*
* @param zip zip archive handler.
*
* @return the CRC-32 checksum.
*/
extern ZIP_EXPORT unsigned int zip_entry_crc32(struct zip_t *zip);
/**
* Compresses an input buffer for the current zip entry.
*
* @param zip zip archive handler.
* @param buf input buffer.
* @param bufsize input buffer size (in bytes).
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_write(struct zip_t *zip, const void *buf,
size_t bufsize);
/**
* Compresses a file for the current zip entry.
*
* @param zip zip archive handler.
* @param filename input file.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_fwrite(struct zip_t *zip, const char *filename);
/**
* Extracts the current zip entry into output buffer.
*
* The function allocates sufficient memory for an output buffer.
*
* @param zip zip archive handler.
* @param buf output buffer.
* @param bufsize output buffer size (in bytes).
*
* @note remember to release the memory allocated for the output buffer.
* For large entries, please take a look at the zip_entry_extract function.
*
* @return the return code - the number of bytes actually read on success.
* Otherwise a negative number (< 0) on error.
*/
extern ZIP_EXPORT ssize_t zip_entry_read(struct zip_t *zip, void **buf,
size_t *bufsize);
/**
* Extracts the current zip entry into a memory buffer using no memory
* allocation.
*
* @param zip zip archive handler.
* @param buf preallocated output buffer.
* @param bufsize output buffer size (in bytes).
*
* @note ensure the supplied output buffer is large enough.
* The zip_entry_size function (which returns the uncompressed size of the
* current entry) can be handy to estimate how big a buffer is needed.
* For large entries, please take a look at the zip_entry_extract function.
*
* @return the return code - the number of bytes actually read on success.
* Otherwise a negative number (< 0) on error (e.g. bufsize is not large
* enough).
*/
extern ZIP_EXPORT ssize_t zip_entry_noallocread(struct zip_t *zip, void *buf,
size_t bufsize);
/**
* Extracts the current zip entry into output file.
*
* @param zip zip archive handler.
* @param filename output file.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_fread(struct zip_t *zip, const char *filename);
/**
* Extracts the current zip entry using a callback function (on_extract).
*
* @param zip zip archive handler.
* @param on_extract callback function.
* @param arg opaque pointer (optional argument, which you can pass to the
* on_extract callback)
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int
zip_entry_extract(struct zip_t *zip,
size_t (*on_extract)(void *arg, uint64_t offset,
const void *data, size_t size),
void *arg);
/**
* Returns the number of all entries (files and directories) in the zip archive.
*
* @param zip zip archive handler.
*
* @return the return code - the number of entries on success, negative number
* (< 0) on error.
*/
extern ZIP_EXPORT ssize_t zip_entries_total(struct zip_t *zip);
/**
* Deletes zip archive entries.
*
* @param zip zip archive handler.
* @param entries array of zip archive entries to be deleted.
* @param len the number of entries to be deleted.
* @return the number of deleted entries, or negative number (< 0) on error.
*/
extern ZIP_EXPORT ssize_t zip_entries_delete(struct zip_t *zip,
char *const entries[], size_t len);
/**
* Extracts a zip archive stream into directory.
*
* If on_extract is not NULL, the callback will be called after each zip entry
* has been successfully extracted.
* Returning a negative value from the callback will abort extraction and
* return an error. The last argument (void *arg) is optional; you can use it
* to pass data to the on_extract callback.
*
* @param stream zip archive stream.
* @param size stream size.
* @param dir output directory.
* @param on_extract on extract callback.
* @param arg opaque pointer.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int
zip_stream_extract(const char *stream, size_t size, const char *dir,
int (*on_extract)(const char *filename, void *arg),
void *arg);
/**
* Opens zip archive stream into memory.
*
* @param stream zip archive stream.
* @param size stream size.
* @param level compression level (0-9 are the standard zlib-style levels).
* @param mode file access mode.
* - 'r': opens a file for reading/extracting (the file must exist).
* - 'w': creates an empty file for writing.
* - 'a': appends to an existing archive.
*
* @return the zip archive handler or NULL on error
*/
extern ZIP_EXPORT struct zip_t *zip_stream_open(const char *stream, size_t size,
int level, char mode);
/**
* Opens zip archive stream into memory.
* The function additionally reports an error code via errnum.
*
* @param stream zip archive stream.
* @param size stream size.
* @param level compression level (0-9 are the standard zlib-style levels).
* @param mode file access mode.
* - 'r': opens a file for reading/extracting (the file must exist).
* - 'w': creates an empty file for writing.
* - 'a': appends to an existing archive.
* @param errnum 0 on success, negative number (< 0) on error.
*
* @return the zip archive handler or NULL on error
*/
extern ZIP_EXPORT struct zip_t *zip_stream_openwitherror(const char *stream,
size_t size, int level,
char mode,
int *errnum);
/**
* Copy zip archive stream output buffer.
*
* @param zip zip archive handler.
* @param buf output buffer. User should free buf.
* @param bufsize output buffer size (in bytes).
*
* @return the copy size (in bytes).
*/
extern ZIP_EXPORT ssize_t zip_stream_copy(struct zip_t *zip, void **buf,
size_t *bufsize);
/**
* Closes zip archive and releases resources.
*
* @param zip zip archive handler.
*/
extern ZIP_EXPORT void zip_stream_close(struct zip_t *zip);
/**
* Creates a new archive and puts files into a single zip archive.
*
* @param zipname zip archive file.
* @param filenames input files.
* @param len number of input files.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_create(const char *zipname, const char *filenames[],
size_t len);
/**
* Extracts a zip archive file into directory.
*
* If on_extract_entry is not NULL, the callback will be called after each zip
* entry has been successfully extracted.
* Returning a negative value from the callback will abort extraction and
* return an error. The last argument (void *arg) is optional; you can use it
* to pass data to the on_extract_entry callback.
*
* @param zipname zip archive file.
* @param dir output directory.
* @param on_extract_entry on extract callback.
* @param arg opaque pointer.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_extract(const char *zipname, const char *dir,
int (*on_extract_entry)(const char *filename,
void *arg),
void *arg);
/** @} */
#ifdef __cplusplus
}
#endif
#endif
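A minimal read-side usage sketch of the entry API documented above (not part of the header; dump_entries, data and data_size are hypothetical caller-side names, and error handling is abbreviated):

#include <stdio.h>
#include <stdlib.h>
#include "zip.h"

/* Hypothetical caller: iterate an in-memory archive and read each file entry. */
void dump_entries(const char *data, size_t data_size) {
    struct zip_t *zip = zip_stream_open(data, data_size, 0, 'r');
    if (zip == NULL) {
        return;
    }
    ssize_t total = zip_entries_total(zip);
    for (ssize_t i = 0; i < total; ++i) {
        if (zip_entry_openbyindex(zip, (size_t)i) < 0) {
            continue;
        }
        printf("%s (%llu bytes)\n", zip_entry_name(zip), zip_entry_uncomp_size(zip));
        if (zip_entry_isdir(zip) == 0) {
            void *buf = NULL;
            size_t bufsize = 0;
            if (zip_entry_read(zip, &buf, &bufsize) >= 0) {
                /* ... use buf[0..bufsize) ... */
                free(buf); /* zip_entry_read allocates; the caller must free */
            }
        }
        zip_entry_close(zip);
    }
    zip_stream_close(zip);
}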

661
otherarch/sdcpp/unet.hpp Normal file
View file

@ -0,0 +1,661 @@
#ifndef __UNET_HPP__
#define __UNET_HPP__
#include "common.hpp"
#include "ggml_extend.hpp"
#include "model.h"
/*==================================================== UnetModel =====================================================*/
#define UNET_GRAPH_SIZE 10240
class SpatialVideoTransformer : public SpatialTransformer {
protected:
int64_t time_depth;
int64_t max_time_embed_period;
public:
SpatialVideoTransformer(int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim,
int64_t time_depth = 1,
int64_t max_time_embed_period = 10000)
: SpatialTransformer(in_channels, n_head, d_head, depth, context_dim),
time_depth(time_depth),
max_time_embed_period(max_time_embed_period) {
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
// use_spatial_context is always True
// merge_strategy is always learned_with_images
// merge_factor is loaded from weights
// time_context_dim is always None
// ff_in is always True
// disable_self_attn is always False
// disable_temporal_crossattention is always False
int64_t inner_dim = n_head * d_head;
GGML_ASSERT(depth == time_depth);
GGML_ASSERT(in_channels == inner_dim);
int64_t time_mix_d_head = d_head;
int64_t n_time_mix_heads = n_head;
int64_t time_mix_inner_dim = time_mix_d_head * n_time_mix_heads; // equal to inner_dim
int64_t time_context_dim = context_dim;
for (int i = 0; i < time_depth; i++) {
std::string name = "time_stack." + std::to_string(i);
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim,
n_time_mix_heads,
time_mix_d_head,
time_context_dim,
true));
}
int64_t time_embed_dim = in_channels * 4;
blocks["time_pos_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim));
// time_pos_embed.1 is nn.SiLU()
blocks["time_pos_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, in_channels));
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context,
int timesteps) {
// x: [N, in_channels, h, w] aka [b*t, in_channels, h, w], t == timesteps
// context: [N, max_position(aka n_context), hidden_size(aka context_dim)] aka [b*t, n_context, context_dim], t == timesteps
// t_emb: [N, in_channels] aka [b*t, in_channels]
// timesteps is num_frames
// time_context is always None
// image_only_indicator is always tensor([0.])
// transformer_options is not used
// GGML_ASSERT(ggml_n_dims(context) == 3);
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
auto time_pos_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_pos_embed.0"]);
auto time_pos_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_pos_embed.2"]);
auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
auto x_in = x;
int64_t n = x->ne[3];
int64_t h = x->ne[1];
int64_t w = x->ne[0];
int64_t inner_dim = n_head * d_head;
GGML_ASSERT(n == timesteps); // We compute cond and uncond separately, so batch_size==1
auto time_context = context; // [b*t, n_context, context_dim]
auto spatial_context = context;
// time_context_first_timestep = time_context[::timesteps]
auto time_context_first_timestep = ggml_view_3d(ctx,
time_context,
time_context->ne[0],
time_context->ne[1],
1,
time_context->nb[1],
time_context->nb[2],
0); // [b, n_context, context_dim]
time_context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32,
time_context_first_timestep->ne[0],
time_context_first_timestep->ne[1],
time_context_first_timestep->ne[2] * h * w);
time_context = ggml_repeat(ctx, time_context_first_timestep, time_context); // [b*h*w, n_context, context_dim]
x = norm->forward(ctx, x);
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
auto num_frames = ggml_arange(ctx, 0, timesteps, 1);
// since b is 1, no need to do repeat
auto t_emb = ggml_nn_timestep_embedding(ctx, num_frames, in_channels, max_time_embed_period); // [N, in_channels]
auto emb = time_pos_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx, emb);
emb = time_pos_embed_2->forward(ctx, emb); // [N, in_channels]
emb = ggml_reshape_3d(ctx, emb, emb->ne[0], 1, emb->ne[1]); // [N, 1, in_channels]
for (int i = 0; i < depth; i++) {
std::string transformer_name = "transformer_blocks." + std::to_string(i);
std::string time_stack_name = "time_stack." + std::to_string(i);
auto block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[transformer_name]);
auto mix_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[time_stack_name]);
x = block->forward(ctx, x, spatial_context); // [N, h * w, inner_dim]
// in_channels == inner_dim
auto x_mix = x;
x_mix = ggml_add(ctx, x_mix, emb); // [N, h * w, inner_dim]
int64_t N = x_mix->ne[2];
int64_t T = timesteps;
int64_t B = N / T;
int64_t S = x_mix->ne[1];
int64_t C = x_mix->ne[0];
x_mix = ggml_reshape_4d(ctx, x_mix, C, S, T, B); // (b t) s c -> b t s c
x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3)); // b t s c -> b s t c
x_mix = ggml_reshape_3d(ctx, x_mix, C, T, S * B); // b s t c -> (b s) t c
x_mix = mix_block->forward(ctx, x_mix, time_context); // [B * h * w, T, inner_dim]
x_mix = ggml_reshape_4d(ctx, x_mix, C, T, S, B); // (b s) t c -> b s t c
x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3)); // b s t c -> b t s c
x_mix = ggml_reshape_3d(ctx, x_mix, C, S, T * B); // b t s c -> (b t) s c
x = time_mixer->forward(ctx, x, x_mix); // [N, h * w, inner_dim]
}
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
// proj_out
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
x = ggml_add(ctx, x, x_in);
return x;
}
};
// ldm.modules.diffusionmodules.openaimodel.UNetModel
class UnetModelBlock : public GGMLBlock {
protected:
SDVersion version = VERSION_1_x;
// network hparams
int in_channels = 4;
int out_channels = 4;
int num_res_blocks = 2;
std::vector<int> attention_resolutions = {4, 2, 1};
std::vector<int> channel_mult = {1, 2, 4, 4};
std::vector<int> transformer_depth = {1, 1, 1, 1};
int time_embed_dim = 1280; // model_channels*4
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_2_x, 2048 for VERSION_XL
public:
int model_channels = 320;
int adm_in_channels = 2816; // only for VERSION_XL/SVD
UnetModelBlock(SDVersion version = VERSION_1_x)
: version(version) {
if (version == VERSION_2_x) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_XL) {
context_dim = 2048;
attention_resolutions = {4, 2};
channel_mult = {1, 2, 4};
transformer_depth = {1, 2, 10};
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_SVD) {
in_channels = 8;
out_channels = 4;
context_dim = 1024;
adm_in_channels = 768;
num_head_channels = 64;
num_heads = -1;
}
// dims is always 2
// use_temporal_attention is always True for SVD
blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
// time_embed_1 is nn.SiLU()
blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
if (version == VERSION_XL || version == VERSION_SVD) {
blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
// label_emb_1 is nn.SiLU()
blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
}
// input_blocks
blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));
std::vector<int> input_block_chans;
input_block_chans.push_back(model_channels);
int ch = model_channels;
int input_block_idx = 0;
int ds = 1;
auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
if (version == VERSION_SVD) {
return new VideoResBlock(channels, emb_channels, out_channels);
} else {
return new ResBlock(channels, emb_channels, out_channels);
}
};
auto get_attention_layer = [&](int64_t in_channels,
int64_t n_head,
int64_t d_head,
int64_t depth,
int64_t context_dim) -> SpatialTransformer* {
if (version == VERSION_SVD) {
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim);
} else {
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
}
};
size_t len_mults = channel_mult.size();
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
ch = mult * model_channels;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[i],
context_dim));
}
input_block_chans.push_back(ch);
}
if (i != len_mults - 1) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
input_block_chans.push_back(ch);
ds *= 2;
}
}
// middle blocks
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
n_head,
d_head,
transformer_depth[transformer_depth.size() - 1],
context_dim));
blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
// output_blocks
int output_block_idx = 0;
for (int i = (int)len_mults - 1; i >= 0; i--) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks + 1; j++) {
int ich = input_block_chans.back();
input_block_chans.pop_back();
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
ch = mult * model_channels;
int up_sample_idx = 1;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, transformer_depth[i], context_dim));
up_sample_idx++;
}
if (i > 0 && j == num_res_blocks) {
std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
blocks[name] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
ds /= 2;
}
output_block_idx += 1;
}
}
// out
blocks["out.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(ch)); // ch == model_channels
// out_1 is nn.SiLU()
blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
struct ggml_tensor* resblock_forward(std::string name,
struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb,
int num_video_frames) {
if (version == VERSION_SVD) {
auto block = std::dynamic_pointer_cast<VideoResBlock>(blocks[name]);
return block->forward(ctx, x, emb, num_video_frames);
} else {
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
return block->forward(ctx, x, emb);
}
}
struct ggml_tensor* attention_layer_forward(std::string name,
struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context,
int timesteps) {
if (version == VERSION_SVD) {
auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]);
return block->forward(ctx, x, context, timesteps);
} else {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
return block->forward(ctx, x, context);
}
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat = NULL,
struct ggml_tensor* y = NULL,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
// c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
// y: [N, adm_in_channels] or [1, adm_in_channels]
// return: [N, out_channels, h, w]
if (context != NULL) {
if (context->ne[2] != x->ne[3]) {
context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
}
}
if (c_concat != NULL) {
if (c_concat->ne[3] != x->ne[3]) {
c_concat = ggml_repeat(ctx, c_concat, x);
}
x = ggml_concat(ctx, x, c_concat);
}
if (y != NULL) {
if (y->ne[1] != x->ne[3]) {
y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
}
}
auto time_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
auto time_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out.0"]);
auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);
auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb);
emb = ggml_silu_inplace(ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD
if (y != NULL) {
auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y);
label_emb = ggml_silu_inplace(ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim]
}
// input_blocks
std::vector<struct ggml_tensor*> hs;
// input block 0
auto h = input_blocks_0_0->forward(ctx, x);
ggml_set_name(h, "bench-start");
hs.push_back(h);
// input block 1-11
size_t len_mults = channel_mult.size();
int input_block_idx = 0;
int ds = 1;
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
h = resblock_forward(name, ctx, h, emb, num_video_frames); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
h = attention_layer_forward(name, ctx, h, context, num_video_frames); // [N, mult*model_channels, h, w]
}
hs.push_back(h);
}
if (i != len_mults - 1) {
ds *= 2;
input_block_idx += 1;
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
h = block->forward(ctx, h); // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
hs.push_back(h);
}
}
// [N, 4*model_channels, h/8, w/8]
// middle_block
h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8]
h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames); // [N, 4*model_channels, h/8, w/8]
h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames); // [N, 4*model_channels, h/8, w/8]
if (controls.size() > 0) {
auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
h = ggml_add(ctx, h, cs); // middle control
}
int control_offset = controls.size() - 2;
// output_blocks
int output_block_idx = 0;
for (int i = (int)len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
auto h_skip = hs.back();
hs.pop_back();
if (controls.size() > 0) {
auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
h_skip = ggml_add(ctx, h_skip, cs); // control net condition
control_offset--;
}
h = ggml_concat(ctx, h, h_skip);
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
h = resblock_forward(name, ctx, h, emb, num_video_frames);
int up_sample_idx = 1;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
h = attention_layer_forward(name, ctx, h, context, num_video_frames);
up_sample_idx++;
}
if (i > 0 && j == num_res_blocks) {
std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
auto block = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
h = block->forward(ctx, h);
ds /= 2;
}
output_block_idx += 1;
}
}
// out
h = out_0->forward(ctx, h);
h = ggml_silu_inplace(ctx, h);
h = out_2->forward(ctx, h);
ggml_set_name(h, "bench-end");
return h; // [N, out_channels, h, w]
}
};
struct UNetModel : public GGMLModule {
SDVersion version = VERSION_1_x;
UnetModelBlock unet;
UNetModel(ggml_backend_t backend,
ggml_type wtype,
SDVersion version = VERSION_1_x)
: GGMLModule(backend, wtype), unet(version) {
unet.init(params_ctx, wtype);
}
std::string get_desc() {
return "unet";
}
size_t get_params_mem_size() {
return unet.get_params_mem_size();
}
size_t get_params_num() {
return unet.get_params_num();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
unet.get_param_tensors(tensors, prefix);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat = NULL,
struct ggml_tensor* y = NULL,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, UNET_GRAPH_SIZE, false);
if (num_video_frames == -1) {
num_video_frames = x->ne[3];
}
x = to_backend(x);
context = to_backend(context);
y = to_backend(y);
timesteps = to_backend(timesteps);
for (int i = 0; i < controls.size(); i++) {
controls[i] = to_backend(controls[i]);
}
struct ggml_tensor* out = unet.forward(compute_ctx,
x,
timesteps,
context,
c_concat,
y,
num_video_frames,
controls,
control_strength);
ggml_build_forward_expand(gf, out);
return gf;
}
void compute(int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* c_concat,
struct ggml_tensor* y,
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
// c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
// y: [N, adm_in_channels] or [1, adm_in_channels]
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
};
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
}
void test() {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != NULL);
{
// CPU, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass
// CUDA, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass
// CPU, num_video_frames = 3, x{num_video_frames, 8, 8, 8}: Wrong result
// CUDA, num_video_frames = 3, x{num_video_frames, 8, 8, 8}: nan
int num_video_frames = 3;
auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 8, num_video_frames);
std::vector<float> timesteps_vec(num_video_frames, 999.f);
auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
ggml_set_f32(x, 0.5f);
// print_ggml_tensor(x);
auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 1024, 1, num_video_frames);
ggml_set_f32(context, 0.5f);
// print_ggml_tensor(context);
auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, num_video_frames);
ggml_set_f32(y, 0.5f);
// print_ggml_tensor(y);
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
compute(8, x, timesteps, context, NULL, y, num_video_frames, {}, 0.f, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("unet test done in %dms", t1 - t0);
}
};
};
#endif // __UNET_HPP__

123
otherarch/sdcpp/upscaler.cpp Normal file
View file

@ -0,0 +1,123 @@
#include "esrgan.hpp"
#include "ggml_extend.hpp"
#include "model.h"
#include "stable-diffusion.h"
struct UpscalerGGML {
ggml_backend_t backend = NULL; // general backend
ggml_type model_data_type = GGML_TYPE_F16;
std::shared_ptr<ESRGAN> esrgan_upscaler;
std::string esrgan_path;
int n_threads;
UpscalerGGML(int n_threads)
: n_threads(n_threads) {
}
bool load_from_file(const std::string& esrgan_path) {
#ifdef SD_USE_CUBLAS
LOG_DEBUG("Using CUDA backend");
backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
LOG_DEBUG("Using Metal backend");
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
backend = ggml_backend_metal_init();
#endif
if (!backend) {
LOG_DEBUG("Using CPU backend");
backend = ggml_backend_cpu_init();
}
LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_data_type);
if (!esrgan_upscaler->load_from_file(esrgan_path)) {
return false;
}
return true;
}
sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) {
// upscale_factor is unused; the output scale comes from the loaded model (e.g. RealESRGAN_x4plus_anime_6B.pth)
sd_image_t upscaled_image = {0, 0, 0, NULL};
int output_width = (int)input_image.width * esrgan_upscaler->scale;
int output_height = (int)input_image.height * esrgan_upscaler->scale;
LOG_INFO("upscaling from (%i x %i) to (%i x %i)",
input_image.width, input_image.height, output_width, output_height);
struct ggml_init_params params;
params.mem_size = output_width * output_height * 3 * sizeof(float) * 2;
params.mem_size += 2 * ggml_tensor_overhead();
params.mem_buffer = NULL;
params.no_alloc = false;
// draft context
struct ggml_context* upscale_ctx = ggml_init(params);
if (!upscale_ctx) {
LOG_ERROR("ggml_init() failed");
return upscaled_image;
}
LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1);
sd_image_to_tensor(input_image.data, input_image_tensor);
ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
esrgan_upscaler->compute(n_threads, in, &out);
};
int64_t t0 = ggml_time_ms();
sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);
esrgan_upscaler->free_compute_buffer();
ggml_tensor_clamp(upscaled, 0.f, 1.f);
uint8_t* upscaled_data = sd_tensor_to_image(upscaled);
ggml_free(upscale_ctx);
int64_t t3 = ggml_time_ms();
LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f);
upscaled_image = {
(uint32_t)output_width,
(uint32_t)output_height,
3,
upscaled_data,
};
return upscaled_image;
}
};
struct upscaler_ctx_t {
UpscalerGGML* upscaler = NULL;
};
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
int n_threads,
enum sd_type_t wtype) {
upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
if (upscaler_ctx == NULL) {
return NULL;
}
std::string esrgan_path(esrgan_path_c_str);
upscaler_ctx->upscaler = new UpscalerGGML(n_threads);
if (upscaler_ctx->upscaler == NULL) {
return NULL;
}
if (!upscaler_ctx->upscaler->load_from_file(esrgan_path)) {
delete upscaler_ctx->upscaler;
upscaler_ctx->upscaler = NULL;
free(upscaler_ctx);
return NULL;
}
return upscaler_ctx;
}
sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor) {
return upscaler_ctx->upscaler->upscale(input_image, upscale_factor);
}
void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx) {
if (upscaler_ctx->upscaler != NULL) {
delete upscaler_ctx->upscaler;
upscaler_ctx->upscaler = NULL;
}
free(upscaler_ctx);
}
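A minimal caller-side sketch of the C API defined above ("model.pth" is a placeholder path, and SD_TYPE_F16 is assumed to be declared in stable-diffusion.h). Note that this implementation effectively ignores both wtype and upscale_factor; the output scale comes from the loaded ESRGAN model:

// Hypothetical caller: load an ESRGAN model, upscale one image, free the context.
sd_image_t upscale_once(sd_image_t input) {
    sd_image_t out = {0, 0, 0, NULL};
    upscaler_ctx_t* ctx = new_upscaler_ctx("model.pth", /*n_threads*/ 8, SD_TYPE_F16);
    if (ctx == NULL) {
        return out; // model failed to load
    }
    out = upscale(ctx, input, 4); // the factor argument is ignored here
    free_upscaler_ctx(ctx);
    return out; // caller owns out.data
}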

463
otherarch/sdcpp/util.cpp Normal file
View file

@ -0,0 +1,463 @@
#include "util.h"
#include <stdarg.h>
#include <algorithm>
#include <cmath>
#include <codecvt>
#include <fstream>
#include <locale>
#include <sstream>
#include <string>
#include <thread>
#include <unordered_set>
#include <vector>
#include "preprocessing.hpp"
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/sysctl.h>
#include <sys/types.h>
#endif
#if !defined(_WIN32)
#include <sys/ioctl.h>
#include <unistd.h>
#endif
#include "ggml/ggml.h"
#include "stable-diffusion.h"
bool ends_with(const std::string& str, const std::string& ending) {
if (str.length() >= ending.length()) {
return (str.compare(str.length() - ending.length(), ending.length(), ending) == 0);
} else {
return false;
}
}
bool starts_with(const std::string& str, const std::string& start) {
if (str.find(start) == 0) {
return true;
}
return false;
}
void replace_all_chars(std::string& str, char target, char replacement) {
for (size_t i = 0; i < str.length(); ++i) {
if (str[i] == target) {
str[i] = replacement;
}
}
}
std::string format(const char* fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), size);
}
#ifdef _WIN32 // code for windows
#include <windows.h>
bool file_exists(const std::string& filename) {
DWORD attributes = GetFileAttributesA(filename.c_str());
return (attributes != INVALID_FILE_ATTRIBUTES && !(attributes & FILE_ATTRIBUTE_DIRECTORY));
}
bool is_directory(const std::string& path) {
DWORD attributes = GetFileAttributesA(path.c_str());
return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
}
std::string get_full_path(const std::string& dir, const std::string& filename) {
std::string full_path = dir + "\\" + filename;
WIN32_FIND_DATA find_file_data;
HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);
if (hFind != INVALID_HANDLE_VALUE) {
FindClose(hFind);
return full_path;
} else {
return "";
}
}
#else // Unix
#include <dirent.h>
#include <sys/stat.h>
bool file_exists(const std::string& filename) {
struct stat buffer;
return (stat(filename.c_str(), &buffer) == 0 && S_ISREG(buffer.st_mode));
}
bool is_directory(const std::string& path) {
struct stat buffer;
return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
}
std::string get_full_path(const std::string& dir, const std::string& filename) {
DIR* dp = opendir(dir.c_str());
if (dp != nullptr) {
struct dirent* entry;
while ((entry = readdir(dp)) != nullptr) {
if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
closedir(dp);
return dir + "/" + entry->d_name;
}
}
closedir(dp);
}
return "";
}
#endif
// get_num_physical_cores is copied from
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
}
std::string line;
if (std::getline(thread_siblings, line)) {
siblings.insert(line);
}
}
if (siblings.size() > 0) {
return static_cast<int32_t>(siblings.size());
}
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32)
// TODO: Implement
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
std::u32string utf8_to_utf32(const std::string& utf8_str) {
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
return converter.from_bytes(utf8_str);
}
std::string utf32_to_utf8(const std::u32string& utf32_str) {
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
return converter.to_bytes(utf32_str);
}
std::u32string unicode_value_to_utf32(int unicode_value) {
std::u32string utf32_string = {static_cast<char32_t>(unicode_value)};
return utf32_string;
}
static std::string sd_basename(const std::string& path) {
size_t pos = path.find_last_of('/');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
pos = path.find_last_of('\\');
if (pos != std::string::npos) {
return path.substr(pos + 1);
}
return path;
}
std::string path_join(const std::string& p1, const std::string& p2) {
if (p1.empty()) {
return p2;
}
if (p2.empty()) {
return p1;
}
if (p1[p1.length() - 1] == '/' || p1[p1.length() - 1] == '\\') {
return p1 + p2;
}
return p1 + "/" + p2;
}
void pretty_progress(int step, int steps, float time) {
if (step == 0) {
return;
}
std::string progress = " |";
int max_progress = 50;
int32_t current = (int32_t)(step * 1.f * max_progress / steps);
for (int i = 0; i < max_progress; i++) {
if (i > current) {
progress += " ";
} else if (i == current && i != max_progress - 1) {
progress += ">";
} else {
progress += "=";
}
}
progress += "|";
printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s",
progress.c_str(), step, steps,
time > 1.0f || time == 0 ? time : (1.0f / time));
fflush(stdout); // for linux
if (step == steps) {
printf("\n");
}
}
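// Illustrative sketch (hypothetical caller code): drive the progress bar for a
// 20-step loop. The third argument is seconds per iteration; values above 1.0f
// are printed as s/it, smaller non-zero values are inverted and shown as it/s.
static void pretty_progress_demo() {
    for (int step = 1; step <= 20; step++) {
        pretty_progress(step, 20, 0.35f); // ~2.86 it/s at 0.35s per step
    }
}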
std::string ltrim(const std::string& s) {
auto it = std::find_if(s.begin(), s.end(), [](int ch) {
return !std::isspace(ch);
});
return std::string(it, s.end());
}
std::string rtrim(const std::string& s) {
auto it = std::find_if(s.rbegin(), s.rend(), [](int ch) {
return !std::isspace(ch);
});
return std::string(s.begin(), it.base());
}
std::string trim(const std::string& s) {
return rtrim(ltrim(s));
}
static sd_log_cb_t sd_log_cb = NULL;
void* sd_log_cb_data = NULL;
#define LOG_BUFFER_SIZE 1024
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...) {
va_list args;
va_start(args, format);
const char* level_str = "DEBUG";
if (level == SD_LOG_INFO) {
level_str = "INFO ";
} else if (level == SD_LOG_WARN) {
level_str = "WARN ";
} else if (level == SD_LOG_ERROR) {
level_str = "ERROR";
}
static char log_buffer[LOG_BUFFER_SIZE];
int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "[%s] %s:%-4d - ", level_str, sd_basename(file).c_str(), line);
if (written >= 0 && written < LOG_BUFFER_SIZE) {
vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer) - 1);
}
if (sd_log_cb) {
sd_log_cb(level, log_buffer, sd_log_cb_data);
}
va_end(args);
}
void sd_set_log_callback(sd_log_cb_t cb, void* data) {
sd_log_cb = cb;
sd_log_cb_data = data;
}
const char* sd_get_system_info() {
static char buffer[1024];
std::stringstream ss;
ss << "System Info: \n";
ss << " BLAS = " << ggml_cpu_has_blas() << std::endl;
ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl;
ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
ss << " FMA = " << ggml_cpu_has_fma() << std::endl;
ss << " NEON = " << ggml_cpu_has_neon() << std::endl;
ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
ss << " F16C = " << ggml_cpu_has_f16c() << std::endl;
ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
ss << " VSX = " << ggml_cpu_has_vsx() << std::endl;
snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
return buffer;
}
const char* sd_type_name(enum sd_type_t type) {
return ggml_type_name((ggml_type)type);
}
sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
sd_image_f32_t converted_image;
converted_image.width = image.width;
converted_image.height = image.height;
converted_image.channel = image.channel;
// Allocate memory for float data
converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float));
for (int i = 0; i < image.width * image.height * image.channel; i++) {
// Convert uint8_t to float
converted_image.data[i] = (float)image.data[i];
}
return converted_image;
}
// Performs bilinear interpolation between four corner values (v1..v4) using x/y ratios
float interpolate(float v1, float v2, float v3, float v4, float x_ratio, float y_ratio) {
return v1 * (1 - x_ratio) * (1 - y_ratio) + v2 * x_ratio * (1 - y_ratio) + v3 * (1 - x_ratio) * y_ratio + v4 * x_ratio * y_ratio;
}
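// Worked example (illustrative only): sampling exactly halfway between four
// corner values weights them equally:
//   interpolate(10, 20, 30, 40, 0.5f, 0.5f)
//     = 10*0.25 + 20*0.25 + 30*0.25 + 40*0.25 = 25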
sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height) {
sd_image_f32_t resized_image;
resized_image.width = target_width;
resized_image.height = target_height;
resized_image.channel = image.channel;
// Allocate memory for resized float data
resized_image.data = (float*)malloc(target_width * target_height * image.channel * sizeof(float));
for (int y = 0; y < target_height; y++) {
for (int x = 0; x < target_width; x++) {
float original_x = (float)x * image.width / target_width;
float original_y = (float)y * image.height / target_height;
int x1 = (int)original_x;
int y1 = (int)original_y;
int x2 = x1 + 1;
int y2 = y1 + 1;
for (int k = 0; k < image.channel; k++) {
float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);
float x_ratio = original_x - x1;
float y_ratio = original_y - y1;
float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
*(resized_image.data + y * target_width * image.channel + x * image.channel + k) = value;
}
}
}
return resized_image;
}
void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) {
for (int y = 0; y < image.height; y++) {
for (int x = 0; x < image.width; x++) {
for (int k = 0; k < image.channel; k++) {
int index = (y * image.width + x) * image.channel + k;
image.data[index] = (image.data[index] - means[k]) / stds[k];
}
}
}
}
// Per-channel means and stds used for CLIP image normalization
float means[3] = {0.48145466, 0.4578275, 0.40821073};
float stds[3] = {0.26862954, 0.26130258, 0.27577711};
// Resizes, center-crops, rescales to [0, 1] and normalizes an sd_image_f32_t for CLIP
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
float scale = (float)size / fmin(image.width, image.height);
// Interpolation
int new_width = (int)(scale * image.width);
int new_height = (int)(scale * image.height);
float* resized_data = (float*)malloc(new_width * new_height * image.channel * sizeof(float));
for (int y = 0; y < new_height; y++) {
for (int x = 0; x < new_width; x++) {
float original_x = (float)x * image.width / new_width;
float original_y = (float)y * image.height / new_height;
int x1 = (int)original_x;
int y1 = (int)original_y;
int x2 = x1 + 1;
int y2 = y1 + 1;
for (int k = 0; k < image.channel; k++) {
float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);
float x_ratio = original_x - x1;
float y_ratio = original_y - y1;
float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);
*(resized_data + y * new_width * image.channel + x * image.channel + k) = value;
}
}
}
// Clip and preprocess
int h = (new_height - size) / 2;
int w = (new_width - size) / 2;
sd_image_f32_t result;
result.width = size;
result.height = size;
result.channel = image.channel;
result.data = (float*)malloc(size * size * image.channel * sizeof(float));
for (int k = 0; k < image.channel; k++) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
*(result.data + i * size * image.channel + j * image.channel + k) =
fmin(fmax(*(resized_data + (i + h) * new_width * image.channel + (j + w) * image.channel + k), 0.0f), 255.0f) / 255.0f;
}
}
}
// Free allocated memory
free(resized_data);
// Normalize
for (int k = 0; k < image.channel; k++) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
// *(result.data + i * size * image.channel + j * image.channel + k) = 0.5f;
int offset = i * size * image.channel + j * image.channel + k;
float value = *(result.data + offset);
value = (value - means[k]) / stds[k];
// value = 0.5f;
*(result.data + offset) = value;
}
}
}
return result;
}
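The conversion and preprocessing helpers above are typically chained; a minimal sketch (hypothetical caller code, assuming the usual 224x224 CLIP input size):

// Hypothetical caller: 8-bit image -> float image -> CLIP-ready 224x224 input.
sd_image_f32_t preprocess_for_clip(sd_image_t image) {
    sd_image_f32_t image_f32 = sd_image_t_to_sd_image_f32_t(image);
    sd_image_f32_t clip_input = clip_preprocess(image_f32, 224);
    free(image_f32.data); // clip_preprocess copies, so the intermediate can be freed
    return clip_input;    // caller frees clip_input.data when done
}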

51
otherarch/sdcpp/util.h Normal file
View file

@ -0,0 +1,51 @@
#ifndef __UTIL_H__
#define __UTIL_H__
#include <cstdint>
#include <string>
#include "stable-diffusion.h"
bool ends_with(const std::string& str, const std::string& ending);
bool starts_with(const std::string& str, const std::string& start);
std::string format(const char* fmt, ...);
void replace_all_chars(std::string& str, char target, char replacement);
bool file_exists(const std::string& filename);
bool is_directory(const std::string& path);
std::string get_full_path(const std::string& dir, const std::string& filename);
std::u32string utf8_to_utf32(const std::string& utf8_str);
std::string utf32_to_utf8(const std::u32string& utf32_str);
std::u32string unicode_value_to_utf32(int unicode_value);
typedef struct {
uint32_t width;
uint32_t height;
uint32_t channel;
float* data;
} sd_image_f32_t;
void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]);
sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image);
sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height);
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size);
std::string path_join(const std::string& p1, const std::string& p2);
void pretty_progress(int step, int steps, float time);
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);
std::string trim(const std::string& s);
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
#endif // __UTIL_H__
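A minimal sketch of wiring up the logging declared above (hypothetical caller code; assumes the sd_log_cb_t callback type in stable-diffusion.h takes (level, text, user data), matching how log_printf invokes it):

#include <cstdio>
#include "util.h"

// Hypothetical log sink: forward sdcpp log lines to stderr.
static void log_to_stderr(sd_log_level_t level, const char* text, void* data) {
    (void)level;
    (void)data;
    fputs(text, stderr); // log_printf already appends the trailing newline
}

void setup_logging() {
    sd_set_log_callback(log_to_stderr, NULL);
    LOG_INFO("logging initialized with %d threads", 8);
}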

613
otherarch/sdcpp/vae.hpp Normal file
View file

@ -0,0 +1,613 @@
#ifndef __VAE_HPP__
#define __VAE_HPP__
#include "common.hpp"
#include "ggml_extend.hpp"
/*================================================== AutoEncoderKL ===================================================*/
#define VAE_GRAPH_SIZE 10240
class ResnetBlock : public UnaryBlock {
protected:
int64_t in_channels;
int64_t out_channels;
public:
ResnetBlock(int64_t in_channels,
int64_t out_channels)
: in_channels(in_channels),
out_channels(out_channels) {
// temb_channels is always 0
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
if (out_channels != in_channels) {
blocks["nin_shortcut"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {1, 1}));
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
// t_emb is always None
auto norm1 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm1"]);
auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
auto norm2 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm2"]);
auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
auto h = x;
h = norm1->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); // swish
h = conv1->forward(ctx, h);
// return h;
h = norm2->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); // swish
// dropout, skip for inference
h = conv2->forward(ctx, h);
// skip connection
if (out_channels != in_channels) {
auto nin_shortcut = std::dynamic_pointer_cast<Conv2d>(blocks["nin_shortcut"]);
x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w]
}
h = ggml_add(ctx, h, x);
return h; // [N, out_channels, h, w]
}
};
class AttnBlock : public UnaryBlock {
protected:
int64_t in_channels;
public:
AttnBlock(int64_t in_channels)
: in_channels(in_channels) {
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
blocks["q"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
blocks["k"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
blocks["v"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
auto q_proj = std::dynamic_pointer_cast<Conv2d>(blocks["q"]);
auto k_proj = std::dynamic_pointer_cast<Conv2d>(blocks["k"]);
auto v_proj = std::dynamic_pointer_cast<Conv2d>(blocks["v"]);
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
auto h_ = norm->forward(ctx, x);
const int64_t n = h_->ne[3];
const int64_t c = h_->ne[2];
const int64_t h = h_->ne[1];
const int64_t w = h_->ne[0];
auto q = q_proj->forward(ctx, h_); // [N, in_channels, h, w]
q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels]
q = ggml_reshape_3d(ctx, q, c, h * w, n); // [N, h * w, in_channels]
auto k = k_proj->forward(ctx, h_); // [N, in_channels, h, w]
k = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels]
k = ggml_reshape_3d(ctx, k, c, h * w, n); // [N, h * w, in_channels]
auto v = v_proj->forward(ctx, h_); // [N, in_channels, h, w]
v = ggml_reshape_3d(ctx, v, h * w, c, n); // [N, in_channels, h * w]
h_ = ggml_nn_attention(ctx, q, k, v, false); // [N, h * w, in_channels]
h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
h_ = ggml_reshape_4d(ctx, h_, w, h, c, n); // [N, in_channels, h, w]
h_ = proj_out->forward(ctx, h_); // [N, in_channels, h, w]
h_ = ggml_add(ctx, h_, x);
return h_;
}
};
class AE3DConv : public Conv2d {
public:
AE3DConv(int64_t in_channels,
int64_t out_channels,
std::pair<int, int> kernel_size,
int64_t video_kernel_size = 3,
std::pair<int, int> stride = {1, 1},
std::pair<int, int> padding = {0, 0},
std::pair<int, int> dilation = {1, 1},
bool bias = true)
: Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
int64_t kernel_padding = video_kernel_size / 2;
blocks["time_mix_conv"] = std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(out_channels,
out_channels,
video_kernel_size,
1,
kernel_padding));
}
struct ggml_tensor* forward(struct ggml_context* ctx,
struct ggml_tensor* x) {
// timesteps always None
// skip_video always False
// x: [N, IC, IH, IW]
// result: [N, OC, OH, OW]
auto time_mix_conv = std::dynamic_pointer_cast<Conv3dnx1x1>(blocks["time_mix_conv"]);
x = Conv2d::forward(ctx, x);
// timesteps = x.shape[0]
// x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
// x = conv3d(x)
// return rearrange(x, "b c t h w -> (b t) c h w")
int64_t T = x->ne[3];
int64_t B = x->ne[3] / T;
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
x = time_mix_conv->forward(ctx, x); // [B, OC, T, OH * OW]
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x; // [B*T, OC, OH, OW]
}
};
class VideoResnetBlock : public ResnetBlock {
protected:
void init_params(struct ggml_context* ctx, ggml_type wtype) {
params["mix_factor"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
}
float get_alpha() {
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha);
}
public:
VideoResnetBlock(int64_t in_channels,
int64_t out_channels,
int video_kernel_size = 3)
: ResnetBlock(in_channels, out_channels) {
// merge_strategy is always learned
blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w] aka [b*t, in_channels, h, w]
// return: [N, out_channels, h, w] aka [b*t, out_channels, h, w]
// t_emb is always None
// skip_video is always False
// timesteps is always None
auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
x = ResnetBlock::forward(ctx, x); // [N, out_channels, h, w]
// return x;
int64_t T = x->ne[3];
int64_t B = x->ne[3] / T;
int64_t C = x->ne[2];
int64_t H = x->ne[1];
int64_t W = x->ne[0];
x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x;
x = time_stack->forward(ctx, x); // b t c (h w)
float alpha = get_alpha();
x = ggml_add(ctx,
ggml_scale(ctx, x, alpha),
ggml_scale(ctx, x_mix, 1.0f - alpha));
x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x;
}
};
// ldm.modules.diffusionmodules.model.Encoder
class Encoder : public GGMLBlock {
protected:
int ch = 128;
std::vector<int> ch_mult = {1, 2, 4, 4};
int num_res_blocks = 2;
int in_channels = 3;
int z_channels = 4;
bool double_z = true;
public:
Encoder(int ch,
std::vector<int> ch_mult,
int num_res_blocks,
int in_channels,
int z_channels,
bool double_z = true)
: ch(ch),
ch_mult(ch_mult),
num_res_blocks(num_res_blocks),
in_channels(in_channels),
z_channels(z_channels),
double_z(double_z) {
blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}));
size_t num_resolutions = ch_mult.size();
int block_in = 1;
for (int i = 0; i < num_resolutions; i++) {
if (i == 0) {
block_in = ch;
} else {
block_in = ch * ch_mult[i - 1];
}
int block_out = ch * ch_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
blocks[name] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_out));
block_in = block_out;
}
if (i != num_resolutions - 1) {
std::string name = "down." + std::to_string(i) + ".downsample";
blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(block_in, block_in, true));
}
}
blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
blocks["mid.attn_1"] = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
}
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto conv_in = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
auto mid_block_1 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_1"]);
auto mid_attn_1 = std::dynamic_pointer_cast<AttnBlock>(blocks["mid.attn_1"]);
auto mid_block_2 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_2"]);
auto norm_out = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm_out"]);
auto conv_out = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);
auto h = conv_in->forward(ctx, x); // [N, ch, h, w]
// downsampling
size_t num_resolutions = ch_mult.size();
for (int i = 0; i < num_resolutions; i++) {
for (int j = 0; j < num_res_blocks; j++) {
std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
auto down_block = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);
h = down_block->forward(ctx, h);
}
if (i != num_resolutions - 1) {
std::string name = "down." + std::to_string(i) + ".downsample";
auto down_sample = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
h = down_sample->forward(ctx, h);
}
}
// middle
h = mid_block_1->forward(ctx, h);
h = mid_attn_1->forward(ctx, h);
h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
// end
h = norm_out->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); // nonlinearity/swish
h = conv_out->forward(ctx, h); // [N, z_channels*2, h, w]
return h;
}
};
// ldm.modules.diffusionmodules.model.Decoder
class Decoder : public GGMLBlock {
protected:
int ch = 128;
int out_ch = 3;
std::vector<int> ch_mult = {1, 2, 4, 4};
int num_res_blocks = 2;
int z_channels = 4;
bool video_decoder = false;
int video_kernel_size = 3;
virtual std::shared_ptr<GGMLBlock> get_conv_out(int64_t in_channels,
int64_t out_channels,
std::pair<int, int> kernel_size,
std::pair<int, int> stride = {1, 1},
std::pair<int, int> padding = {0, 0}) {
if (video_decoder) {
return std::shared_ptr<GGMLBlock>(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding));
} else {
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, stride, padding));
}
}
virtual std::shared_ptr<GGMLBlock> get_resnet_block(int64_t in_channels,
int64_t out_channels) {
if (video_decoder) {
return std::shared_ptr<GGMLBlock>(new VideoResnetBlock(in_channels, out_channels, video_kernel_size));
} else {
return std::shared_ptr<GGMLBlock>(new ResnetBlock(in_channels, out_channels));
}
}
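// forward() below casts these blocks back to ResnetBlock/Conv2d via dynamic_pointer_cast;
// this works for the video decoder too, assuming (as in the upstream sd.cpp sources) that
// VideoResnetBlock derives from ResnetBlock and AE3DConv derives from Conv2d.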
public:
Decoder(int ch,
int out_ch,
std::vector<int> ch_mult,
int num_res_blocks,
int z_channels,
bool video_decoder = false,
int video_kernel_size = 3)
: ch(ch),
out_ch(out_ch),
ch_mult(ch_mult),
num_res_blocks(num_res_blocks),
z_channels(z_channels),
video_decoder(video_decoder),
video_kernel_size(video_kernel_size) {
int num_resolutions = (int)ch_mult.size();
int block_in = ch * ch_mult[num_resolutions - 1];
blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));
blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
blocks["mid.attn_1"] = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
blocks["mid.block_2"] = get_resnet_block(block_in, block_in);
for (int i = num_resolutions - 1; i >= 0; i--) {
int mult = ch_mult[i];
int block_out = ch * mult;
for (int j = 0; j < num_res_blocks + 1; j++) {
std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
blocks[name] = get_resnet_block(block_in, block_out);
block_in = block_out;
}
if (i != 0) {
std::string name = "up." + std::to_string(i) + ".upsample";
blocks[name] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(block_in, block_in));
}
}
blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
}
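// Worked example with the defaults: block_in starts at ch * ch_mult.back() = 512; the levels
// run in reverse, emitting 512, 512, 256 and 128 channels, with a 2x upsample after every
// level except level 0, so an 8x8 latent grows back to a 64x64 image (x8, mirroring Encoder).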
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, z_channels, h, w]
// alpha is always 0
// merge_strategy is always learned
// time_mode is always conv-only, so conv_out_op/resnet_op are replaced with AE3DConv/VideoResnetBlock
// AttnVideoBlock will not be used
auto conv_in = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
auto mid_block_1 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_1"]);
auto mid_attn_1 = std::dynamic_pointer_cast<AttnBlock>(blocks["mid.attn_1"]);
auto mid_block_2 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_2"]);
auto norm_out = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm_out"]);
auto conv_out = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);
// conv_in
auto h = conv_in->forward(ctx, z); // [N, block_in, h, w]
// middle
h = mid_block_1->forward(ctx, h);
h = mid_attn_1->forward(ctx, h);
h = mid_block_2->forward(ctx, h); // [N, block_in, h, w]
// upsampling
int num_resolutions = (int)ch_mult.size();
for (int i = num_resolutions - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
auto up_block = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);
h = up_block->forward(ctx, h);
}
if (i != 0) {
std::string name = "up." + std::to_string(i) + ".upsample";
auto up_sample = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
h = up_sample->forward(ctx, h);
}
}
h = norm_out->forward(ctx, h);
h = ggml_silu_inplace(ctx, h); // nonlinearity/swish
h = conv_out->forward(ctx, h); // [N, out_ch, h*8, w*8]
return h;
}
};
// ldm.models.autoencoder.AutoencoderKL
class AutoencodingEngine : public GGMLBlock {
protected:
bool decode_only = true;
bool use_video_decoder = false;
int embed_dim = 4;
struct {
int z_channels = 4;
int resolution = 256;
int in_channels = 3;
int out_ch = 3;
int ch = 128;
std::vector<int> ch_mult = {1, 2, 4, 4};
int num_res_blocks = 2;
bool double_z = true;
} dd_config;
public:
AutoencodingEngine(bool decode_only = true,
bool use_video_decoder = false)
: decode_only(decode_only), use_video_decoder(use_video_decoder) {
blocks["decoder"] = std::shared_ptr<GGMLBlock>(new Decoder(dd_config.ch,
dd_config.out_ch,
dd_config.ch_mult,
dd_config.num_res_blocks,
dd_config.z_channels,
use_video_decoder));
if (!use_video_decoder) {
blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
embed_dim,
{1, 1}));
}
if (!decode_only) {
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new Encoder(dd_config.ch,
dd_config.ch_mult,
dd_config.num_res_blocks,
dd_config.in_channels,
dd_config.z_channels,
dd_config.double_z));
if (!use_video_decoder) {
int factor = dd_config.double_z ? 2 : 1;
blocks["quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels * factor,
embed_dim * factor,
{1, 1})); // 2*z_channels -> 2*embed_dim, matching ldm.models.autoencoder.AutoencoderKL
}
}
}
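// quant_conv projects the encoder's concatenated moments (mean and logvar, 2*z_channels)
// down to 2*embed_dim; post_quant_conv maps a sampled embed_dim latent back to z_channels
// before decoding. With the SD defaults (embed_dim == z_channels == 4) both 1x1
// convolutions are shape-preserving.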
struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, z_channels, h, w]
if (!use_video_decoder) {
auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w]
}
auto decoder = std::dynamic_pointer_cast<Decoder>(blocks["decoder"]);
ggml_set_name(z, "bench-start");
auto h = decoder->forward(ctx, z);
ggml_set_name(h, "bench-end");
return h;
}
struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);
auto h = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8]
if (!use_video_decoder) {
auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
h = quant_conv->forward(ctx, h); // [N, 2*embed_dim, h/8, w/8]
}
return h;
}
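// encode() returns raw moments; sampling the latent is left to the caller. A minimal sketch
// of that step (names illustrative, not part of this file): split the 2*z_channels output
// into mean and logvar along the channel axis, then per element take
//     z = mean + exp(0.5f * logvar) * eps,   with eps drawn from N(0, 1),
// typically followed by multiplication with the model's latent scale factor.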
};
struct AutoEncoderKL : public GGMLModule {
bool decode_only = true;
AutoencodingEngine ae;
AutoEncoderKL(ggml_backend_t backend,
ggml_type wtype,
bool decode_only = false,
bool use_video_decoder = false)
: GGMLModule(backend, wtype), decode_only(decode_only), ae(decode_only, use_video_decoder) {
ae.init(params_ctx, wtype);
}
std::string get_desc() {
return "vae";
}
size_t get_params_mem_size() {
return ae.get_params_mem_size();
}
size_t get_params_num() {
return ae.get_params_num();
}
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
ae.get_param_tensors(tensors, prefix);
}
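// note: despite its name, z is an input image rather than a latent when decode_graph == false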
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z);
struct ggml_tensor* out = decode_graph ? ae.decode(compute_ctx, z) : ae.encode(compute_ctx, z);
ggml_build_forward_expand(gf, out);
return gf;
}
void compute(const int n_threads,
struct ggml_tensor* z,
bool decode_graph,
struct ggml_tensor** output,
struct ggml_context* output_ctx = NULL) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(z, decode_graph);
};
// ggml_set_f32(z, 0.5f);
// print_ggml_tensor(z);
GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
}
void test() {
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* work_ctx = ggml_init(params);
GGML_ASSERT(work_ctx != NULL);
{
// CPU, x{1, 3, 64, 64}: Pass
// CUDA, x{1, 3, 64, 64}: Pass, but still gets wrong results for some images, possibly due to internal NaNs
// CPU, x{2, 3, 64, 64}: Wrong result
// CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result
auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2);
ggml_set_f32(x, 0.5f);
print_ggml_tensor(x);
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
compute(8, x, false, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("encode test done in %dms", t1 - t0);
}
if (false) {
// CPU, z{1, 4, 8, 8}: Pass
// CUDA, z{1, 4, 8, 8}: Pass
// CPU, z{3, 4, 8, 8}: Wrong result
// CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result
auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
ggml_set_f32(z, 0.5f);
print_ggml_tensor(z);
struct ggml_tensor* out = NULL;
int t0 = ggml_time_ms();
compute(8, z, true, &out, work_ctx);
int t1 = ggml_time_ms();
print_ggml_tensor(out);
LOG_DEBUG("decode test done in %dms", t1 - t0);
}
}
};
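/*
Typical usage (a minimal sketch; weight loading and work-context setup follow the
GGMLModule conventions assumed by this file and are not shown in full -- `latent`,
`work_ctx` and `n_threads` are placeholders, and the tensor-name prefix is illustrative):

    AutoEncoderKL vae(backend, GGML_TYPE_F16, false);
    std::map<std::string, struct ggml_tensor*> tensors;
    vae.get_param_tensors(tensors, "first_stage_model");
    // ... copy checkpoint weights into the tensors collected above ...
    struct ggml_tensor* out = NULL;
    vae.compute(n_threads, latent, true, &out, work_ctx); // decode: latent -> image
*/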
#endif

524621
otherarch/sdcpp/vocab.hpp Normal file

File diff suppressed because it is too large