mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
initial files from sdcpp (not working)
This commit is contained in:
parent
ad638285de
commit
26696970ce
33 changed files with 582497 additions and 4 deletions
8
Makefile
8
Makefile
|
@ -39,8 +39,8 @@ endif
|
||||||
#
|
#
|
||||||
|
|
||||||
# keep standard at C11 and C++11
|
# keep standard at C11 and C++11
|
||||||
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
||||||
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
|
FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
|
||||||
|
|
||||||
|
@ -496,12 +496,14 @@ gpttype_adapter_vulkan_noavx2.o: $(GPTTYPE_ADAPTER)
|
||||||
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
|
rm -vf *.o main sdmain quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
|
||||||
|
|
||||||
# useful tools
|
# useful tools
|
||||||
main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
@echo '==== Run ./main -h for help. ===='
|
@echo '==== Run ./main -h for help. ===='
|
||||||
|
sdmain: otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
imatrix: examples/imatrix/imatrix.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
imatrix: examples/imatrix/imatrix.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
|
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||||
|
|
2
ggml.h
2
ggml.h
|
@ -229,7 +229,7 @@
|
||||||
#define GGML_MAX_CONTEXTS 64
|
#define GGML_MAX_CONTEXTS 64
|
||||||
#define GGML_MAX_SRC 10
|
#define GGML_MAX_SRC 10
|
||||||
#ifndef GGML_MAX_NAME
|
#ifndef GGML_MAX_NAME
|
||||||
#define GGML_MAX_NAME 64
|
#define GGML_MAX_NAME 128
|
||||||
#endif
|
#endif
|
||||||
#define GGML_MAX_OP_PARAMS 64
|
#define GGML_MAX_OP_PARAMS 64
|
||||||
#define GGML_DEFAULT_N_THREADS 4
|
#define GGML_DEFAULT_N_THREADS 4
|
||||||
|
|
95
otherarch/sdcpp/CMakeLists.txt
Normal file
95
otherarch/sdcpp/CMakeLists.txt
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
cmake_minimum_required(VERSION 3.12)
|
||||||
|
project("stable-diffusion")
|
||||||
|
|
||||||
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
|
||||||
|
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||||
|
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
||||||
|
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
|
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
|
|
||||||
|
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||||
|
set(SD_STANDALONE ON)
|
||||||
|
else()
|
||||||
|
set(SD_STANDALONE OFF)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Option list
|
||||||
|
#
|
||||||
|
|
||||||
|
# general
|
||||||
|
option(SD_CUBLAS "sd: cuda backend" OFF)
|
||||||
|
option(SD_HIPBLAS "sd: rocm backend" OFF)
|
||||||
|
option(SD_METAL "sd: metal backend" OFF)
|
||||||
|
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
|
||||||
|
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
|
||||||
|
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
|
||||||
|
|
||||||
|
if(SD_CUBLAS)
|
||||||
|
message("Use CUBLAS as backend stable-diffusion")
|
||||||
|
set(GGML_CUBLAS ON)
|
||||||
|
add_definitions(-DSD_USE_CUBLAS)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(SD_METAL)
|
||||||
|
message("Use Metal as backend stable-diffusion")
|
||||||
|
set(GGML_METAL ON)
|
||||||
|
add_definitions(-DSD_USE_METAL)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (SD_HIPBLAS)
|
||||||
|
message("Use HIPBLAS as backend stable-diffusion")
|
||||||
|
set(GGML_HIPBLAS ON)
|
||||||
|
add_definitions(-DSD_USE_CUBLAS)
|
||||||
|
if(SD_FAST_SOFTMAX)
|
||||||
|
set(GGML_CUDA_FAST_SOFTMAX ON)
|
||||||
|
endif()
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if(SD_FLASH_ATTN)
|
||||||
|
message("Use Flash Attention for memory optimization")
|
||||||
|
add_definitions(-DSD_USE_FLASH_ATTENTION)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set(SD_LIB stable-diffusion)
|
||||||
|
|
||||||
|
file(GLOB SD_LIB_SOURCES
|
||||||
|
"*.h"
|
||||||
|
"*.cpp"
|
||||||
|
"*.hpp"
|
||||||
|
)
|
||||||
|
|
||||||
|
# we can get only one share lib
|
||||||
|
if(SD_BUILD_SHARED_LIBS)
|
||||||
|
message("Build shared library")
|
||||||
|
set(BUILD_SHARED_LIBS OFF)
|
||||||
|
message(${SD_LIB_SOURCES})
|
||||||
|
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
|
||||||
|
add_definitions(-DSD_BUILD_SHARED_LIB)
|
||||||
|
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
|
||||||
|
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||||
|
else()
|
||||||
|
message("Build static library")
|
||||||
|
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||||
|
|
||||||
|
|
||||||
|
# deps
|
||||||
|
add_subdirectory(ggml)
|
||||||
|
|
||||||
|
add_subdirectory(thirdparty)
|
||||||
|
|
||||||
|
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
|
||||||
|
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
|
||||||
|
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
|
||||||
|
|
||||||
|
|
||||||
|
add_subdirectory(examples)
|
||||||
|
|
21
otherarch/sdcpp/SDCPP_LICENSE
Normal file
21
otherarch/sdcpp/SDCPP_LICENSE
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2023 leejet
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
1139
otherarch/sdcpp/clip.hpp
Normal file
1139
otherarch/sdcpp/clip.hpp
Normal file
File diff suppressed because it is too large
Load diff
529
otherarch/sdcpp/common.hpp
Normal file
529
otherarch/sdcpp/common.hpp
Normal file
|
@ -0,0 +1,529 @@
|
||||||
|
#ifndef __COMMON_HPP__
|
||||||
|
#define __COMMON_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
class DownSampleBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int channels;
|
||||||
|
int out_channels;
|
||||||
|
bool vae_downsample;
|
||||||
|
|
||||||
|
public:
|
||||||
|
DownSampleBlock(int channels,
|
||||||
|
int out_channels,
|
||||||
|
bool vae_downsample = false)
|
||||||
|
: channels(channels),
|
||||||
|
out_channels(out_channels),
|
||||||
|
vae_downsample(vae_downsample) {
|
||||||
|
if (vae_downsample) {
|
||||||
|
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
|
||||||
|
} else {
|
||||||
|
blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [N, channels, h, w]
|
||||||
|
if (vae_downsample) {
|
||||||
|
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
||||||
|
|
||||||
|
x = ggml_pad(ctx, x, 1, 1, 0, 0);
|
||||||
|
x = conv->forward(ctx, x);
|
||||||
|
} else {
|
||||||
|
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
|
||||||
|
|
||||||
|
x = conv->forward(ctx, x);
|
||||||
|
}
|
||||||
|
return x; // [N, out_channels, h/2, w/2]
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class UpSampleBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int channels;
|
||||||
|
int out_channels;
|
||||||
|
|
||||||
|
public:
|
||||||
|
UpSampleBlock(int channels,
|
||||||
|
int out_channels)
|
||||||
|
: channels(channels),
|
||||||
|
out_channels(out_channels) {
|
||||||
|
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [N, channels, h, w]
|
||||||
|
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
||||||
|
|
||||||
|
x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2]
|
||||||
|
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class ResBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
// network hparams
|
||||||
|
int64_t channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
|
||||||
|
int64_t emb_channels; // time_embed_dim
|
||||||
|
int64_t out_channels; // mult * model_channels
|
||||||
|
std::pair<int, int> kernel_size;
|
||||||
|
int dims;
|
||||||
|
bool skip_t_emb;
|
||||||
|
bool exchange_temb_dims;
|
||||||
|
|
||||||
|
std::shared_ptr<GGMLBlock> conv_nd(int dims,
|
||||||
|
int64_t in_channels,
|
||||||
|
int64_t out_channels,
|
||||||
|
std::pair<int, int> kernel_size,
|
||||||
|
std::pair<int, int> padding) {
|
||||||
|
GGML_ASSERT(dims == 2 || dims == 3);
|
||||||
|
if (dims == 3) {
|
||||||
|
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
|
||||||
|
} else {
|
||||||
|
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
ResBlock(int64_t channels,
|
||||||
|
int64_t emb_channels,
|
||||||
|
int64_t out_channels,
|
||||||
|
std::pair<int, int> kernel_size = {3, 3},
|
||||||
|
int dims = 2,
|
||||||
|
bool exchange_temb_dims = false,
|
||||||
|
bool skip_t_emb = false)
|
||||||
|
: channels(channels),
|
||||||
|
emb_channels(emb_channels),
|
||||||
|
out_channels(out_channels),
|
||||||
|
kernel_size(kernel_size),
|
||||||
|
dims(dims),
|
||||||
|
skip_t_emb(skip_t_emb),
|
||||||
|
exchange_temb_dims(exchange_temb_dims) {
|
||||||
|
std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
|
||||||
|
blocks["in_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
|
||||||
|
// in_layer_1 is nn.SILU()
|
||||||
|
blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
|
||||||
|
|
||||||
|
if (!skip_t_emb) {
|
||||||
|
// emb_layer_0 is nn.SILU()
|
||||||
|
blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
|
||||||
|
// out_layer_1 is nn.SILU()
|
||||||
|
// out_layer_2 is nn.Dropout(), skip for inference
|
||||||
|
blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
|
||||||
|
|
||||||
|
if (out_channels != channels) {
|
||||||
|
blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
|
||||||
|
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
|
||||||
|
// [N, c, t, h, w] => [N, c, t, h * w]
|
||||||
|
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
|
||||||
|
// emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
|
||||||
|
auto in_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
|
||||||
|
auto in_layers_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
|
||||||
|
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
|
||||||
|
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
|
||||||
|
|
||||||
|
if (emb == NULL) {
|
||||||
|
GGML_ASSERT(skip_t_emb);
|
||||||
|
}
|
||||||
|
|
||||||
|
// in_layers
|
||||||
|
auto h = in_layers_0->forward(ctx, x);
|
||||||
|
h = ggml_silu_inplace(ctx, h);
|
||||||
|
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
|
|
||||||
|
// emb_layers
|
||||||
|
if (!skip_t_emb) {
|
||||||
|
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
|
||||||
|
|
||||||
|
auto emb_out = ggml_silu(ctx, emb);
|
||||||
|
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
|
||||||
|
|
||||||
|
if (dims == 2) {
|
||||||
|
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
|
||||||
|
} else {
|
||||||
|
emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
|
||||||
|
if (exchange_temb_dims) {
|
||||||
|
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
|
||||||
|
emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
|
}
|
||||||
|
|
||||||
|
// out_layers
|
||||||
|
h = out_layers_0->forward(ctx, h);
|
||||||
|
h = ggml_silu_inplace(ctx, h);
|
||||||
|
// dropout, skip for inference
|
||||||
|
h = out_layers_3->forward(ctx, h);
|
||||||
|
|
||||||
|
// skip connection
|
||||||
|
if (out_channels != channels) {
|
||||||
|
auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
|
||||||
|
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
|
}
|
||||||
|
|
||||||
|
h = ggml_add(ctx, h, x);
|
||||||
|
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class GEGLU : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int64_t dim_in;
|
||||||
|
int64_t dim_out;
|
||||||
|
|
||||||
|
void init_params(struct ggml_context* ctx, ggml_type wtype) {
|
||||||
|
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
|
||||||
|
params["proj.bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim_out * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
GEGLU(int64_t dim_in, int64_t dim_out)
|
||||||
|
: dim_in(dim_in), dim_out(dim_out) {}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [ne3, ne2, ne1, dim_in]
|
||||||
|
// return: [ne3, ne2, ne1, dim_out]
|
||||||
|
struct ggml_tensor* w = params["proj.weight"];
|
||||||
|
struct ggml_tensor* b = params["proj.bias"];
|
||||||
|
|
||||||
|
auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
|
||||||
|
auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in]
|
||||||
|
auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ]
|
||||||
|
auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
|
||||||
|
|
||||||
|
auto x_in = x;
|
||||||
|
x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
|
||||||
|
auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
|
||||||
|
|
||||||
|
gate = ggml_gelu_inplace(ctx, gate);
|
||||||
|
|
||||||
|
x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
|
||||||
|
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class FeedForward : public GGMLBlock {
|
||||||
|
public:
|
||||||
|
FeedForward(int64_t dim,
|
||||||
|
int64_t dim_out,
|
||||||
|
int64_t mult = 4) {
|
||||||
|
int64_t inner_dim = dim * mult;
|
||||||
|
|
||||||
|
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
|
||||||
|
// net_1 is nn.Dropout(), skip for inference
|
||||||
|
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [ne3, ne2, ne1, dim]
|
||||||
|
// return: [ne3, ne2, ne1, dim_out]
|
||||||
|
|
||||||
|
auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
|
||||||
|
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
|
||||||
|
|
||||||
|
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
|
||||||
|
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class CrossAttention : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int64_t query_dim;
|
||||||
|
int64_t context_dim;
|
||||||
|
int64_t n_head;
|
||||||
|
int64_t d_head;
|
||||||
|
|
||||||
|
public:
|
||||||
|
CrossAttention(int64_t query_dim,
|
||||||
|
int64_t context_dim,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t d_head)
|
||||||
|
: n_head(n_head),
|
||||||
|
d_head(d_head),
|
||||||
|
query_dim(query_dim),
|
||||||
|
context_dim(context_dim) {
|
||||||
|
int64_t inner_dim = d_head * n_head;
|
||||||
|
|
||||||
|
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
|
||||||
|
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||||
|
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||||
|
|
||||||
|
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
|
||||||
|
// to_out_1 is nn.Dropout(), skip for inference
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
|
||||||
|
// x: [N, n_token, query_dim]
|
||||||
|
// context: [N, n_context, context_dim]
|
||||||
|
// return: [N, n_token, query_dim]
|
||||||
|
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
|
||||||
|
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
|
||||||
|
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
|
||||||
|
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
|
||||||
|
|
||||||
|
int64_t n = x->ne[2];
|
||||||
|
int64_t n_token = x->ne[1];
|
||||||
|
int64_t n_context = context->ne[1];
|
||||||
|
int64_t inner_dim = d_head * n_head;
|
||||||
|
|
||||||
|
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
|
||||||
|
q = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, n); // [N, n_token, n_head, d_head]
|
||||||
|
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
|
||||||
|
q = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * n); // [N * n_head, n_token, d_head]
|
||||||
|
|
||||||
|
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
|
||||||
|
k = ggml_reshape_4d(ctx, k, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
|
||||||
|
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_context, d_head]
|
||||||
|
k = ggml_reshape_3d(ctx, k, d_head, n_context, n_head * n); // [N * n_head, n_context, d_head]
|
||||||
|
|
||||||
|
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
|
||||||
|
v = ggml_reshape_4d(ctx, v, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
|
||||||
|
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_context]
|
||||||
|
v = ggml_reshape_3d(ctx, v, n_context, d_head, n_head * n); // [N * n_head, d_head, n_context]
|
||||||
|
|
||||||
|
auto kqv = ggml_nn_attention(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
|
||||||
|
kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, n);
|
||||||
|
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]
|
||||||
|
|
||||||
|
x = ggml_reshape_3d(ctx, kqv, d_head * n_head, n_token, n); // [N, n_token, inner_dim]
|
||||||
|
|
||||||
|
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class BasicTransformerBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int64_t n_head;
|
||||||
|
int64_t d_head;
|
||||||
|
bool ff_in;
|
||||||
|
|
||||||
|
public:
|
||||||
|
BasicTransformerBlock(int64_t dim,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t d_head,
|
||||||
|
int64_t context_dim,
|
||||||
|
bool ff_in = false)
|
||||||
|
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
|
||||||
|
// disable_self_attn is always False
|
||||||
|
// disable_temporal_crossattention is always False
|
||||||
|
// switch_temporal_ca_to_sa is always False
|
||||||
|
// inner_dim is always None or equal to dim
|
||||||
|
// gated_ff is always True
|
||||||
|
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
|
||||||
|
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
|
||||||
|
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
|
||||||
|
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
|
||||||
|
if (ff_in) {
|
||||||
|
blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
blocks["ff_in"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
|
||||||
|
// x: [N, n_token, query_dim]
|
||||||
|
// context: [N, n_context, context_dim]
|
||||||
|
// return: [N, n_token, query_dim]
|
||||||
|
|
||||||
|
auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
|
||||||
|
auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
|
||||||
|
auto ff = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
|
||||||
|
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
|
||||||
|
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
|
||||||
|
auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
|
||||||
|
|
||||||
|
if (ff_in) {
|
||||||
|
auto norm_in = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
|
||||||
|
auto ff_in = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);
|
||||||
|
|
||||||
|
auto x_skip = x;
|
||||||
|
x = norm_in->forward(ctx, x);
|
||||||
|
x = ff_in->forward(ctx, x);
|
||||||
|
// self.is_res is always True
|
||||||
|
x = ggml_add(ctx, x, x_skip);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto r = x;
|
||||||
|
x = norm1->forward(ctx, x);
|
||||||
|
x = attn1->forward(ctx, x, x); // self-attention
|
||||||
|
x = ggml_add(ctx, x, r);
|
||||||
|
r = x;
|
||||||
|
x = norm2->forward(ctx, x);
|
||||||
|
x = attn2->forward(ctx, x, context); // cross-attention
|
||||||
|
x = ggml_add(ctx, x, r);
|
||||||
|
r = x;
|
||||||
|
x = norm3->forward(ctx, x);
|
||||||
|
x = ff->forward(ctx, x);
|
||||||
|
x = ggml_add(ctx, x, r);
|
||||||
|
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class SpatialTransformer : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int64_t in_channels; // mult * model_channels
|
||||||
|
int64_t n_head;
|
||||||
|
int64_t d_head;
|
||||||
|
int64_t depth = 1; // 1
|
||||||
|
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_2_x
|
||||||
|
|
||||||
|
public:
|
||||||
|
SpatialTransformer(int64_t in_channels,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t d_head,
|
||||||
|
int64_t depth,
|
||||||
|
int64_t context_dim)
|
||||||
|
: in_channels(in_channels),
|
||||||
|
n_head(n_head),
|
||||||
|
d_head(d_head),
|
||||||
|
depth(depth),
|
||||||
|
context_dim(context_dim) {
|
||||||
|
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
|
||||||
|
// disable_self_attn is always False
|
||||||
|
int64_t inner_dim = n_head * d_head; // in_channels
|
||||||
|
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
|
||||||
|
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
|
||||||
|
|
||||||
|
for (int i = 0; i < depth; i++) {
|
||||||
|
std::string name = "transformer_blocks." + std::to_string(i);
|
||||||
|
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
|
||||||
|
// x: [N, in_channels, h, w]
|
||||||
|
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
|
||||||
|
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
|
||||||
|
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
|
||||||
|
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
|
||||||
|
|
||||||
|
auto x_in = x;
|
||||||
|
int64_t n = x->ne[3];
|
||||||
|
int64_t h = x->ne[1];
|
||||||
|
int64_t w = x->ne[0];
|
||||||
|
int64_t inner_dim = n_head * d_head;
|
||||||
|
|
||||||
|
x = norm->forward(ctx, x);
|
||||||
|
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
|
||||||
|
|
||||||
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
|
||||||
|
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
|
||||||
|
|
||||||
|
for (int i = 0; i < depth; i++) {
|
||||||
|
std::string name = "transformer_blocks." + std::to_string(i);
|
||||||
|
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
|
||||||
|
|
||||||
|
x = transformer_block->forward(ctx, x, context);
|
||||||
|
}
|
||||||
|
|
||||||
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
|
||||||
|
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
|
||||||
|
|
||||||
|
// proj_out
|
||||||
|
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
|
||||||
|
|
||||||
|
x = ggml_add(ctx, x, x_in);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class AlphaBlender : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
void init_params(struct ggml_context* ctx, ggml_type wtype) {
|
||||||
|
params["mix_factor"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
float get_alpha() {
|
||||||
|
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
|
||||||
|
// so learned_with_images is same as learned
|
||||||
|
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
|
||||||
|
return sigmoid(alpha);
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
AlphaBlender() {
|
||||||
|
// merge_strategy is always learned_with_images
|
||||||
|
// for inference, we don't need to set alpha
|
||||||
|
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||||
|
struct ggml_tensor* x_spatial,
|
||||||
|
struct ggml_tensor* x_temporal) {
|
||||||
|
// image_only_indicator is always tensor([0.])
|
||||||
|
float alpha = get_alpha();
|
||||||
|
auto x = ggml_add(ctx,
|
||||||
|
ggml_scale(ctx, x_spatial, alpha),
|
||||||
|
ggml_scale(ctx, x_temporal, 1.0f - alpha));
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ResBlock variant for video models (SVD): runs the normal spatial ResBlock,
// then a temporal ResBlock over the frame axis, and mixes the two results
// with a learned AlphaBlender gate.
class VideoResBlock : public ResBlock {
public:
    // video_kernel_size is accepted for API parity but is not used in this
    // implementation (the temporal stack is built with fixed dims=3 below).
    VideoResBlock(int channels,
                  int emb_channels,
                  int out_channels,
                  std::pair<int, int> kernel_size = {3, 3},
                  int64_t video_kernel_size       = 3,
                  int dims                        = 2)  // always 2
        : ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
        // temporal ResBlock: dims=3, last arg presumably enables temporal/video
        // mode — TODO confirm against ResBlock's constructor signature.
        blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));
        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
    }

    // x:   [N, channels, h, w] where N = b * num_video_frames
    // emb: [N, emb_channels]
    // Returns [N, out_channels, h, w] (same batch layout as input).
    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* emb,
                                int num_video_frames) {
        // x: [N, channels, h, w] aka [b*t, channels, h, w]
        // emb: [N, emb_channels] aka [b*t, emb_channels]
        // image_only_indicator is always tensor([0.])
        auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
        auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);

        // Spatial pass first (standard ResBlock).
        x = ResBlock::forward(ctx, x, emb);

        // Split the fused batch dimension back into (batch, time).
        int64_t T = num_video_frames;
        int64_t B = x->ne[3] / T;
        int64_t C = x->ne[2];
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

        x          = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
        x          = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        auto x_mix = x;  // spatial result kept for the alpha blend below

        emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...

        // Temporal pass over the frame axis.
        x = time_stack->forward(ctx, x, emb);  // b t c (h w)

        // Learned blend of spatial (x_mix) and temporal (x) results.
        x = time_mixer->forward(ctx, x_mix, x);  // b t c (h w)

        // Restore the original (b t) c h w layout.
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w

        return x;
    }
};
|
||||||
|
|
||||||
|
#endif // __COMMON_HPP__
|
466
otherarch/sdcpp/control.hpp
Normal file
466
otherarch/sdcpp/control.hpp
Normal file
|
@ -0,0 +1,466 @@
|
||||||
|
#ifndef __CONTROL_HPP__
|
||||||
|
#define __CONTROL_HPP__
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#include "model.h"
|
||||||
|
|
||||||
|
#define CONTROL_NET_GRAPH_SIZE 1536
|
||||||
|
|
||||||
|
/*
|
||||||
|
=================================== ControlNet ===================================
|
||||||
|
Reference: https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/cldm/cldm.py
|
||||||
|
|
||||||
|
*/
|
||||||
|
// ControlNet UNet-encoder clone: mirrors the diffusion UNet's input/middle
// blocks, adds a hint-image embedding path, and taps every stage through a
// 1x1 "zero conv" to produce the control residuals that are later added into
// the main UNet. Block names must match the checkpoint tensor names exactly.
class ControlNetBlock : public GGMLBlock {
protected:
    SDVersion version = VERSION_1_x;
    // network hparams
    int in_channels                        = 4;
    int out_channels                       = 4;
    int hint_channels                      = 3;   // RGB control image
    int num_res_blocks                     = 2;
    std::vector<int> attention_resolutions = {4, 2, 1};
    std::vector<int> channel_mult          = {1, 2, 4, 4};
    std::vector<int> transformer_depth     = {1, 1, 1, 1};
    int time_embed_dim                     = 1280;  // model_channels*4
    int num_heads                          = 8;
    int num_head_channels                  = -1;   // channels // num_heads
    int context_dim                        = 768;  // 1024 for VERSION_2_x, 2048 for VERSION_XL

public:
    int model_channels  = 320;
    int adm_in_channels = 2816;  // only for VERSION_XL

    // Adjusts hyperparameters per SD version, then registers every sub-block
    // (keys mirror the Python ControlNet module paths).
    ControlNetBlock(SDVersion version = VERSION_1_x)
        : version(version) {
        if (version == VERSION_2_x) {
            context_dim       = 1024;
            num_head_channels = 64;
            num_heads         = -1;  // derived from num_head_channels instead
        } else if (version == VERSION_XL) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
            channel_mult          = {1, 2, 4};
            transformer_depth     = {1, 2, 10};
            num_head_channels     = 64;
            num_heads             = -1;
        } else if (version == VERSION_SVD) {
            in_channels       = 8;
            out_channels      = 4;
            context_dim       = 1024;
            adm_in_channels   = 768;
            num_head_channels = 64;
            num_heads         = -1;
        }

        // Timestep embedding MLP: Linear -> SiLU -> Linear.
        blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
        // time_embed_1 is nn.SiLU()
        blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

        // Extra conditioning (y) embedding for SDXL/SVD.
        if (version == VERSION_XL || version == VERSION_SVD) {
            blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
            // label_emb_1 is nn.SiLU()
            blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
        }

        // input_blocks
        blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));

        std::vector<int> input_block_chans;
        input_block_chans.push_back(model_channels);
        int ch              = model_channels;  // running channel count through the encoder
        int input_block_idx = 0;
        int ds              = 1;               // running downsample factor

        auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
            return new ResBlock(channels, emb_channels, out_channels);
        };

        auto get_attention_layer = [&](int64_t in_channels,
                                       int64_t n_head,
                                       int64_t d_head,
                                       int64_t depth,
                                       int64_t context_dim) -> SpatialTransformer* {
            return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
        };

        // "zero conv": 1x1 conv whose weights are zero-initialized in the
        // checkpoint; here it is just a plain 1x1 Conv2d that loads those weights.
        auto make_zero_conv = [&](int64_t channels) {
            return new Conv2d(channels, channels, {1, 1});
        };

        blocks["zero_convs.0.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(model_channels));

        // Hint encoder: alternating 3x3 convs and SiLU, downsampling the hint
        // image 8x (three stride-2 convs) up to model_channels.
        blocks["input_hint_block.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.6"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.8"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.10"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.12"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.14"] = std::shared_ptr<GGMLBlock>(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1}));

        // Encoder stages: num_res_blocks ResBlocks per channel multiplier,
        // optional attention at the configured resolutions, one zero conv per
        // stage, and a downsampler between stages.
        size_t len_mults = channel_mult.size();
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));

                ch = mult * model_channels;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    int n_head = num_heads;
                    int d_head = ch / num_heads;
                    if (num_head_channels != -1) {
                        d_head = num_head_channels;
                        n_head = ch / d_head;
                    }
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                      n_head,
                                                                                      d_head,
                                                                                      transformer_depth[i],
                                                                                      context_dim));
                }
                blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
                input_block_chans.push_back(ch);
            }
            if (i != len_mults - 1) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));

                blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));

                input_block_chans.push_back(ch);
                ds *= 2;
            }
        }

        // middle blocks
        int n_head = num_heads;
        int d_head = ch / num_heads;
        if (num_head_channels != -1) {
            d_head = num_head_channels;
            n_head = ch / d_head;
        }
        blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
        blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                  n_head,
                                                                                  d_head,
                                                                                  transformer_depth[transformer_depth.size() - 1],
                                                                                  context_dim));
        blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));

        // middle_block_out
        blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
    }

    // Convenience dispatch: runs the ResBlock registered under `name`.
    struct ggml_tensor* resblock_forward(std::string name,
                                         struct ggml_context* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb) {
        auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
        return block->forward(ctx, x, emb);
    }

    // Convenience dispatch: runs the SpatialTransformer registered under `name`.
    struct ggml_tensor* attention_layer_forward(std::string name,
                                                struct ggml_context* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context) {
        auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
        return block->forward(ctx, x, context);
    }

    // Encodes the hint image through input_hint_block.{0..14}: even indices
    // are Conv2d blocks, odd indices are SiLU activations.
    // NOTE(review): emb and context are accepted but never used in this body.
    struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
                                                 struct ggml_tensor* hint,
                                                 struct ggml_tensor* emb,
                                                 struct ggml_tensor* context) {
        int num_input_blocks = 15;
        auto h               = hint;
        for (int i = 0; i < num_input_blocks; i++) {
            if (i % 2 == 0) {
                auto block = std::dynamic_pointer_cast<Conv2d>(blocks["input_hint_block." + std::to_string(i)]);

                h = block->forward(ctx, h);
            } else {
                h = ggml_silu_inplace(ctx, h);
            }
        }
        return h;
    }

    // Builds the full control graph and returns the control residuals:
    // outs[0]   = guided hint (encoded control image),
    // outs[1..] = zero-conv taps of every input block plus the middle block.
    // Pass a non-NULL guided_hint to reuse a cached hint encoding (hint may
    // then be NULL).
    std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
                                             struct ggml_tensor* x,
                                             struct ggml_tensor* hint,
                                             struct ggml_tensor* guided_hint,
                                             struct ggml_tensor* timesteps,
                                             struct ggml_tensor* context,
                                             struct ggml_tensor* y = NULL) {
        // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
        // timesteps: [N,]
        // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
        // Broadcast context/y across the batch if they were given with batch 1.
        if (context != NULL) {
            if (context->ne[2] != x->ne[3]) {
                context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }

        if (y != NULL) {
            if (y->ne[1] != x->ne[3]) {
                y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }

        auto time_embed_0     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
        auto time_embed_2     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
        auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
        auto zero_convs_0     = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs.0.0"]);

        auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);

        auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]

        auto emb = time_embed_0->forward(ctx, t_emb);
        emb      = ggml_silu_inplace(ctx, emb);
        emb      = time_embed_2->forward(ctx, emb);  // [N, time_embed_dim]

        // SDXL/SVD
        if (y != NULL) {
            auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);

            auto label_emb = label_embed_0->forward(ctx, y);
            label_emb      = ggml_silu_inplace(ctx, label_emb);
            label_emb      = label_embed_2->forward(ctx, label_emb);  // [N, time_embed_dim]

            emb = ggml_add(ctx, emb, label_emb);  // [N, time_embed_dim]
        }

        std::vector<struct ggml_tensor*> outs;

        // Encode the hint image unless the caller supplied a cached encoding.
        if (guided_hint == NULL) {
            guided_hint = input_hint_block_forward(ctx, hint, emb, context);
        }
        outs.push_back(guided_hint);

        // input_blocks

        // input block 0: stem conv plus the hint injection.
        auto h = input_blocks_0_0->forward(ctx, x);
        h      = ggml_add(ctx, h, guided_hint);
        outs.push_back(zero_convs_0->forward(ctx, h));

        // input block 1-11
        size_t len_mults    = channel_mult.size();
        int input_block_idx = 0;
        int ds              = 1;
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                h                = resblock_forward(name, ctx, h, emb);  // [N, mult*model_channels, h, w]
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    h                = attention_layer_forward(name, ctx, h, context);  // [N, mult*model_channels, h, w]
                }

                auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);

                outs.push_back(zero_conv->forward(ctx, h));
            }
            if (i != len_mults - 1) {
                ds *= 2;
                input_block_idx += 1;

                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                auto block       = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);

                h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]

                auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);

                outs.push_back(zero_conv->forward(ctx, h));
            }
        }
        // [N, 4*model_channels, h/8, w/8]

        // middle_block
        h = resblock_forward("middle_block.0", ctx, h, emb);            // [N, 4*model_channels, h/8, w/8]
        h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
        h = resblock_forward("middle_block.2", ctx, h, emb);            // [N, 4*model_channels, h/8, w/8]

        // out
        outs.push_back(middle_block_out->forward(ctx, h));
        return outs;
    }
};
|
||||||
|
|
||||||
|
// Runtime wrapper around ControlNetBlock: owns the parameter/compute contexts
// (via GGMLModule), keeps the control outputs resident in backend memory, and
// caches the encoded hint image across diffusion steps.
struct ControlNet : public GGMLModule {
    SDVersion version = VERSION_1_x;
    ControlNetBlock control_net;

    ggml_backend_buffer_t control_buffer = NULL;  // keep control output tensors in backend memory
    ggml_context* control_ctx            = NULL;
    std::vector<struct ggml_tensor*> controls;    // (12 input block outputs, 1 middle block output) SD 1.5
    struct ggml_tensor* guided_hint = NULL;       // guided_hint cache, for faster inference
    bool guided_hint_cached         = false;

    ControlNet(ggml_backend_t backend,
               ggml_type wtype,
               SDVersion version = VERSION_1_x)
        : GGMLModule(backend, wtype), control_net(version) {
        control_net.init(params_ctx, wtype);
    }

    ~ControlNet() {
        free_control_ctx();
    }

    // Allocates persistent backend tensors mirroring the graph outputs:
    // outs[0] becomes the guided_hint cache, outs[1..] become `controls`.
    // Called lazily from build_graph() on the first forward pass.
    void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
        params.mem_buffer = NULL;
        params.no_alloc   = true;  // metadata only; data lives in control_buffer
        control_ctx       = ggml_init(params);

        controls.resize(outs.size() - 1);

        size_t control_buffer_size = 0;

        guided_hint = ggml_dup_tensor(control_ctx, outs[0]);
        control_buffer_size += ggml_nbytes(guided_hint);

        for (int i = 0; i < outs.size() - 1; i++) {
            controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
            control_buffer_size += ggml_nbytes(controls[i]);
        }

        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);

        LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
    }

    // Releases the control output buffer/context and resets the hint cache.
    // Safe to call repeatedly (idempotent).
    void free_control_ctx() {
        if (control_buffer != NULL) {
            ggml_backend_buffer_free(control_buffer);
            control_buffer = NULL;
        }
        if (control_ctx != NULL) {
            ggml_free(control_ctx);
            control_ctx = NULL;
        }
        guided_hint        = NULL;
        guided_hint_cached = false;
        controls.clear();
    }

    std::string get_desc() {
        return "control_net";
    }

    size_t get_params_mem_size() {
        return control_net.get_params_mem_size();
    }

    size_t get_params_num() {
        return control_net.get_params_num();
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        control_net.get_param_tensors(tensors, prefix);
    }

    // Builds the compute graph for one step. If the hint encoding is already
    // cached, the hint input is dropped and the cached guided_hint tensor is
    // fed into the block instead. Graph outputs are copied into the
    // persistent guided_hint/controls tensors.
    struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                    struct ggml_tensor* hint,
                                    struct ggml_tensor* timesteps,
                                    struct ggml_tensor* context,
                                    struct ggml_tensor* y = NULL) {
        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);

        x = to_backend(x);
        if (guided_hint_cached) {
            hint = NULL;  // skip re-encoding; the cached tensor is used below
        } else {
            hint = to_backend(hint);
        }
        context   = to_backend(context);
        y         = to_backend(y);
        timesteps = to_backend(timesteps);

        auto outs = control_net.forward(compute_ctx,
                                        x,
                                        hint,
                                        guided_hint_cached ? guided_hint : NULL,
                                        timesteps,
                                        context,
                                        y);

        // First pass: size the persistent output tensors from this graph.
        if (control_ctx == NULL) {
            alloc_control_ctx(outs);
        }

        ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint));
        for (int i = 0; i < outs.size() - 1; i++) {
            ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i]));
        }

        return gf;
    }

    // Runs one ControlNet evaluation. After the first call the hint encoding
    // is marked cached, so later calls with the same hint skip re-encoding —
    // call free_control_ctx() before switching to a different hint image.
    void compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* hint,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* y,
                 struct ggml_tensor** output     = NULL,
                 struct ggml_context* output_ctx = NULL) {
        // x: [N, in_channels, h, w]
        // timesteps: [N, ]
        // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x, hint, timesteps, context, y);
        };

        GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
        guided_hint_cached = true;
    }

    // Loads all ControlNet tensors from a model file into backend memory.
    // Returns false (after logging) if the file cannot be opened or tensors
    // fail to load.
    bool load_from_file(const std::string& file_path) {
        LOG_INFO("loading control net from '%s'", file_path.c_str());
        alloc_params_buffer();
        std::map<std::string, ggml_tensor*> tensors;
        control_net.get_param_tensors(tensors);
        std::set<std::string> ignore_tensors;

        ModelLoader model_loader;
        if (!model_loader.init_from_file(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);

        if (!success) {
            LOG_ERROR("load control net tensors from model loader failed");
            return false;
        }

        LOG_INFO("control net model loaded");
        return success;
    }
};
|
||||||
|
|
||||||
|
#endif // __CONTROL_HPP__
|
125
otherarch/sdcpp/denoiser.hpp
Normal file
125
otherarch/sdcpp/denoiser.hpp
Normal file
|
@ -0,0 +1,125 @@
|
||||||
|
#ifndef __DENOISER_HPP__
|
||||||
|
#define __DENOISER_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
/*================================================= CompVisDenoiser ==================================================*/
|
||||||
|
|
||||||
|
// Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
|
||||||
|
|
||||||
|
#define TIMESTEPS 1000
|
||||||
|
|
||||||
|
// Base class for noise schedules. Holds per-timestep tables (filled in
// elsewhere — not in this file's visible code) and converts between
// continuous timestep t and noise level sigma by linear interpolation in
// log-sigma space.
struct SigmaSchedule {
    float alphas_cumprod[TIMESTEPS];
    float sigmas[TIMESTEPS];
    float log_sigmas[TIMESTEPS];

    // Returns n+1 sigma values for a sampling run (implementation-defined
    // spacing; the last entry is conventionally 0 in the subclasses below).
    virtual std::vector<float> get_sigmas(uint32_t n) = 0;

    // Maps a sigma to a fractional timestep index by locating the bracketing
    // pair in log_sigmas and interpolating. Assumes log_sigmas is sorted so
    // that (log_sigma - log_sigmas[i]) changes sign exactly once — TODO
    // confirm against the code that fills the table.
    float sigma_to_t(float sigma) {
        float log_sigma = std::log(sigma);
        std::vector<float> dists;
        dists.reserve(TIMESTEPS);
        for (float log_sigma_val : log_sigmas) {
            dists.push_back(log_sigma - log_sigma_val);
        }

        // Count non-negative distances to find the lower bracket index.
        int low_idx = 0;
        for (size_t i = 0; i < TIMESTEPS; i++) {
            if (dists[i] >= 0) {
                low_idx++;
            }
        }
        // Clamp so both low_idx and low_idx+1 are valid table indices.
        low_idx      = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
        int high_idx = low_idx + 1;

        float low  = log_sigmas[low_idx];
        float high = log_sigmas[high_idx];
        float w    = (low - log_sigma) / (low - high);
        w          = std::max(0.f, std::min(1.f, w));  // keep t within the bracket
        float t    = (1.0f - w) * low_idx + w * high_idx;

        return t;
    }

    // Inverse of sigma_to_t: linear interpolation of log_sigmas at the
    // fractional index t, then exponentiate. t must lie in [0, TIMESTEPS-1].
    float t_to_sigma(float t) {
        int low_idx     = static_cast<int>(std::floor(t));
        int high_idx    = static_cast<int>(std::ceil(t));
        float w         = t - static_cast<float>(low_idx);
        float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
        return std::exp(log_sigma);
    }
};
|
||||||
|
|
||||||
|
struct DiscreteSchedule : SigmaSchedule {
|
||||||
|
std::vector<float> get_sigmas(uint32_t n) {
|
||||||
|
std::vector<float> result;
|
||||||
|
|
||||||
|
int t_max = TIMESTEPS - 1;
|
||||||
|
|
||||||
|
if (n == 0) {
|
||||||
|
return result;
|
||||||
|
} else if (n == 1) {
|
||||||
|
result.push_back(t_to_sigma((float)t_max));
|
||||||
|
result.push_back(0);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
float step = static_cast<float>(t_max) / static_cast<float>(n - 1);
|
||||||
|
for (uint32_t i = 0; i < n; ++i) {
|
||||||
|
float t = t_max - step * i;
|
||||||
|
result.push_back(t_to_sigma(t));
|
||||||
|
}
|
||||||
|
result.push_back(0);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Karras et al. (2022) sigma schedule, Eq. (5): sigmas follow a rho-power
// interpolation between sigma_max and sigma_min, with a trailing 0 appended.
struct KarrasSchedule : SigmaSchedule {
    // Returns n+1 sigmas: n Karras-spaced values plus a final 0.
    std::vector<float> get_sigmas(uint32_t n) {
        // These *COULD* be function arguments here,
        // but does anybody ever bother to touch them?
        float sigma_min = 0.1f;
        float sigma_max = 10.f;
        float rho       = 7.f;

        std::vector<float> result(n + 1);

        float min_inv_rho = pow(sigma_min, (1.f / rho));
        float max_inv_rho = pow(sigma_max, (1.f / rho));
        for (uint32_t i = 0; i < n; i++) {
            // Eq. (5) from Karras et al 2022.
            // For n == 1 the original expression i / (n - 1) was 0/0 -> NaN;
            // a single step should start at sigma_max (ramp = 0).
            float ramp = (n > 1) ? (float)i / ((float)n - 1.f) : 0.f;
            result[i]  = pow(max_inv_rho + ramp * (min_inv_rho - max_inv_rho), rho);
        }
        result[n] = 0.;
        return result;
    }
};
|
||||||
|
|
||||||
|
// Base class for k-diffusion denoiser wrappers: owns a sigma schedule
// (DiscreteSchedule by default) and exposes the per-sigma scaling factors
// applied around the model call.
struct Denoiser {
    std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
    // Returns the scaling coefficients for a given sigma; the number and
    // meaning of entries is subclass-defined ({c_out, c_in} or
    // {c_skip, c_out, c_in} in the implementations below).
    virtual std::vector<float> get_scalings(float sigma) = 0;
};
|
||||||
|
|
||||||
|
// Eps-prediction denoiser scalings (k-diffusion CompVisDenoiser):
// returns {c_out, c_in} with c_out = -sigma and c_in = 1/sqrt(sigma^2 + sigma_data^2).
struct CompVisDenoiser : public Denoiser {
    float sigma_data = 1.0f;

    std::vector<float> get_scalings(float sigma) {
        const float total_var = sigma * sigma + sigma_data * sigma_data;
        return {-sigma, 1.0f / std::sqrt(total_var)};
    }
};
|
||||||
|
|
||||||
|
// V-prediction denoiser scalings (k-diffusion CompVisVDenoiser):
// returns {c_skip, c_out, c_in} derived from sigma and sigma_data.
struct CompVisVDenoiser : public Denoiser {
    float sigma_data = 1.0f;

    std::vector<float> get_scalings(float sigma) {
        const float sd2       = sigma_data * sigma_data;
        const float total_var = sigma * sigma + sd2;
        const float c_skip    = sd2 / total_var;
        const float c_out     = -sigma * sigma_data / std::sqrt(total_var);
        const float c_in      = 1.0f / std::sqrt(total_var);
        return {c_skip, c_out, c_in};
    }
};
|
||||||
|
|
||||||
|
#endif // __DENOISER_HPP__
|
206
otherarch/sdcpp/esrgan.hpp
Normal file
206
otherarch/sdcpp/esrgan.hpp
Normal file
|
@ -0,0 +1,206 @@
|
||||||
|
#ifndef __ESRGAN_HPP__
|
||||||
|
#define __ESRGAN_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#include "model.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
=================================== ESRGAN ===================================
|
||||||
|
References:
|
||||||
|
https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
|
||||||
|
https://github.com/XPixelGroup/BasicSR/blob/v1.4.2/basicsr/archs/rrdbnet_arch.py
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ESRGAN Residual Dense Block: five 3x3 convs where each conv sees the
// concatenation of the input and all previous conv outputs (dense
// connectivity), finished by a 0.2-scaled residual connection.
class ResidualDenseBlock : public GGMLBlock {
protected:
    int num_feat;     // channels in/out of the block
    int num_grow_ch;  // growth channels added by each dense conv

public:
    ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32)
        : num_feat(num_feat), num_grow_ch(num_grow_ch) {
        // convK input channels = num_feat + (K-1) * num_grow_ch (dense concat);
        // conv5 projects back to num_feat.
        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv3"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv4"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
    }

    // LeakyReLU with negative slope 0.2 (in-place), as used throughout ESRGAN.
    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
        return ggml_leaky_relu(ctx, x, 0.2f, true);
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [n, num_feat, h, w]
        // return: [n, num_feat, h, w]

        auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
        auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
        auto conv3 = std::dynamic_pointer_cast<Conv2d>(blocks["conv3"]);
        auto conv4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv4"]);
        auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);

        // Dense chain: each step concatenates its output onto the running
        // feature stack before feeding the next conv.
        auto x1    = lrelu(ctx, conv1->forward(ctx, x));
        auto x_cat = ggml_concat(ctx, x, x1);
        auto x2    = lrelu(ctx, conv2->forward(ctx, x_cat));
        x_cat      = ggml_concat(ctx, x_cat, x2);
        auto x3    = lrelu(ctx, conv3->forward(ctx, x_cat));
        x_cat      = ggml_concat(ctx, x_cat, x3);
        auto x4    = lrelu(ctx, conv4->forward(ctx, x_cat));
        x_cat      = ggml_concat(ctx, x_cat, x4);
        auto x5    = conv5->forward(ctx, x_cat);

        // Residual scaling by 0.2 (ESRGAN convention).
        x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
        return x5;
    }
};
|
||||||
|
|
||||||
|
class RRDB : public GGMLBlock {
|
||||||
|
public:
|
||||||
|
RRDB(int num_feat, int num_grow_ch = 32) {
|
||||||
|
blocks["rdb1"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
|
||||||
|
blocks["rdb2"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
|
||||||
|
blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [n, num_feat, h, w]
|
||||||
|
// return: [n, num_feat, h, w]
|
||||||
|
|
||||||
|
auto rdb1 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb1"]);
|
||||||
|
auto rdb2 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb2"]);
|
||||||
|
auto rdb3 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb3"]);
|
||||||
|
|
||||||
|
auto out = rdb1->forward(ctx, x);
|
||||||
|
out = rdb2->forward(ctx, out);
|
||||||
|
out = rdb3->forward(ctx, out);
|
||||||
|
|
||||||
|
out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ESRGAN generator (RRDBNet), configured with the RealESRGAN_x4plus_anime_6B
// defaults: conv_first -> num_block RRDBs -> conv_body (+ long residual) ->
// two 2x nearest-upscale + conv stages -> conv_hr -> conv_last, for 4x
// super-resolution.
class RRDBNet : public GGMLBlock {
protected:
    int scale       = 4;   // default RealESRGAN_x4plus_anime_6B
    int num_block   = 6;   // default RealESRGAN_x4plus_anime_6B
    int num_in_ch   = 3;
    int num_out_ch  = 3;
    int num_feat    = 64;  // default RealESRGAN_x4plus_anime_6B
    int num_grow_ch = 32;  // default RealESRGAN_x4plus_anime_6B

public:
    RRDBNet() {
        blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
        // Trunk: num_block RRDBs named body.0 .. body.(num_block-1) to match
        // the checkpoint tensor names.
        for (int i = 0; i < num_block; i++) {
            std::string name = "body." + std::to_string(i);
            blocks[name]     = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
        }
        blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        // upsample
        blocks["conv_up1"]  = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_up2"]  = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_hr"]   = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
    }

    // LeakyReLU with negative slope 0.2 (in-place), as used throughout ESRGAN.
    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
        return ggml_leaky_relu(ctx, x, 0.2f, true);
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [n, num_in_ch, h, w]
        // return: [n, num_out_ch, h*4, w*4]
        auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
        auto conv_body  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
        auto conv_up1   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
        auto conv_up2   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
        auto conv_hr    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
        auto conv_last  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);

        auto feat      = conv_first->forward(ctx, x);
        auto body_feat = feat;
        for (int i = 0; i < num_block; i++) {
            std::string name = "body." + std::to_string(i);
            auto block       = std::dynamic_pointer_cast<RRDB>(blocks[name]);

            body_feat = block->forward(ctx, body_feat);
        }
        body_feat = conv_body->forward(ctx, body_feat);
        // Long skip connection around the whole RRDB trunk.
        feat = ggml_add(ctx, feat, body_feat);
        // upsample: two 2x stages -> 4x total
        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
        auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
        return out;
    }
};
|
||||||
|
|
||||||
|
// ESRGAN: GGMLModule wrapper around RRDBNet for post-generation upscaling.
// Owns the network parameters, loads them from a checkpoint file, and runs
// the forward graph on a chosen backend.
struct ESRGAN : public GGMLModule {
    RRDBNet rrdb_net;
    int scale     = 4;    // upscale factor of the wrapped RRDBNet
    int tile_size = 128;  // avoid cuda OOM for 4gb VRAM

    ESRGAN(ggml_backend_t backend,
           ggml_type wtype)
        : GGMLModule(backend, wtype) {
        // allocate the network's parameter tensors in this module's param context
        rrdb_net.init(params_ctx, wtype);
    }

    std::string get_desc() {
        return "esrgan";
    }

    size_t get_params_mem_size() {
        return rrdb_net.get_params_mem_size();
    }

    size_t get_params_num() {
        return rrdb_net.get_params_num();
    }

    // Loads the RRDBNet weights from `file_path` into the backend parameter
    // buffer. Returns false if the file cannot be parsed or tensors fail to load.
    bool load_from_file(const std::string& file_path) {
        LOG_INFO("loading esrgan from '%s'", file_path.c_str());

        alloc_params_buffer();
        // map of checkpoint tensor name -> destination tensor
        std::map<std::string, ggml_tensor*> esrgan_tensors;
        rrdb_net.get_param_tensors(esrgan_tensors);

        ModelLoader model_loader;
        if (!model_loader.init_from_file(file_path)) {
            LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

        bool success = model_loader.load_tensors(esrgan_tensors, backend);

        if (!success) {
            LOG_ERROR("load esrgan tensors from model loader failed");
            return false;
        }

        LOG_INFO("esrgan model loaded");
        return success;
    }

    // Builds the forward compute graph for a single input tensor x
    // (x is first moved to the module's backend).
    struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
        x = to_backend(x);
        struct ggml_tensor* out = rrdb_net.forward(compute_ctx, x);
        ggml_build_forward_expand(gf, out);
        return gf;
    }

    // Runs the upscaler on x; the result tensor is delivered through *output
    // (allocated from output_ctx when provided) by GGMLModule::compute.
    void compute(const int n_threads,
                 struct ggml_tensor* x,
                 ggml_tensor** output,
                 ggml_context* output_ctx = NULL) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
        GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
    }
};
|
||||||
|
|
||||||
|
#endif // __ESRGAN_HPP__
|
1255
otherarch/sdcpp/ggml_extend.hpp
Normal file
1255
otherarch/sdcpp/ggml_extend.hpp
Normal file
File diff suppressed because it is too large
Load diff
164
otherarch/sdcpp/lora.hpp
Normal file
164
otherarch/sdcpp/lora.hpp
Normal file
|
@ -0,0 +1,164 @@
|
||||||
|
#ifndef __LORA_HPP__
|
||||||
|
#define __LORA_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
#define LORA_GRAPH_SIZE 10240
|
||||||
|
|
||||||
|
struct LoraModel : public GGMLModule {
|
||||||
|
float multiplier = 1.0f;
|
||||||
|
std::map<std::string, struct ggml_tensor*> lora_tensors;
|
||||||
|
std::string file_path;
|
||||||
|
ModelLoader model_loader;
|
||||||
|
bool load_failed = false;
|
||||||
|
|
||||||
|
LoraModel(ggml_backend_t backend,
|
||||||
|
ggml_type wtype,
|
||||||
|
const std::string file_path = "")
|
||||||
|
: file_path(file_path), GGMLModule(backend, wtype) {
|
||||||
|
if (!model_loader.init_from_file(file_path)) {
|
||||||
|
load_failed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() {
|
||||||
|
return "lora";
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_num() {
|
||||||
|
return LORA_GRAPH_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_mem_size() {
|
||||||
|
return model_loader.get_params_mem_size(NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool load_from_file() {
|
||||||
|
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
|
||||||
|
|
||||||
|
if (load_failed) {
|
||||||
|
LOG_ERROR("init lora model loader from file failed: '%s'", file_path.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool dry_run = true;
|
||||||
|
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
|
||||||
|
const std::string& name = tensor_storage.name;
|
||||||
|
|
||||||
|
if (dry_run) {
|
||||||
|
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
|
||||||
|
tensor_storage.type,
|
||||||
|
tensor_storage.n_dims,
|
||||||
|
tensor_storage.ne);
|
||||||
|
lora_tensors[name] = real;
|
||||||
|
} else {
|
||||||
|
auto real = lora_tensors[name];
|
||||||
|
*dst_tensor = real;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
model_loader.load_tensors(on_new_tensor_cb, backend);
|
||||||
|
alloc_params_buffer();
|
||||||
|
|
||||||
|
dry_run = false;
|
||||||
|
model_loader.load_tensors(on_new_tensor_cb, backend);
|
||||||
|
|
||||||
|
LOG_DEBUG("finished loaded lora");
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph* build_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
|
||||||
|
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);
|
||||||
|
|
||||||
|
std::set<std::string> applied_lora_tensors;
|
||||||
|
for (auto it : model_tensors) {
|
||||||
|
std::string k_tensor = it.first;
|
||||||
|
struct ggml_tensor* weight = model_tensors[it.first];
|
||||||
|
|
||||||
|
size_t k_pos = k_tensor.find(".weight");
|
||||||
|
if (k_pos == std::string::npos) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
k_tensor = k_tensor.substr(0, k_pos);
|
||||||
|
replace_all_chars(k_tensor, '.', '_');
|
||||||
|
std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
|
||||||
|
std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
|
||||||
|
std::string alpha_name = "lora." + k_tensor + ".alpha";
|
||||||
|
std::string scale_name = "lora." + k_tensor + ".scale";
|
||||||
|
|
||||||
|
ggml_tensor* lora_up = NULL;
|
||||||
|
ggml_tensor* lora_down = NULL;
|
||||||
|
|
||||||
|
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
|
||||||
|
lora_up = lora_tensors[lora_up_name];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
|
||||||
|
lora_down = lora_tensors[lora_down_name];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lora_up == NULL || lora_down == NULL) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
applied_lora_tensors.insert(lora_up_name);
|
||||||
|
applied_lora_tensors.insert(lora_down_name);
|
||||||
|
applied_lora_tensors.insert(alpha_name);
|
||||||
|
applied_lora_tensors.insert(scale_name);
|
||||||
|
|
||||||
|
// calc_cale
|
||||||
|
int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1];
|
||||||
|
float scale_value = 1.0f;
|
||||||
|
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
|
||||||
|
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
|
||||||
|
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
|
||||||
|
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
|
||||||
|
scale_value = alpha / dim;
|
||||||
|
}
|
||||||
|
scale_value *= multiplier;
|
||||||
|
|
||||||
|
// flat lora tensors to multiply it
|
||||||
|
int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1];
|
||||||
|
lora_up = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
|
||||||
|
int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1];
|
||||||
|
lora_down = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
|
||||||
|
|
||||||
|
// ggml_mul_mat requires tensor b transposed
|
||||||
|
lora_down = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down));
|
||||||
|
struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down);
|
||||||
|
updown = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown));
|
||||||
|
updown = ggml_reshape(compute_ctx, updown, weight);
|
||||||
|
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
|
||||||
|
updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
|
||||||
|
ggml_tensor* final_weight;
|
||||||
|
// if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
|
||||||
|
// final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, weight->n_dims, weight->ne);
|
||||||
|
// final_weight = ggml_cpy_inplace(compute_ctx, weight, final_weight);
|
||||||
|
// final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
|
||||||
|
// final_weight = ggml_cpy_inplace(compute_ctx, final_weight, weight);
|
||||||
|
// } else {
|
||||||
|
// final_weight = ggml_add_inplace(compute_ctx, weight, updown);
|
||||||
|
// }
|
||||||
|
final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
|
||||||
|
ggml_build_forward_expand(gf, final_weight);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto& kv : lora_tensors) {
|
||||||
|
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
|
||||||
|
LOG_WARN("unused lora tensor %s", kv.first.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
|
||||||
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
|
return build_graph(model_tensors);
|
||||||
|
};
|
||||||
|
GGMLModule::compute(get_graph, n_threads, true);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __LORA_HPP__
|
778
otherarch/sdcpp/main.cpp
Normal file
778
otherarch/sdcpp/main.cpp
Normal file
|
@ -0,0 +1,778 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <random>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// #include "preprocessing.hpp"
|
||||||
|
#include "stable-diffusion.h"
|
||||||
|
|
||||||
|
#define STB_IMAGE_IMPLEMENTATION
|
||||||
|
#include "stb_image.h"
|
||||||
|
|
||||||
|
#define STB_IMAGE_WRITE_IMPLEMENTATION
|
||||||
|
#define STB_IMAGE_WRITE_STATIC
|
||||||
|
#include "stb_image_write.h"
|
||||||
|
|
||||||
|
// Human-readable names for rng_type_t, same order as the enum in stable-diffusion.h
const char* rng_type_to_str[] = {
    "std_default",
    "cuda",
};

// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
    "euler_a",
    "euler",
    "heun",
    "dpm2",
    "dpm++2s_a",
    "dpm++2m",
    "dpm++2mv2",
    "lcm",
};

// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
const char* schedule_str[] = {
    "default",
    "discrete",
    "karras",
};

// CLI run-mode names; must stay in the same order as enum SDMode below,
// since parse_args maps a matched index straight to the enum.
const char* modes_str[] = {
    "txt2img",
    "img2img",
    "img2vid",
    "convert",
};

// Run mode selected by -M/--mode; values index modes_str.
enum SDMode {
    TXT2IMG,
    IMG2IMG,
    IMG2VID,
    CONVERT,
    MODE_COUNT  // number of real modes; loop bound when matching --mode
};
|
||||||
|
|
||||||
|
// All command-line-configurable settings for the CLI, with their defaults.
// Populated by parse_args(); dumped by print_params() under --verbose.
struct SDParams {
    int n_threads = -1;      // <= 0 means auto-detect the number of physical cores (see parse_args)
    SDMode mode   = TXT2IMG;

    // model / asset paths
    std::string model_path;
    std::string vae_path;
    std::string taesd_path;
    std::string esrgan_path;
    std::string controlnet_path;
    std::string embeddings_path;
    sd_type_t wtype = SD_TYPE_COUNT;  // SD_TYPE_COUNT = unspecified: use the weight file's own type
    std::string lora_model_dir;
    std::string output_path = "output.png";
    std::string input_path;           // init image for img2img / img2vid
    std::string control_image_path;

    // generation settings
    std::string prompt;
    std::string negative_prompt;
    float min_cfg   = 1.0f;
    float cfg_scale = 7.0f;
    int clip_skip   = -1;  // <= 0 represents unspecified
    int width       = 512;
    int height      = 512;
    int batch_count = 1;

    // img2vid (SVD) settings
    int video_frames         = 6;
    int motion_bucket_id     = 127;
    int fps                  = 6;
    float augmentation_level = 0.f;

    sample_method_t sample_method = EULER_A;
    schedule_t schedule           = DEFAULT;
    int sample_steps              = 20;
    float strength                = 0.75f;  // img2img noising strength, in [0, 1]
    float control_strength        = 0.9f;
    rng_type_t rng_type           = CUDA_RNG;
    int64_t seed                  = 42;     // < 0 on the CLI means "randomize" (see parse_args)
    bool verbose                  = false;
    bool vae_tiling               = false;
    bool control_net_cpu          = false;
    bool canny_preprocess         = false;
    int upscale_repeats           = 1;      // times to run the ESRGAN upscaler (must be >= 1)
};
|
||||||
|
|
||||||
|
// Dumps every parsed CLI parameter to stdout (used when --verbose is set).
void print_params(SDParams params) {
    printf("Option: \n");
    printf(" n_threads: %d\n", params.n_threads);
    printf(" mode: %s\n", modes_str[params.mode]);
    printf(" model_path: %s\n", params.model_path.c_str());
    printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
    printf(" vae_path: %s\n", params.vae_path.c_str());
    printf(" taesd_path: %s\n", params.taesd_path.c_str());
    printf(" esrgan_path: %s\n", params.esrgan_path.c_str());
    printf(" controlnet_path: %s\n", params.controlnet_path.c_str());
    printf(" embeddings_path: %s\n", params.embeddings_path.c_str());
    printf(" output_path: %s\n", params.output_path.c_str());
    printf(" init_img: %s\n", params.input_path.c_str());
    printf(" control_image: %s\n", params.control_image_path.c_str());
    printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
    printf(" strength(control): %.2f\n", params.control_strength);
    printf(" prompt: %s\n", params.prompt.c_str());
    printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
    printf(" min_cfg: %.2f\n", params.min_cfg);
    printf(" cfg_scale: %.2f\n", params.cfg_scale);
    printf(" clip_skip: %d\n", params.clip_skip);
    printf(" width: %d\n", params.width);
    printf(" height: %d\n", params.height);
    printf(" sample_method: %s\n", sample_method_str[params.sample_method]);
    printf(" schedule: %s\n", schedule_str[params.schedule]);
    printf(" sample_steps: %d\n", params.sample_steps);
    printf(" strength(img2img): %.2f\n", params.strength);
    printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
    // seed is int64_t: "%ld" only matches it on LP64 platforms and misreads
    // the argument on Windows / 32-bit; cast to long long and use %lld.
    printf(" seed: %lld\n", (long long)params.seed);
    printf(" batch_count: %d\n", params.batch_count);
    printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
    printf(" upscale_repeats: %d\n", params.upscale_repeats);
}
|
||||||
|
|
||||||
|
// Prints the CLI help text (all supported arguments) to stdout.
void print_usage(int argc, const char* argv[]) {
    printf("usage: %s [arguments]\n", argv[0]);
    // The remainder of the help text is static, so emit it as one literal.
    static const char help_text[] =
        "\n"
        "arguments:\n"
        " -h, --help show this help message and exit\n"
        " -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n"
        " -t, --threads N number of threads to use during computation (default: -1).\n"
        " If threads <= 0, then threads will be set to the number of CPU physical cores\n"
        " -m, --model [MODEL] path to model\n"
        " --vae [VAE] path to vae\n"
        " --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"
        " --control-net [CONTROL_PATH] path to control net model\n"
        " --embd-dir [EMBEDDING_PATH] path to embeddings.\n"
        " --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n"
        " --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"
        " --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n"
        " If not specified, the default is the type of the weight file.\n"
        " --lora-model-dir [DIR] lora model directory\n"
        " -i, --init-img [IMAGE] path to the input image, required by img2img\n"
        " --control-image [IMAGE] path to image condition, control net\n"
        " -o, --output OUTPUT path to write result image to (default: ./output.png)\n"
        " -p, --prompt [PROMPT] the prompt to render\n"
        " -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"
        " --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"
        " --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"
        " --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"
        " 1.0 corresponds to full destruction of information in init image\n"
        " -H, --height H image height, in pixel space (default: 512)\n"
        " -W, --width W image width, in pixel space (default: 512)\n"
        " --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n"
        " sampling method (default: \"euler_a\")\n"
        " --steps STEPS number of sample steps (default: 20)\n"
        " --rng {std_default, cuda} RNG (default: cuda)\n"
        " -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"
        " -b, --batch-count COUNT number of images to generate.\n"
        " --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n"
        " --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"
        " <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"
        " --vae-tiling process vae in tiles to reduce memory usage\n"
        " --control-net-cpu keep controlnet in cpu (for low vram)\n"
        " --canny apply canny preprocessor (edge detection)\n"
        " -v, --verbose print extra info\n";
    fputs(help_text, stdout);
}
|
||||||
|
|
||||||
|
// Parses the CLI arguments into `params`, then validates the combination.
// Exits the process (with usage text) on any unknown option, missing option
// value, or invalid combination; on success all fields of `params` are set.
void parse_args(int argc, const char** argv, SDParams& params) {
    bool invalid_arg = false;  // set when an option is missing its value
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

        if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.n_threads = std::stoi(argv[i]);
        } else if (arg == "-M" || arg == "--mode") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            // match the mode name against modes_str; index == enum value
            const char* mode_selected = argv[i];
            int mode_found            = -1;
            for (int d = 0; d < MODE_COUNT; d++) {
                if (!strcmp(mode_selected, modes_str[d])) {
                    mode_found = d;
                }
            }
            if (mode_found == -1) {
                fprintf(stderr,
                        "error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n",
                        mode_selected);
                exit(1);
            }
            params.mode = (SDMode)mode_found;
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.model_path = argv[i];
        } else if (arg == "--vae") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.vae_path = argv[i];
        } else if (arg == "--taesd") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.taesd_path = argv[i];
        } else if (arg == "--control-net") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.controlnet_path = argv[i];
        } else if (arg == "--upscale-model") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.esrgan_path = argv[i];
        } else if (arg == "--embd-dir") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.embeddings_path = argv[i];
        } else if (arg == "--type") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            // map the weight-type name onto sd_type_t
            std::string type = argv[i];
            if (type == "f32") {
                params.wtype = SD_TYPE_F32;
            } else if (type == "f16") {
                params.wtype = SD_TYPE_F16;
            } else if (type == "q4_0") {
                params.wtype = SD_TYPE_Q4_0;
            } else if (type == "q4_1") {
                params.wtype = SD_TYPE_Q4_1;
            } else if (type == "q5_0") {
                params.wtype = SD_TYPE_Q5_0;
            } else if (type == "q5_1") {
                params.wtype = SD_TYPE_Q5_1;
            } else if (type == "q8_0") {
                params.wtype = SD_TYPE_Q8_0;
            } else {
                fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
                        type.c_str());
                exit(1);
            }
        } else if (arg == "--lora-model-dir") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.lora_model_dir = argv[i];
        } else if (arg == "-i" || arg == "--init-img") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.input_path = argv[i];
        } else if (arg == "--control-image") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.control_image_path = argv[i];
        } else if (arg == "-o" || arg == "--output") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.output_path = argv[i];
        } else if (arg == "-p" || arg == "--prompt") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.prompt = argv[i];
        } else if (arg == "--upscale-repeats") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.upscale_repeats = std::stoi(argv[i]);
            if (params.upscale_repeats < 1) {
                fprintf(stderr, "error: upscale multiplier must be at least 1\n");
                exit(1);
            }
        } else if (arg == "-n" || arg == "--negative-prompt") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.negative_prompt = argv[i];
        } else if (arg == "--cfg-scale") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.cfg_scale = std::stof(argv[i]);
        } else if (arg == "--strength") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.strength = std::stof(argv[i]);
        } else if (arg == "--control-strength") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.control_strength = std::stof(argv[i]);
        } else if (arg == "-H" || arg == "--height") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.height = std::stoi(argv[i]);
        } else if (arg == "-W" || arg == "--width") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.width = std::stoi(argv[i]);
        } else if (arg == "--steps") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.sample_steps = std::stoi(argv[i]);
        } else if (arg == "--clip-skip") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.clip_skip = std::stoi(argv[i]);
        } else if (arg == "--vae-tiling") {
            params.vae_tiling = true;
        } else if (arg == "--control-net-cpu") {
            params.control_net_cpu = true;
        } else if (arg == "--canny") {
            params.canny_preprocess = true;
        } else if (arg == "-b" || arg == "--batch-count") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.batch_count = std::stoi(argv[i]);
        } else if (arg == "--rng") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            std::string rng_type_str = argv[i];
            if (rng_type_str == "std_default") {
                params.rng_type = STD_DEFAULT_RNG;
            } else if (rng_type_str == "cuda") {
                params.rng_type = CUDA_RNG;
            } else {
                invalid_arg = true;
                break;
            }
        } else if (arg == "--schedule") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            // match against schedule_str; index == enum value
            const char* schedule_selected = argv[i];
            int schedule_found            = -1;
            for (int d = 0; d < N_SCHEDULES; d++) {
                if (!strcmp(schedule_selected, schedule_str[d])) {
                    schedule_found = d;
                }
            }
            if (schedule_found == -1) {
                invalid_arg = true;
                break;
            }
            params.schedule = (schedule_t)schedule_found;
        } else if (arg == "-s" || arg == "--seed") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.seed = std::stoll(argv[i]);
        } else if (arg == "--sampling-method") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            // match against sample_method_str; index == enum value
            const char* sample_method_selected = argv[i];
            int sample_method_found            = -1;
            for (int m = 0; m < N_SAMPLE_METHODS; m++) {
                if (!strcmp(sample_method_selected, sample_method_str[m])) {
                    sample_method_found = m;
                }
            }
            if (sample_method_found == -1) {
                invalid_arg = true;
                break;
            }
            params.sample_method = (sample_method_t)sample_method_found;
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv);
            exit(0);
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argc, argv);
            exit(1);
        }
    }
    if (invalid_arg) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv);
        exit(1);
    }
    // post-parse defaults and validation
    if (params.n_threads <= 0) {
        params.n_threads = get_num_physical_cores();
    }

    // a prompt is required except when converting weights or running img2vid
    if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) {
        fprintf(stderr, "error: the following arguments are required: prompt\n");
        print_usage(argc, argv);
        exit(1);
    }

    if (params.model_path.length() == 0) {
        fprintf(stderr, "error: the following arguments are required: model_path\n");
        print_usage(argc, argv);
        exit(1);
    }

    if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
        fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
        print_usage(argc, argv);
        exit(1);
    }

    if (params.output_path.length() == 0) {
        fprintf(stderr, "error: the following arguments are required: output_path\n");
        print_usage(argc, argv);
        exit(1);
    }

    if (params.width <= 0 || params.width % 64 != 0) {
        fprintf(stderr, "error: the width must be a multiple of 64\n");
        exit(1);
    }

    if (params.height <= 0 || params.height % 64 != 0) {
        fprintf(stderr, "error: the height must be a multiple of 64\n");
        exit(1);
    }

    if (params.sample_steps <= 0) {
        fprintf(stderr, "error: the sample_steps must be greater than 0\n");
        exit(1);
    }

    if (params.strength < 0.f || params.strength > 1.f) {
        fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
        exit(1);
    }

    // seed < 0 requests a random seed
    if (params.seed < 0) {
        srand((int)time(NULL));
        params.seed = rand();
    }

    // converting weights: default output name is .gguf rather than .png
    if (params.mode == CONVERT) {
        if (params.output_path == "output.png") {
            params.output_path = "output.gguf";
        }
    }
}
|
||||||
|
|
||||||
|
// Returns the file-name component of `path`: everything after the last path
// separator ('/' or '\\'), or the whole string if no separator is present.
// The original looked for '/' first and only fell back to '\\', so a mixed
// path like "a/b\\c" wrongly returned "b\\c"; scanning both separators at
// once with find_last_of("/\\") fixes that and is otherwise equivalent.
static std::string sd_basename(const std::string& path) {
    size_t pos = path.find_last_of("/\\");
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    return path;
}
|
||||||
|
|
||||||
|
// Builds the A1111-style parameter string embedded in output images:
// prompt, optional negative prompt, then the generation settings.
std::string get_image_params(SDParams params, int64_t seed) {
    std::string result = params.prompt + "\n";
    if (!params.negative_prompt.empty()) {
        result += "Negative prompt: " + params.negative_prompt + "\n";
    }
    result.append("Steps: " + std::to_string(params.sample_steps) + ", ");
    result.append("CFG scale: " + std::to_string(params.cfg_scale) + ", ");
    result.append("Seed: " + std::to_string(seed) + ", ");
    result.append("Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ");
    result.append("Model: " + sd_basename(params.model_path) + ", ");
    result.append("RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ");
    result.append("Sampler: " + std::string(sample_method_str[params.sample_method]));
    if (params.schedule == KARRAS) {
        result += " karras";
    }
    result += ", ";
    result += "Version: stable-diffusion.cpp";
    return result;
}
|
||||||
|
|
||||||
|
void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
|
||||||
|
SDParams* params = (SDParams*)data;
|
||||||
|
if (!params->verbose && level <= SD_LOG_DEBUG) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (level <= SD_LOG_INFO) {
|
||||||
|
fputs(log, stdout);
|
||||||
|
fflush(stdout);
|
||||||
|
} else {
|
||||||
|
fputs(log, stderr);
|
||||||
|
fflush(stderr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLI entry point: parse arguments, then run one of the modes
// (CONVERT / TXT2IMG / IMG2IMG / IMG2VID), optionally upscale, and save PNGs.
int main(int argc, const char* argv[]) {
    SDParams params;
    parse_args(argc, argv, params);

    // Route library logging through sd_log_cb; params supplies the verbosity flag.
    sd_set_log_callback(sd_log_cb, (void*)&params);

    if (params.verbose) {
        print_params(params);
        printf("%s", sd_get_system_info());
    }

    // CONVERT mode: rewrite the checkpoint as gguf and exit — no generation.
    if (params.mode == CONVERT) {
        bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype);
        if (!success) {
            fprintf(stderr,
                    "convert '%s'/'%s' to '%s' failed\n",
                    params.model_path.c_str(),
                    params.vae_path.c_str(),
                    params.output_path.c_str());
            return 1;
        } else {
            printf("convert '%s'/'%s' to '%s' success\n",
                   params.model_path.c_str(),
                   params.vae_path.c_str(),
                   params.output_path.c_str());
            return 0;
        }
    }

    // img2vid is refused outright here, so the IMG2VID branches below are
    // currently unreachable dead code kept for when SVD is fixed.
    if (params.mode == IMG2VID) {
        fprintf(stderr, "SVD support is broken, do not use it!!!\n");
        return 1;
    }

    bool vae_decode_only        = true;
    uint8_t* input_image_buffer = NULL;
    if (params.mode == IMG2IMG || params.mode == IMG2VID) {
        vae_decode_only = false;  // img2img also needs the VAE encoder

        // Load the init image; stbi overwrites params.width/height with the
        // actual image dimensions (channels forced to 3).
        int c              = 0;
        input_image_buffer = stbi_load(params.input_path.c_str(), &params.width, &params.height, &c, 3);
        if (input_image_buffer == NULL) {
            fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str());
            return 1;
        }
        if (c != 3) {
            fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c);
            free(input_image_buffer);
            return 1;
        }
        // Latent-space downscaling requires dimensions divisible by 64.
        if (params.width <= 0 || params.width % 64 != 0) {
            fprintf(stderr, "error: the width of image must be a multiple of 64\n");
            free(input_image_buffer);
            return 1;
        }
        if (params.height <= 0 || params.height % 64 != 0) {
            fprintf(stderr, "error: the height of image must be a multiple of 64\n");
            free(input_image_buffer);
            return 1;
        }
    }

    // Load the full pipeline (model + optional VAE/TAESD/ControlNet/LoRA/embeddings).
    sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
                                  params.vae_path.c_str(),
                                  params.taesd_path.c_str(),
                                  params.controlnet_path.c_str(),
                                  params.lora_model_dir.c_str(),
                                  params.embeddings_path.c_str(),
                                  vae_decode_only,
                                  params.vae_tiling,
                                  true,
                                  params.n_threads,
                                  params.wtype,
                                  params.rng_type,
                                  params.schedule,
                                  params.control_net_cpu);

    if (sd_ctx == NULL) {
        printf("new_sd_ctx_t failed\n");
        return 1;
    }

    sd_image_t* results;
    if (params.mode == TXT2IMG) {
        sd_image_t* control_image = NULL;
        if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) {
            // NOTE(review): this also overwrites params.width/height with the
            // control image size, and neither control_image nor its buffer is
            // freed before exit — confirm intentional (process exits shortly).
            int c              = 0;
            input_image_buffer = stbi_load(params.control_image_path.c_str(), &params.width, &params.height, &c, 3);
            if (input_image_buffer == NULL) {
                fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str());
                return 1;
            }
            control_image = new sd_image_t{(uint32_t)params.width,
                                           (uint32_t)params.height,
                                           3,
                                           input_image_buffer};
            if (params.canny_preprocess) {  // apply preprocessor
                // preprocess_canny frees the old buffer and returns a new one.
                control_image->data = preprocess_canny(control_image->data,
                                                       control_image->width,
                                                       control_image->height,
                                                       0.08f,
                                                       0.08f,
                                                       0.8f,
                                                       1.0f,
                                                       false);
            }
        }
        results = txt2img(sd_ctx,
                          params.prompt.c_str(),
                          params.negative_prompt.c_str(),
                          params.clip_skip,
                          params.cfg_scale,
                          params.width,
                          params.height,
                          params.sample_method,
                          params.sample_steps,
                          params.seed,
                          params.batch_count,
                          control_image,
                          params.control_strength);
    } else {
        // IMG2IMG / IMG2VID: wrap the already-loaded init image buffer.
        sd_image_t input_image = {(uint32_t)params.width,
                                  (uint32_t)params.height,
                                  3,
                                  input_image_buffer};

        if (params.mode == IMG2VID) {
            results = img2vid(sd_ctx,
                              input_image,
                              params.width,
                              params.height,
                              params.video_frames,
                              params.motion_bucket_id,
                              params.fps,
                              params.augmentation_level,
                              params.min_cfg,
                              params.cfg_scale,
                              params.sample_method,
                              params.sample_steps,
                              params.strength,
                              params.seed);
            if (results == NULL) {
                printf("generate failed\n");
                free_sd_ctx(sd_ctx);
                return 1;
            }
            // Save each frame as <output-stem>.png, <output-stem>_2.png, ...
            size_t last            = params.output_path.find_last_of(".");
            std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
            for (int i = 0; i < params.video_frames; i++) {
                if (results[i].data == NULL) {
                    continue;
                }
                std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
                stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
                               results[i].data, 0, get_image_params(params, params.seed + i).c_str());
                printf("save result image to '%s'\n", final_image_path.c_str());
                free(results[i].data);
                results[i].data = NULL;
            }
            free(results);
            free_sd_ctx(sd_ctx);
            return 0;  // video path returns early; common save loop below is for images
        } else {
            results = img2img(sd_ctx,
                              input_image,
                              params.prompt.c_str(),
                              params.negative_prompt.c_str(),
                              params.clip_skip,
                              params.cfg_scale,
                              params.width,
                              params.height,
                              params.sample_method,
                              params.sample_steps,
                              params.strength,
                              params.seed,
                              params.batch_count);
        }
    }

    if (results == NULL) {
        printf("generate failed\n");
        free_sd_ctx(sd_ctx);
        return 1;
    }

    // Optional ESRGAN upscaling pass, applied `upscale_repeats` times per image.
    int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
    if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
        upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
                                                        params.n_threads,
                                                        params.wtype);

        if (upscaler_ctx == NULL) {
            printf("new_upscaler_ctx failed\n");
        } else {
            for (int i = 0; i < params.batch_count; i++) {
                if (results[i].data == NULL) {
                    continue;
                }
                sd_image_t current_image = results[i];
                for (int u = 0; u < params.upscale_repeats; ++u) {
                    sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor);
                    if (upscaled_image.data == NULL) {
                        printf("upscale failed\n");
                        break;
                    }
                    // The previous stage's buffer is no longer needed.
                    free(current_image.data);
                    current_image = upscaled_image;
                }
                results[i] = current_image;  // Set the final upscaled image as the result
            }
        }
    }

    // Save batch results as <output-stem>.png, <output-stem>_2.png, ... with
    // generation metadata embedded.
    size_t last            = params.output_path.find_last_of(".");
    std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
    for (int i = 0; i < params.batch_count; i++) {
        if (results[i].data == NULL) {
            continue;
        }
        std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
        stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
                       results[i].data, 0, get_image_params(params, params.seed + i).c_str());
        printf("save result image to '%s'\n", final_image_path.c_str());
        free(results[i].data);
        results[i].data = NULL;
    }
    free(results);
    free_sd_ctx(sd_ctx);

    return 0;
}
|
1609
otherarch/sdcpp/model.cpp
Normal file
1609
otherarch/sdcpp/model.cpp
Normal file
File diff suppressed because it is too large
Load diff
154
otherarch/sdcpp/model.h
Normal file
154
otherarch/sdcpp/model.h
Normal file
|
@ -0,0 +1,154 @@
|
||||||
|
#ifndef __MODEL_H__
|
||||||
|
#define __MODEL_H__
|
||||||
|
|
||||||
|
#include <functional>
|
||||||
|
#include <map>
|
||||||
|
#include <memory>
|
||||||
|
#include <set>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "json.hpp"
|
||||||
|
#include "zip.h"
|
||||||
|
|
||||||
|
#define SD_MAX_DIMS 5
|
||||||
|
|
||||||
|
// Stable Diffusion model family of a loaded checkpoint, as reported by
// ModelLoader::get_sd_version().
enum SDVersion {
    VERSION_1_x,    // SD 1.x
    VERSION_2_x,    // SD 2.x
    VERSION_XL,     // SDXL
    VERSION_SVD,    // Stable Video Diffusion
    VERSION_COUNT,  // number of entries above; not a real version
};
|
||||||
|
|
||||||
|
// Metadata describing where a single tensor lives on disk (name, dtype, shape
// and byte offset) before its data is actually read. ModelLoader collects one
// of these per tensor while scanning a checkpoint.
struct TensorStorage {
    std::string name;
    ggml_type type = GGML_TYPE_F32;  // storage dtype on disk
    bool is_bf16 = false;            // bf16 is stored as such but counted as f32 (see nbytes_to_read)
    int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};  // dimensions; unused dims stay 1
    int n_dims = 0;

    size_t file_index = 0;   // index into the loader's file list
    int index_in_zip = -1;   // >= 0 means stored in a zip file
    size_t offset = 0;       // offset in file

    TensorStorage() = default;

    // Copies the first n_dims entries of `ne`; remaining dims keep their
    // default of 1.
    TensorStorage(const std::string& name, ggml_type type, int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
        : name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
        for (int i = 0; i < n_dims; i++) {
            this->ne[i] = ne[i];
        }
    }

    // Total element count (product over all SD_MAX_DIMS; trailing 1s are harmless).
    int64_t nelements() const {
        int64_t n = 1;
        for (int i = 0; i < SD_MAX_DIMS; i++) {
            n *= ne[i];
        }
        return n;
    }

    // Size in bytes of the tensor at `type`, accounting for block quantization.
    int64_t nbytes() const {
        return nelements() * ggml_type_size(type) / ggml_blck_size(type);
    }

    // Bytes actually occupied on disk: bf16 data is half the size of the f32
    // it will be expanded into.
    int64_t nbytes_to_read() const {
        if (is_bf16) {
            return nbytes() / 2;
        } else {
            return nbytes();
        }
    }

    // Reshape a 2D tensor (a, b) into 4D (1, 1, a, b) in place; no-op for
    // other ranks.
    void unsqueeze() {
        if (n_dims == 2) {
            n_dims = 4;
            ne[3] = ne[1];
            ne[2] = ne[0];
            ne[1] = 1;
            ne[0] = 1;
        }
    }

    // Split this storage into `n` equal chunks along the outermost logical
    // dimension (reverse_ne is applied so ne[0] is that dimension, then
    // restored). NOTE(review): assumes the dimension and byte size divide
    // evenly by n — confirm at call sites.
    std::vector<TensorStorage> chunk(size_t n) {
        std::vector<TensorStorage> chunks;
        size_t chunk_size = nbytes_to_read() / n;
        // printf("%d/%d\n", chunk_size, nbytes_to_read());
        reverse_ne();
        for (int i = 0; i < n; i++) {
            TensorStorage chunk_i = *this;
            chunk_i.ne[0] = ne[0] / n;
            chunk_i.offset = offset + i * chunk_size;
            chunk_i.reverse_ne();
            chunks.push_back(chunk_i);
        }
        reverse_ne();
        return chunks;
    }

    // Reverse the order of the first n_dims dimensions in place (converts
    // between row-major file order and ggml's dimension order).
    void reverse_ne() {
        int64_t new_ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
        for (int i = 0; i < n_dims; i++) {
            new_ne[i] = ne[n_dims - 1 - i];
        }
        for (int i = 0; i < n_dims; i++) {
            ne[i] = new_ne[i];
        }
    }

    // Human-readable summary: "name | type | n_dims [d0, d1, d2, d3, d4]".
    std::string to_string() const {
        std::stringstream ss;
        const char* type_name = ggml_type_name(type);
        if (is_bf16) {
            type_name = "bf16";  // ggml has no bf16 type name here; report it explicitly
        }
        ss << name << " | " << type_name << " | ";
        ss << n_dims << " [";
        for (int i = 0; i < SD_MAX_DIMS; i++) {
            ss << ne[i];
            if (i != SD_MAX_DIMS - 1) {
                ss << ", ";
            }
        }
        ss << "]";
        return ss.str();
    }
};
|
||||||
|
|
||||||
|
// Callback invoked for each tensor discovered during loading; the callee
// supplies the destination ggml tensor through the out-parameter.
// NOTE(review): presumably returning false aborts the load — confirm in model.cpp.
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;

// Reads model checkpoints in several on-disk formats (gguf, safetensors,
// pytorch .ckpt/pickle, diffusers) and exposes a uniform tensor-loading API.
// Implementation lives in model.cpp.
class ModelLoader {
protected:
    std::vector<std::string> file_paths_;       // path(s) of the opened checkpoint file(s)
    std::vector<TensorStorage> tensor_storages; // metadata for every tensor found

    // Parse a pickled stream (pytorch checkpoint) read from a zip archive.
    bool parse_data_pkl(uint8_t* buffer,
                        size_t buffer_size,
                        zip_t* zip,
                        std::string dir,
                        size_t file_index,
                        const std::string& prefix);

    // Per-format initializers; `prefix` is prepended to tensor names.
    bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
    bool init_from_safetensors_file(const std::string& file_path, const std::string& prefix = "");
    bool init_from_ckpt_file(const std::string& file_path, const std::string& prefix = "");
    bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");

public:
    // Entry point: scan `file_path` and populate tensor_storages (format
    // dispatch happens in model.cpp).
    bool init_from_file(const std::string& file_path, const std::string& prefix = "");
    SDVersion get_sd_version();
    ggml_type get_sd_wtype();
    std::string load_merges();
    // Stream every tensor's data through the callback / into the given map.
    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
    bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                      ggml_backend_t backend,
                      std::set<std::string> ignore_tensors = {});
    // Re-serialize the model as gguf, converting weights to `type`.
    bool save_to_gguf_file(const std::string& file_path, ggml_type type);
    // Estimated memory needed for the parameters at `type` (GGML_TYPE_COUNT = keep stored types).
    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
    ~ModelLoader() = default;
};
|
||||||
|
#endif // __MODEL_H__
|
227
otherarch/sdcpp/preprocessing.hpp
Normal file
227
otherarch/sdcpp/preprocessing.hpp
Normal file
|
@ -0,0 +1,227 @@
|
||||||
|
#ifndef __PREPROCESSING_HPP__
|
||||||
|
#define __PREPROCESSING_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#define M_PI_ 3.14159265358979323846
|
||||||
|
|
||||||
|
// Convolve `input` with `kernel` (stride 1, given symmetric padding) and copy
// the result into `output`. The f32 kernel is converted to f16 first —
// presumably because ggml_conv_2d expects an f16 kernel. The graph is built
// and executed synchronously in a throwaway ggml context.
void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
    struct ggml_init_params params;
    params.mem_size = 20 * 1024 * 1024;  // 20 MiB scratch for tensors + graph
    params.mem_buffer = NULL;
    params.no_alloc = false;
    struct ggml_context* ctx0 = ggml_init(params);
    struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
    ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel));
    ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1);
    ggml_cgraph* gf = ggml_new_graph(ctx0);
    // Copy into caller-owned `output` so the result survives ggml_free(ctx0).
    ggml_build_forward_expand(gf, ggml_cpy(ctx0, h, output));
    ggml_graph_compute_with_ctx(ctx0, gf, 1);  // final arg: 1 thread
    ggml_free(ctx0);
}
|
||||||
|
|
||||||
|
// Fill a square kernel tensor with a 2D Gaussian (sigma = 1.4) centred on the
// middle cell, used for smoothing before edge detection.
// NOTE(review): gx is derived from the y index and gy from the x index — the
// Gaussian is radially symmetric so the output is unaffected, but the naming
// is swapped.
void gaussian_kernel(struct ggml_tensor* kernel) {
    int ks_mid = kernel->ne[0] / 2;  // centre index; assumes odd kernel size
    float sigma = 1.4f;
    float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f));  // normalization factor
    for (int y = 0; y < kernel->ne[0]; y++) {
        float gx = -ks_mid + y;
        for (int x = 0; x < kernel->ne[1]; x++) {
            float gy = -ks_mid + x;
            float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
            ggml_tensor_set_f32(kernel, k_, x, y);
        }
    }
}
|
||||||
|
|
||||||
|
void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
|
||||||
|
for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
|
||||||
|
for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
|
||||||
|
float r = ggml_tensor_get_f32(rgb_img, ix, iy);
|
||||||
|
float g = ggml_tensor_get_f32(rgb_img, ix, iy, 1);
|
||||||
|
float b = ggml_tensor_get_f32(rgb_img, ix, iy, 2);
|
||||||
|
float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
|
||||||
|
ggml_tensor_set_f32(grayscale, gray, ix, iy);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
|
||||||
|
int n_elements = ggml_nelements(h);
|
||||||
|
float* dx = (float*)x->data;
|
||||||
|
float* dy = (float*)y->data;
|
||||||
|
float* dh = (float*)h->data;
|
||||||
|
for (int i = 0; i < n_elements; i++) {
|
||||||
|
dh[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
|
||||||
|
int n_elements = ggml_nelements(h);
|
||||||
|
float* dx = (float*)x->data;
|
||||||
|
float* dy = (float*)y->data;
|
||||||
|
float* dh = (float*)h->data;
|
||||||
|
for (int i = 0; i < n_elements; i++) {
|
||||||
|
dh[i] = atan2f(dy[i], dx[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void normalize_tensor(struct ggml_tensor* g) {
|
||||||
|
int n_elements = ggml_nelements(g);
|
||||||
|
float* dg = (float*)g->data;
|
||||||
|
float max = -INFINITY;
|
||||||
|
for (int i = 0; i < n_elements; i++) {
|
||||||
|
max = dg[i] > max ? dg[i] : max;
|
||||||
|
}
|
||||||
|
max = 1.0f / max;
|
||||||
|
for (int i = 0; i < n_elements; i++) {
|
||||||
|
dg[i] *= max;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) {
|
||||||
|
for (int iy = 1; iy < result->ne[1] - 1; iy++) {
|
||||||
|
for (int ix = 1; ix < result->ne[0] - 1; ix++) {
|
||||||
|
float angle = ggml_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
|
||||||
|
angle = angle < 0.0f ? angle += 180.0f : angle;
|
||||||
|
float q = 1.0f;
|
||||||
|
float r = 1.0f;
|
||||||
|
|
||||||
|
// angle 0
|
||||||
|
if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180)) {
|
||||||
|
q = ggml_tensor_get_f32(G, ix, iy + 1);
|
||||||
|
r = ggml_tensor_get_f32(G, ix, iy - 1);
|
||||||
|
}
|
||||||
|
// angle 45
|
||||||
|
else if (22.5f >= angle && angle < 67.5f) {
|
||||||
|
q = ggml_tensor_get_f32(G, ix + 1, iy - 1);
|
||||||
|
r = ggml_tensor_get_f32(G, ix - 1, iy + 1);
|
||||||
|
}
|
||||||
|
// angle 90
|
||||||
|
else if (67.5f >= angle && angle < 112.5) {
|
||||||
|
q = ggml_tensor_get_f32(G, ix + 1, iy);
|
||||||
|
r = ggml_tensor_get_f32(G, ix - 1, iy);
|
||||||
|
}
|
||||||
|
// angle 135
|
||||||
|
else if (112.5 >= angle && angle < 157.5f) {
|
||||||
|
q = ggml_tensor_get_f32(G, ix - 1, iy - 1);
|
||||||
|
r = ggml_tensor_get_f32(G, ix + 1, iy + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
float cur = ggml_tensor_get_f32(G, ix, iy);
|
||||||
|
if ((cur >= q) && (cur >= r)) {
|
||||||
|
ggml_tensor_set_f32(result, cur, ix, iy);
|
||||||
|
} else {
|
||||||
|
ggml_tensor_set_f32(result, 0.0f, ix, iy);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Canny double-threshold + hysteresis, in place on `img`:
//  1. classify pixels as strong (>= max*high_threshold) or weak
//     (>= high*low_threshold), leaving the rest unchanged;
//  2. zero a 3-pixel border;
//  3. promote weak pixels adjacent to a strong pixel, drop the rest.
void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) {
    int n_elements = ggml_nelements(img);
    float* imd = (float*)img->data;
    float max = -INFINITY;
    for (int i = 0; i < n_elements; i++) {
        max = imd[i] > max ? imd[i] : max;
    }
    // Thresholds are relative: ht to the image max, lt to ht.
    float ht = max * high_threshold;
    float lt = ht * low_threshold;
    for (int i = 0; i < n_elements; i++) {
        float img_v = imd[i];
        if (img_v >= ht) {  // strong pixel
            imd[i] = strong;
        } else if (img_v <= ht && img_v >= lt) {  // weak pixel
            imd[i] = weak;
        }
    }

    // Zero a 3-pixel border; interior pixels are rewritten with their own
    // value (a no-op).
    for (int iy = 0; iy < img->ne[1]; iy++) {
        for (int ix = 0; ix < img->ne[0]; ix++) {
            if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
                ggml_tensor_set_f32(img, ggml_tensor_get_f32(img, ix, iy), ix, iy);
            } else {
                ggml_tensor_set_f32(img, 0.0f, ix, iy);
            }
        }
    }

    // hysteresis
    // NOTE(review): only 6 of the 8 neighbours are checked — (ix+1, iy+1) and
    // (ix-1, iy+1) are missing; confirm whether that is intentional.
    for (int iy = 1; iy < img->ne[1] - 1; iy++) {
        for (int ix = 1; ix < img->ne[0] - 1; ix++) {
            float imd_v = ggml_tensor_get_f32(img, ix, iy);
            if (imd_v == weak) {
                if (ggml_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix + 1, iy) == strong ||
                    ggml_tensor_get_f32(img, ix, iy - 1) == strong || ggml_tensor_get_f32(img, ix, iy + 1) == strong ||
                    ggml_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix - 1, iy) == strong) {
                    ggml_tensor_set_f32(img, strong, ix, iy);
                } else {
                    ggml_tensor_set_f32(img, 0.0f, ix, iy);
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
// Full Canny edge-detection preprocessor for ControlNet input.
// Takes ownership of `img` (an RGB8 buffer, width*height*3), frees it, and
// returns a newly allocated RGB8 edge image (grayscale replicated to 3
// channels, optionally inverted). Returns NULL if the ggml scratch context
// cannot be created.
// NOTE(review): on that failure path `img` is NOT freed — caller beware.
uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MiB scratch
    params.mem_buffer = NULL;
    params.no_alloc = false;
    struct ggml_context* work_ctx = ggml_init(params);

    if (!work_ctx) {
        LOG_ERROR("ggml_init() failed");
        return NULL;
    }

    // Sobel kernels for horizontal / vertical gradients.
    float kX[9] = {
        -1, 0, 1,
        -2, 0, 2,
        -1, 0, 1};

    float kY[9] = {
        1, 2, 1,
        0, 0, 0,
        -1, -2, -1};

    // generate kernel (5x5 Gaussian for the initial smoothing pass)
    int kernel_size = 5;
    struct ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1);
    struct ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
    memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx));
    struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
    memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
    gaussian_kernel(gkernel);
    struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
    struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
    struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray);
    // Pipeline: RGB -> gray -> Gaussian blur -> Sobel gradients ->
    // magnitude/direction -> non-max suppression -> hysteresis thresholding.
    sd_image_to_tensor(img, image);
    grayscale(image, image_gray);
    convolve(image_gray, image_gray, gkernel, 2);
    convolve(image_gray, iX, sf_kx, 1);
    convolve(image_gray, iY, sf_ky, 1);
    prop_hypot(iX, iY, G);
    normalize_tensor(G);
    prop_arctan2(iX, iY, tetha);
    non_max_supression(image_gray, G, tetha);
    threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong);
    // to RGB channels: replicate the edge map into all three channels,
    // optionally inverted.
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
            float gray = ggml_tensor_get_f32(image_gray, ix, iy);
            gray = inverse ? 1.0f - gray : gray;
            ggml_tensor_set_f32(image, gray, ix, iy);
            ggml_tensor_set_f32(image, gray, ix, iy, 1);
            ggml_tensor_set_f32(image, gray, ix, iy, 2);
        }
    }
    free(img);  // input buffer is consumed; caller must use the returned one
    uint8_t* output = sd_tensor_to_image(image);
    ggml_free(work_ctx);
    return output;
}
|
||||||
|
|
||||||
|
#endif // __PREPROCESSING_HPP__
|
35
otherarch/sdcpp/rng.hpp
Normal file
35
otherarch/sdcpp/rng.hpp
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
#ifndef __RNG_H__
|
||||||
|
#define __RNG_H__
|
||||||
|
|
||||||
|
#include <random>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Abstract interface for the noise generators used during sampling: seedable,
// produces n samples from a standard normal distribution per call.
class RNG {
public:
    // Fix: a polymorphic base class needs a virtual destructor so deleting a
    // derived generator through an RNG* is well-defined.
    virtual ~RNG() = default;
    virtual void manual_seed(uint64_t seed) = 0;
    virtual std::vector<float> randn(uint32_t n) = 0;
};

// RNG backed by std::default_random_engine / std::normal_distribution.
// Sequences are deterministic for a given seed (on a given standard library).
class STDDefaultRNG : public RNG {
private:
    std::default_random_engine generator;

public:
    void manual_seed(uint64_t seed) {
        // The engine takes an unsigned int; upper seed bits are discarded.
        generator.seed((unsigned int)seed);
    }

    // Return n samples drawn from N(0, 1).
    std::vector<float> randn(uint32_t n) {
        std::vector<float> result;
        float mean = 0.0;
        float stddev = 1.0;
        std::normal_distribution<float> distribution(mean, stddev);
        for (uint32_t i = 0; i < n; i++) {
            float random_number = distribution(generator);
            result.push_back(random_number);
        }
        return result;
    }
};
|
||||||
|
|
||||||
|
#endif // __RNG_H__
|
125
otherarch/sdcpp/rng_philox.hpp
Normal file
125
otherarch/sdcpp/rng_philox.hpp
Normal file
|
@ -0,0 +1,125 @@
|
||||||
|
#ifndef __RNG_PHILOX_H__
|
||||||
|
#define __RNG_PHILOX_H__
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "rng.hpp"
|
||||||
|
|
||||||
|
// RNG imitating torch CUDA randn on CPU, so CPU runs can reproduce seeds from
// GPU-based Stable Diffusion tools.
// Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
class PhiloxRNG : public RNG {
private:
    uint64_t seed;    // current seed (forms the Philox key)
    uint32_t offset;  // incremented once per randn() call; part of the counter

private:
    // Philox 4x32 round multipliers and Weyl key increments (standard constants).
    std::vector<uint32_t> philox_m = {0xD2511F53, 0xCD9E8D57};
    std::vector<uint32_t> philox_w = {0x9E3779B9, 0xBB67AE85};
    float two_pow32_inv = 2.3283064e-10f;                    // 1 / 2^32
    float two_pow32_inv_2pi = 2.3283064e-10f * 6.2831855f;   // 2*pi / 2^32

    // Split a 64-bit value into {low 32 bits, high 32 bits}.
    std::vector<uint32_t> uint32(uint64_t x) {
        std::vector<uint32_t> result(2);
        result[0] = static_cast<uint32_t>(x & 0xFFFFFFFF);
        result[1] = static_cast<uint32_t>(x >> 32);
        return result;
    }

    // Vectorized overload: split N 64-bit values into a 2xN array of 32-bit halves.
    std::vector<std::vector<uint32_t>> uint32(const std::vector<uint64_t>& x) {
        uint32_t N = (uint32_t)x.size();
        std::vector<std::vector<uint32_t>> result(2, std::vector<uint32_t>(N));

        for (uint32_t i = 0; i < N; ++i) {
            result[0][i] = static_cast<uint32_t>(x[i] & 0xFFFFFFFF);
            result[1][i] = static_cast<uint32_t>(x[i] >> 32);
        }

        return result;
    }

    // A single round of the Philox 4x32 random number generator.
    void philox4_round(std::vector<std::vector<uint32_t>>& counter,
                       const std::vector<std::vector<uint32_t>>& key) {
        uint32_t N = (uint32_t)counter[0].size();
        for (uint32_t i = 0; i < N; i++) {
            std::vector<uint32_t> v1 = uint32(static_cast<uint64_t>(counter[0][i]) * static_cast<uint64_t>(philox_m[0]));
            std::vector<uint32_t> v2 = uint32(static_cast<uint64_t>(counter[2][i]) * static_cast<uint64_t>(philox_m[1]));

            counter[0][i] = v2[1] ^ counter[1][i] ^ key[0][i];
            counter[1][i] = v2[0];
            counter[2][i] = v1[1] ^ counter[3][i] ^ key[1][i];
            counter[3][i] = v1[0];
        }
    }

    // Generates 32-bit random numbers using the Philox 4x32 random number generator.
    // Parameters:
    //     counter : A 4xN array of 32-bit integers representing the counter values (offset into generation).
    //     key     : A 2xN array of 32-bit integers representing the key values (seed).
    //               Mutated in place: bumped by the Weyl constants each round.
    //     rounds  : The number of rounds to perform.
    // Returns:
    //     std::vector<std::vector<uint32_t>>: A 4xN array of 32-bit integers containing the generated random numbers.
    std::vector<std::vector<uint32_t>> philox4_32(std::vector<std::vector<uint32_t>>& counter,
                                                  std::vector<std::vector<uint32_t>>& key,
                                                  int rounds = 10) {
        uint32_t N = (uint32_t)counter[0].size();
        // rounds - 1 rounds with a key bump after each; final round has no bump.
        for (int i = 0; i < rounds - 1; ++i) {
            philox4_round(counter, key);

            for (uint32_t j = 0; j < N; ++j) {
                key[0][j] += philox_w[0];
                key[1][j] += philox_w[1];
            }
        }

        philox4_round(counter, key);
        return counter;
    }

    // Box-Muller transform: map two uniform 32-bit samples to one N(0,1)
    // sample (only the sine branch is used, matching the Python original).
    float box_muller(float x, float y) {
        float u = x * two_pow32_inv + two_pow32_inv / 2;
        float v = y * two_pow32_inv_2pi + two_pow32_inv_2pi / 2;

        float s = sqrt(-2.0f * log(u));

        float r1 = s * sin(v);
        return r1;
    }

public:
    PhiloxRNG(uint64_t seed = 0) {
        this->seed = seed;
        this->offset = 0;
    }

    // Reset to a new seed; restarts the counter stream.
    void manual_seed(uint64_t seed) {
        this->seed = seed;
        this->offset = 0;
    }

    // Return n samples from N(0, 1), deterministic in (seed, call index).
    std::vector<float> randn(uint32_t n) {
        // Counter row 0 carries the per-call offset, row 2 the lane index.
        std::vector<std::vector<uint32_t>> counter(4, std::vector<uint32_t>(n, 0));
        for (uint32_t i = 0; i < n; i++) {
            counter[0][i] = this->offset;
        }

        for (uint32_t i = 0; i < n; i++) {
            counter[2][i] = i;
        }
        this->offset += 1;  // next call produces a fresh stream

        // Key is the seed, replicated per lane and split into 32-bit halves.
        std::vector<uint64_t> key(n, this->seed);
        std::vector<std::vector<uint32_t>> key_uint32 = uint32(key);

        std::vector<std::vector<uint32_t>> g = philox4_32(counter, key_uint32);

        std::vector<float> result;
        for (uint32_t i = 0; i < n; ++i) {
            result.push_back(box_muller((float)g[0][i], (float)g[1][i]));
        }
        return result;
    }
};
|
||||||
|
|
||||||
|
#endif // __RNG_PHILOX_H__
|
1778
otherarch/sdcpp/stable-diffusion.cpp
Normal file
1778
otherarch/sdcpp/stable-diffusion.cpp
Normal file
File diff suppressed because it is too large
Load diff
193
otherarch/sdcpp/stable-diffusion.h
Normal file
193
otherarch/sdcpp/stable-diffusion.h
Normal file
|
@ -0,0 +1,193 @@
|
||||||
|
#ifndef __STABLE_DIFFUSION_H__
|
||||||
|
#define __STABLE_DIFFUSION_H__
|
||||||
|
|
||||||
|
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||||
|
#ifndef SD_BUILD_SHARED_LIB
|
||||||
|
#define SD_API
|
||||||
|
#else
|
||||||
|
#ifdef SD_BUILD_DLL
|
||||||
|
#define SD_API __declspec(dllexport)
|
||||||
|
#else
|
||||||
|
#define SD_API __declspec(dllimport)
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#if __GNUC__ >= 4
|
||||||
|
#define SD_API __attribute__((visibility("default")))
|
||||||
|
#else
|
||||||
|
#define SD_API
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
// Noise generator selection (see rng.hpp / rng_philox.hpp).
enum rng_type_t {
    STD_DEFAULT_RNG,  // std::default_random_engine based
    CUDA_RNG          // Philox, imitates torch CUDA randn
};

// Diffusion sampler selection.
enum sample_method_t {
    EULER_A,
    EULER,
    HEUN,
    DPM2,
    DPMPP2S_A,
    DPMPP2M,
    DPMPP2Mv2,
    LCM,
    N_SAMPLE_METHODS  // number of entries above; not a sampler
};

// Noise schedule selection.
enum schedule_t {
    DEFAULT,
    DISCRETE,
    KARRAS,
    N_SCHEDULES  // number of entries above; not a schedule
};

// same as enum ggml_type — keep numeric values in sync with ggml.h.
enum sd_type_t {
    SD_TYPE_F32 = 0,
    SD_TYPE_F16 = 1,
    SD_TYPE_Q4_0 = 2,
    SD_TYPE_Q4_1 = 3,
    // SD_TYPE_Q4_2 = 4, support has been removed
    // SD_TYPE_Q4_3 (5) support has been removed
    SD_TYPE_Q5_0 = 6,
    SD_TYPE_Q5_1 = 7,
    SD_TYPE_Q8_0 = 8,
    SD_TYPE_Q8_1 = 9,
    // k-quantizations
    SD_TYPE_Q2_K = 10,
    SD_TYPE_Q3_K = 11,
    SD_TYPE_Q4_K = 12,
    SD_TYPE_Q5_K = 13,
    SD_TYPE_Q6_K = 14,
    SD_TYPE_Q8_K = 15,
    SD_TYPE_IQ2_XXS = 16,
    SD_TYPE_IQ2_XS = 17,
    SD_TYPE_IQ3_XXS = 18,
    SD_TYPE_IQ1_S = 19,
    SD_TYPE_IQ4_NL = 20,
    SD_TYPE_I8,
    SD_TYPE_I16,
    SD_TYPE_I32,
    SD_TYPE_COUNT,
};

// Human-readable name of a weight type.
SD_API const char* sd_type_name(enum sd_type_t type);

// Severity passed to the log callback, lowest to highest.
enum sd_log_level_t {
    SD_LOG_DEBUG,
    SD_LOG_INFO,
    SD_LOG_WARN,
    SD_LOG_ERROR
};

// Log sink; `data` is the opaque pointer given to sd_set_log_callback.
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);

// Install a process-wide log callback.
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();

// Raw interleaved 8-bit image buffer.
typedef struct {
    uint32_t width;
    uint32_t height;
    uint32_t channel;  // channels per pixel (3 = RGB)
    uint8_t* data;     // width*height*channel bytes; caller is expected to free()
} sd_image_t;

// Opaque handle to a loaded Stable Diffusion pipeline.
typedef struct sd_ctx_t sd_ctx_t;
|
||||||
|
|
||||||
|
SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
|
||||||
|
const char* vae_path,
|
||||||
|
const char* taesd_path,
|
||||||
|
const char* control_net_path_c_str,
|
||||||
|
const char* lora_model_dir,
|
||||||
|
const char* embed_dir_c_str,
|
||||||
|
bool vae_decode_only,
|
||||||
|
bool vae_tiling,
|
||||||
|
bool free_params_immediately,
|
||||||
|
int n_threads,
|
||||||
|
enum sd_type_t wtype,
|
||||||
|
enum rng_type_t rng_type,
|
||||||
|
enum schedule_t s,
|
||||||
|
bool keep_control_net_cpu);
|
||||||
|
|
||||||
|
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
|
||||||
|
|
||||||
|
SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
|
||||||
|
const char* prompt,
|
||||||
|
const char* negative_prompt,
|
||||||
|
int clip_skip,
|
||||||
|
float cfg_scale,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
enum sample_method_t sample_method,
|
||||||
|
int sample_steps,
|
||||||
|
int64_t seed,
|
||||||
|
int batch_count,
|
||||||
|
const sd_image_t* control_cond,
|
||||||
|
float control_strength);
|
||||||
|
|
||||||
|
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
|
||||||
|
sd_image_t init_image,
|
||||||
|
const char* prompt,
|
||||||
|
const char* negative_prompt,
|
||||||
|
int clip_skip,
|
||||||
|
float cfg_scale,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
enum sample_method_t sample_method,
|
||||||
|
int sample_steps,
|
||||||
|
float strength,
|
||||||
|
int64_t seed,
|
||||||
|
int batch_count);
|
||||||
|
|
||||||
|
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
|
||||||
|
sd_image_t init_image,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
int video_frames,
|
||||||
|
int motion_bucket_id,
|
||||||
|
int fps,
|
||||||
|
float augmentation_level,
|
||||||
|
float min_cfg,
|
||||||
|
float cfg_scale,
|
||||||
|
enum sample_method_t sample_method,
|
||||||
|
int sample_steps,
|
||||||
|
float strength,
|
||||||
|
int64_t seed);
|
||||||
|
|
||||||
|
typedef struct upscaler_ctx_t upscaler_ctx_t;
|
||||||
|
|
||||||
|
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
|
||||||
|
int n_threads,
|
||||||
|
enum sd_type_t wtype);
|
||||||
|
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
|
||||||
|
|
||||||
|
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
|
||||||
|
|
||||||
|
SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type);
|
||||||
|
|
||||||
|
SD_API uint8_t* preprocess_canny(uint8_t* img,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
float high_threshold,
|
||||||
|
float low_threshold,
|
||||||
|
float weak,
|
||||||
|
float strong,
|
||||||
|
bool inverse);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // __STABLE_DIFFUSION_H__
|
259
otherarch/sdcpp/tae.hpp
Normal file
259
otherarch/sdcpp/tae.hpp
Normal file
|
@ -0,0 +1,259 @@
|
||||||
|
#ifndef __TAE_HPP__
|
||||||
|
#define __TAE_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
#include "model.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
=================================== TinyAutoEncoder ===================================
|
||||||
|
References:
|
||||||
|
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
|
||||||
|
https://github.com/madebyollin/taesd/blob/main/taesd.py
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
class TAEBlock : public UnaryBlock {
|
||||||
|
protected:
|
||||||
|
int n_in;
|
||||||
|
int n_out;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TAEBlock(int n_in, int n_out)
|
||||||
|
: n_in(n_in), n_out(n_out) {
|
||||||
|
blocks["conv.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
if (n_in != n_out) {
|
||||||
|
blocks["skip"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [n, n_in, h, w]
|
||||||
|
// return: [n, n_out, h, w]
|
||||||
|
|
||||||
|
auto conv_0 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.0"]);
|
||||||
|
auto conv_2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.2"]);
|
||||||
|
auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
|
||||||
|
|
||||||
|
auto h = conv_0->forward(ctx, x);
|
||||||
|
h = ggml_relu_inplace(ctx, h);
|
||||||
|
h = conv_2->forward(ctx, h);
|
||||||
|
h = ggml_relu_inplace(ctx, h);
|
||||||
|
h = conv_4->forward(ctx, h);
|
||||||
|
|
||||||
|
if (n_in != n_out) {
|
||||||
|
auto skip = std::dynamic_pointer_cast<Conv2d>(blocks["skip"]);
|
||||||
|
LOG_DEBUG("skip");
|
||||||
|
x = skip->forward(ctx, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
h = ggml_add(ctx, h, x);
|
||||||
|
h = ggml_relu_inplace(ctx, h);
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class TinyEncoder : public UnaryBlock {
|
||||||
|
int in_channels = 3;
|
||||||
|
int channels = 64;
|
||||||
|
int z_channels = 4;
|
||||||
|
int num_blocks = 3;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TinyEncoder() {
|
||||||
|
int index = 0;
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [n, in_channels, h, w]
|
||||||
|
// return: [n, z_channels, h/8, w/8]
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks * 3 + 6; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
|
||||||
|
|
||||||
|
x = block->forward(ctx, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class TinyDecoder : public UnaryBlock {
|
||||||
|
int z_channels = 4;
|
||||||
|
int channels = 64;
|
||||||
|
int out_channels = 3;
|
||||||
|
int num_blocks = 3;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TinyDecoder(int index = 0) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
index++; // nn.ReLU()
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
index++; // nn.Upsample()
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
index++; // nn.Upsample()
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
index++; // nn.Upsample()
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
|
||||||
|
// z: [n, z_channels, h, w]
|
||||||
|
// return: [n, out_channels, h*8, w*8]
|
||||||
|
|
||||||
|
auto h = ggml_scale(ctx, z, 1.0f / 3.0f);
|
||||||
|
h = ggml_tanh_inplace(ctx, h);
|
||||||
|
h = ggml_scale(ctx, h, 3.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks * 3 + 10; i++) {
|
||||||
|
if (blocks.find(std::to_string(i)) == blocks.end()) {
|
||||||
|
if (i == 1) {
|
||||||
|
h = ggml_relu_inplace(ctx, h);
|
||||||
|
} else {
|
||||||
|
h = ggml_upscale(ctx, h, 2);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
|
||||||
|
|
||||||
|
h = block->forward(ctx, h);
|
||||||
|
}
|
||||||
|
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class TAESD : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
bool decode_only;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TAESD(bool decode_only = true)
|
||||||
|
: decode_only(decode_only) {
|
||||||
|
blocks["decoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyDecoder());
|
||||||
|
|
||||||
|
if (!decode_only) {
|
||||||
|
blocks["encoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyEncoder());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
|
||||||
|
auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
|
||||||
|
return decoder->forward(ctx, z);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
|
||||||
|
return encoder->forward(ctx, x);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct TinyAutoEncoder : public GGMLModule {
|
||||||
|
TAESD taesd;
|
||||||
|
bool decode_only = false;
|
||||||
|
|
||||||
|
TinyAutoEncoder(ggml_backend_t backend,
|
||||||
|
ggml_type wtype,
|
||||||
|
bool decoder_only = true)
|
||||||
|
: decode_only(decoder_only),
|
||||||
|
taesd(decode_only),
|
||||||
|
GGMLModule(backend, wtype) {
|
||||||
|
taesd.init(params_ctx, wtype);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() {
|
||||||
|
return "taesd";
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_mem_size() {
|
||||||
|
return taesd.get_params_mem_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_num() {
|
||||||
|
return taesd.get_params_num();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool load_from_file(const std::string& file_path) {
|
||||||
|
LOG_INFO("loading taesd from '%s'", file_path.c_str());
|
||||||
|
alloc_params_buffer();
|
||||||
|
std::map<std::string, ggml_tensor*> taesd_tensors;
|
||||||
|
taesd.get_param_tensors(taesd_tensors);
|
||||||
|
std::set<std::string> ignore_tensors;
|
||||||
|
if (decode_only) {
|
||||||
|
ignore_tensors.insert("encoder.");
|
||||||
|
}
|
||||||
|
|
||||||
|
ModelLoader model_loader;
|
||||||
|
if (!model_loader.init_from_file(file_path)) {
|
||||||
|
LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors);
|
||||||
|
|
||||||
|
if (!success) {
|
||||||
|
LOG_ERROR("load tae tensors from model loader failed");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_INFO("taesd model loaded");
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
|
||||||
|
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
||||||
|
z = to_backend(z);
|
||||||
|
struct ggml_tensor* out = decode_graph ? taesd.decode(compute_ctx, z) : taesd.encode(compute_ctx, z);
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute(const int n_threads,
|
||||||
|
struct ggml_tensor* z,
|
||||||
|
bool decode_graph,
|
||||||
|
struct ggml_tensor** output,
|
||||||
|
struct ggml_context* output_ctx = NULL) {
|
||||||
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
|
return build_graph(z, decode_graph);
|
||||||
|
};
|
||||||
|
|
||||||
|
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __TAE_HPP__
|
2
otherarch/sdcpp/thirdparty/README.md
vendored
Normal file
2
otherarch/sdcpp/thirdparty/README.md
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
- json.hpp library from: https://github.com/nlohmann/json
|
||||||
|
- ZIP Library from: https://github.com/kuba--/zip
|
24596
otherarch/sdcpp/thirdparty/json.hpp
vendored
Normal file
24596
otherarch/sdcpp/thirdparty/json.hpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
10130
otherarch/sdcpp/thirdparty/miniz.h
vendored
Normal file
10130
otherarch/sdcpp/thirdparty/miniz.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
7987
otherarch/sdcpp/thirdparty/stb_image.h
vendored
Normal file
7987
otherarch/sdcpp/thirdparty/stb_image.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
1741
otherarch/sdcpp/thirdparty/stb_image_write.h
vendored
Normal file
1741
otherarch/sdcpp/thirdparty/stb_image_write.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
1836
otherarch/sdcpp/thirdparty/zip.c
vendored
Normal file
1836
otherarch/sdcpp/thirdparty/zip.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
509
otherarch/sdcpp/thirdparty/zip.h
vendored
Normal file
509
otherarch/sdcpp/thirdparty/zip.h
vendored
Normal file
|
@ -0,0 +1,509 @@
|
||||||
|
/*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#ifndef ZIP_H
|
||||||
|
#define ZIP_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
|
||||||
|
#ifndef ZIP_SHARED
|
||||||
|
#define ZIP_EXPORT
|
||||||
|
#else
|
||||||
|
#ifdef _WIN32
|
||||||
|
#ifdef ZIP_BUILD_SHARED
|
||||||
|
#define ZIP_EXPORT __declspec(dllexport)
|
||||||
|
#else
|
||||||
|
#define ZIP_EXPORT __declspec(dllimport)
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define ZIP_EXPORT __attribute__((visibility("default")))
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(_POSIX_C_SOURCE) && defined(_MSC_VER)
|
||||||
|
// 64-bit Windows is the only mainstream platform
|
||||||
|
// where sizeof(long) != sizeof(void*)
|
||||||
|
#ifdef _WIN64
|
||||||
|
typedef long long ssize_t; /* byte count or error */
|
||||||
|
#else
|
||||||
|
typedef long ssize_t; /* byte count or error */
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @mainpage
|
||||||
|
*
|
||||||
|
* Documentation for @ref zip.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @addtogroup zip
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default zip compression level.
|
||||||
|
*/
|
||||||
|
#define ZIP_DEFAULT_COMPRESSION_LEVEL 6
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Error codes
|
||||||
|
*/
|
||||||
|
#define ZIP_ENOINIT -1 // not initialized
|
||||||
|
#define ZIP_EINVENTNAME -2 // invalid entry name
|
||||||
|
#define ZIP_ENOENT -3 // entry not found
|
||||||
|
#define ZIP_EINVMODE -4 // invalid zip mode
|
||||||
|
#define ZIP_EINVLVL -5 // invalid compression level
|
||||||
|
#define ZIP_ENOSUP64 -6 // no zip 64 support
|
||||||
|
#define ZIP_EMEMSET -7 // memset error
|
||||||
|
#define ZIP_EWRTENT -8 // cannot write data to entry
|
||||||
|
#define ZIP_ETDEFLINIT -9 // cannot initialize tdefl compressor
|
||||||
|
#define ZIP_EINVIDX -10 // invalid index
|
||||||
|
#define ZIP_ENOHDR -11 // header not found
|
||||||
|
#define ZIP_ETDEFLBUF -12 // cannot flush tdefl buffer
|
||||||
|
#define ZIP_ECRTHDR -13 // cannot create entry header
|
||||||
|
#define ZIP_EWRTHDR -14 // cannot write entry header
|
||||||
|
#define ZIP_EWRTDIR -15 // cannot write to central dir
|
||||||
|
#define ZIP_EOPNFILE -16 // cannot open file
|
||||||
|
#define ZIP_EINVENTTYPE -17 // invalid entry type
|
||||||
|
#define ZIP_EMEMNOALLOC -18 // extracting data using no memory allocation
|
||||||
|
#define ZIP_ENOFILE -19 // file not found
|
||||||
|
#define ZIP_ENOPERM -20 // no permission
|
||||||
|
#define ZIP_EOOMEM -21 // out of memory
|
||||||
|
#define ZIP_EINVZIPNAME -22 // invalid zip archive name
|
||||||
|
#define ZIP_EMKDIR -23 // make dir error
|
||||||
|
#define ZIP_ESYMLINK -24 // symlink error
|
||||||
|
#define ZIP_ECLSZIP -25 // close archive error
|
||||||
|
#define ZIP_ECAPSIZE -26 // capacity size too small
|
||||||
|
#define ZIP_EFSEEK -27 // fseek error
|
||||||
|
#define ZIP_EFREAD -28 // fread error
|
||||||
|
#define ZIP_EFWRITE -29 // fwrite error
|
||||||
|
#define ZIP_ERINIT -30 // cannot initialize reader
|
||||||
|
#define ZIP_EWINIT -31 // cannot initialize writer
|
||||||
|
#define ZIP_EWRINIT -32 // cannot initialize writer from reader
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Looks up the error message string corresponding to an error number.
|
||||||
|
* @param errnum error number
|
||||||
|
* @return error message string corresponding to errnum or NULL if error is not
|
||||||
|
* found.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT const char *zip_strerror(int errnum);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @struct zip_t
|
||||||
|
*
|
||||||
|
* This data structure is used throughout the library to represent zip archive -
|
||||||
|
* forward declaration.
|
||||||
|
*/
|
||||||
|
struct zip_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens zip archive with compression level using the given mode.
|
||||||
|
*
|
||||||
|
* @param zipname zip archive file name.
|
||||||
|
* @param level compression level (0-9 are the standard zlib-style levels).
|
||||||
|
* @param mode file access mode.
|
||||||
|
* - 'r': opens a file for reading/extracting (the file must exists).
|
||||||
|
* - 'w': creates an empty file for writing.
|
||||||
|
* - 'a': appends to an existing archive.
|
||||||
|
*
|
||||||
|
* @return the zip archive handler or NULL on error
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT struct zip_t *zip_open(const char *zipname, int level,
|
||||||
|
char mode);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens zip archive with compression level using the given mode.
|
||||||
|
* The function additionally returns @param errnum -
|
||||||
|
*
|
||||||
|
* @param zipname zip archive file name.
|
||||||
|
* @param level compression level (0-9 are the standard zlib-style levels).
|
||||||
|
* @param mode file access mode.
|
||||||
|
* - 'r': opens a file for reading/extracting (the file must exists).
|
||||||
|
* - 'w': creates an empty file for writing.
|
||||||
|
* - 'a': appends to an existing archive.
|
||||||
|
* @param errnum 0 on success, negative number (< 0) on error.
|
||||||
|
*
|
||||||
|
* @return the zip archive handler or NULL on error
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT struct zip_t *
|
||||||
|
zip_openwitherror(const char *zipname, int level, char mode, int *errnum);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes the zip archive, releases resources - always finalize.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT void zip_close(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines if the archive has a zip64 end of central directory headers.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_is64(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens an entry by name in the zip archive.
|
||||||
|
*
|
||||||
|
* For zip archive opened in 'w' or 'a' mode the function will append
|
||||||
|
* a new entry. In readonly mode the function tries to locate the entry
|
||||||
|
* in global dictionary.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param entryname an entry name in local dictionary.
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_open(struct zip_t *zip, const char *entryname);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens an entry by name in the zip archive.
|
||||||
|
*
|
||||||
|
* For zip archive opened in 'w' or 'a' mode the function will append
|
||||||
|
* a new entry. In readonly mode the function tries to locate the entry
|
||||||
|
* in global dictionary (case sensitive).
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param entryname an entry name in local dictionary (case sensitive).
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_opencasesensitive(struct zip_t *zip,
|
||||||
|
const char *entryname);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens a new entry by index in the zip archive.
|
||||||
|
*
|
||||||
|
* This function is only valid if zip archive was opened in 'r' (readonly) mode.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param index index in local dictionary.
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_openbyindex(struct zip_t *zip, size_t index);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes a zip entry, flushes buffer and releases resources.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_close(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a local name of the current zip entry.
|
||||||
|
*
|
||||||
|
* The main difference between user's entry name and local entry name
|
||||||
|
* is optional relative path.
|
||||||
|
* Following .ZIP File Format Specification - the path stored MUST not contain
|
||||||
|
* a drive or device letter, or a leading slash.
|
||||||
|
* All slashes MUST be forward slashes '/' as opposed to backwards slashes '\'
|
||||||
|
* for compatibility with Amiga and UNIX file systems etc.
|
||||||
|
*
|
||||||
|
* @param zip: zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the pointer to the current zip entry name, or NULL on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT const char *zip_entry_name(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an index of the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the index on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT ssize_t zip_entry_index(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines if the current zip entry is a directory entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_isdir(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the uncompressed size of the current zip entry.
|
||||||
|
* Alias for zip_entry_uncomp_size (for backward compatibility).
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the uncompressed size in bytes.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT unsigned long long zip_entry_size(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the uncompressed size of the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the uncompressed size in bytes.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT unsigned long long zip_entry_uncomp_size(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the compressed size of the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the compressed size in bytes.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT unsigned long long zip_entry_comp_size(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns CRC-32 checksum of the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the CRC-32 checksum.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT unsigned int zip_entry_crc32(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compresses an input buffer for the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param buf input buffer.
|
||||||
|
* @param bufsize input buffer size (in bytes).
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_write(struct zip_t *zip, const void *buf,
|
||||||
|
size_t bufsize);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compresses a file for the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param filename input file.
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_fwrite(struct zip_t *zip, const char *filename);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts the current zip entry into output buffer.
|
||||||
|
*
|
||||||
|
* The function allocates sufficient memory for a output buffer.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param buf output buffer.
|
||||||
|
* @param bufsize output buffer size (in bytes).
|
||||||
|
*
|
||||||
|
* @note remember to release memory allocated for a output buffer.
|
||||||
|
* for large entries, please take a look at zip_entry_extract function.
|
||||||
|
*
|
||||||
|
* @return the return code - the number of bytes actually read on success.
|
||||||
|
* Otherwise a negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT ssize_t zip_entry_read(struct zip_t *zip, void **buf,
|
||||||
|
size_t *bufsize);
|
||||||
|
|
||||||
|
/**
 * Extracts the current zip entry into a memory buffer using no memory
 * allocation.
 *
 * @param zip zip archive handler.
 * @param buf preallocated output buffer.
 * @param bufsize output buffer size (in bytes).
 *
 * @note ensure the supplied output buffer is large enough.
 *       The zip_entry_size function (returns uncompressed size for the current
 *       entry) can be handy to estimate how big a buffer is needed.
 *       For large entries, please take a look at the zip_entry_extract function.
 *
 * @return the return code - the number of bytes actually read on success.
 *         Otherwise a negative number (< 0) on error (e.g. bufsize is not large
 *         enough).
 */
extern ZIP_EXPORT ssize_t zip_entry_noallocread(struct zip_t *zip, void *buf,
                                                size_t bufsize);

/**
 * Extracts the current zip entry into an output file.
 *
 * @param zip zip archive handler.
 * @param filename output file.
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int zip_entry_fread(struct zip_t *zip, const char *filename);

/**
 * Extracts the current zip entry using a callback function (on_extract).
 *
 * @param zip zip archive handler.
 * @param on_extract callback function.
 * @param arg opaque pointer (optional argument, which you can pass to the
 *        on_extract callback)
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int
zip_entry_extract(struct zip_t *zip,
                  size_t (*on_extract)(void *arg, uint64_t offset,
                                       const void *data, size_t size),
                  void *arg);
|
||||||
|
|
||||||
|
/**
 * Returns the number of all entries (files and directories) in the zip archive.
 *
 * @param zip zip archive handler.
 *
 * @return the return code - the number of entries on success, negative number
 *         (< 0) on error.
 */
extern ZIP_EXPORT ssize_t zip_entries_total(struct zip_t *zip);

/**
 * Deletes zip archive entries.
 *
 * @param zip zip archive handler.
 * @param entries array of zip archive entries to be deleted.
 * @param len the number of entries to be deleted.
 *
 * @return the number of deleted entries, or negative number (< 0) on error.
 */
extern ZIP_EXPORT ssize_t zip_entries_delete(struct zip_t *zip,
                                             char *const entries[], size_t len);

/**
 * Extracts a zip archive stream into a directory.
 *
 * If on_extract is not NULL, the callback will be called after
 * each zip entry is successfully extracted.
 * Returning a negative value from the callback will cause abort and return an
 * error. The last argument (void *arg) is optional, which you can use to pass
 * data to the on_extract callback.
 *
 * @param stream zip archive stream.
 * @param size stream size.
 * @param dir output directory.
 * @param on_extract on extract callback.
 * @param arg opaque pointer.
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int
zip_stream_extract(const char *stream, size_t size, const char *dir,
                   int (*on_extract)(const char *filename, void *arg),
                   void *arg);
|
||||||
|
|
||||||
|
/**
 * Opens a zip archive stream into memory.
 *
 * @param stream zip archive stream.
 * @param size stream size.
 * @param level compression level (0-9 are the standard zlib-style levels).
 * @param mode file access mode.
 *        - 'r': opens a file for reading/extracting (the file must exist).
 *        - 'w': creates an empty file for writing.
 *        - 'a': appends to an existing archive.
 *
 * @return the zip archive handler or NULL on error
 */
extern ZIP_EXPORT struct zip_t *zip_stream_open(const char *stream, size_t size,
                                                int level, char mode);

/**
 * Opens a zip archive stream into memory.
 * In addition to zip_stream_open, reports the failure reason via errnum.
 *
 * @param stream zip archive stream.
 * @param size stream size.
 * @param level compression level (0-9 are the standard zlib-style levels).
 * @param mode file access mode.
 *        - 'r': opens a file for reading/extracting (the file must exist).
 *        - 'w': creates an empty file for writing.
 *        - 'a': appends to an existing archive.
 * @param errnum 0 on success, negative number (< 0) on error.
 *
 * @return the zip archive handler or NULL on error
 */
extern ZIP_EXPORT struct zip_t *zip_stream_openwitherror(const char *stream,
                                                         size_t size, int level,
                                                         char mode,
                                                         int *errnum);

/**
 * Copies the zip archive stream output buffer.
 *
 * @param zip zip archive handler.
 * @param buf output buffer. The caller should free buf.
 * @param bufsize output buffer size (in bytes).
 *
 * @return copy size
 */
extern ZIP_EXPORT ssize_t zip_stream_copy(struct zip_t *zip, void **buf,
                                          size_t *bufsize);

/**
 * Closes the zip archive and releases resources.
 *
 * @param zip zip archive handler.
 */
extern ZIP_EXPORT void zip_stream_close(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
 * Creates a new archive and puts files into a single zip archive.
 *
 * @param zipname zip archive file.
 * @param filenames input files.
 * @param len number of input files.
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int zip_create(const char *zipname, const char *filenames[],
                                 size_t len);

/**
 * Extracts a zip archive file into a directory.
 *
 * If on_extract_entry is not NULL, the callback will be called after
 * each zip entry is successfully extracted.
 * Returning a negative value from the callback will cause abort and return an
 * error. The last argument (void *arg) is optional, which you can use to pass
 * data to the on_extract_entry callback.
 *
 * @param zipname zip archive file.
 * @param dir output directory.
 * @param on_extract_entry on extract callback.
 * @param arg opaque pointer.
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int zip_extract(const char *zipname, const char *dir,
                                  int (*on_extract_entry)(const char *filename,
                                                          void *arg),
                                  void *arg);
/** @} */
#ifdef __cplusplus
} // extern "C"
#endif

#endif // end of include guard
|
661
otherarch/sdcpp/unet.hpp
Normal file
661
otherarch/sdcpp/unet.hpp
Normal file
|
@ -0,0 +1,661 @@
|
||||||
|
#ifndef __UNET_HPP__
|
||||||
|
#define __UNET_HPP__
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#include "model.h"
|
||||||
|
|
||||||
|
/*==================================================== UnetModel =====================================================*/
|
||||||
|
|
||||||
|
#define UNET_GRAPH_SIZE 10240
|
||||||
|
|
||||||
|
class SpatialVideoTransformer : public SpatialTransformer {
|
||||||
|
protected:
|
||||||
|
int64_t time_depth;
|
||||||
|
int64_t max_time_embed_period;
|
||||||
|
|
||||||
|
public:
|
||||||
|
SpatialVideoTransformer(int64_t in_channels,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t d_head,
|
||||||
|
int64_t depth,
|
||||||
|
int64_t context_dim,
|
||||||
|
int64_t time_depth = 1,
|
||||||
|
int64_t max_time_embed_period = 10000)
|
||||||
|
: SpatialTransformer(in_channels, n_head, d_head, depth, context_dim),
|
||||||
|
max_time_embed_period(max_time_embed_period) {
|
||||||
|
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
|
||||||
|
// use_spatial_context is always True
|
||||||
|
// merge_strategy is always learned_with_images
|
||||||
|
// merge_factor is loaded from weights
|
||||||
|
// time_context_dim is always None
|
||||||
|
// ff_in is always True
|
||||||
|
// disable_self_attn is always False
|
||||||
|
// disable_temporal_crossattention is always False
|
||||||
|
|
||||||
|
int64_t inner_dim = n_head * d_head;
|
||||||
|
|
||||||
|
GGML_ASSERT(depth == time_depth);
|
||||||
|
GGML_ASSERT(in_channels == inner_dim);
|
||||||
|
|
||||||
|
int64_t time_mix_d_head = d_head;
|
||||||
|
int64_t n_time_mix_heads = n_head;
|
||||||
|
int64_t time_mix_inner_dim = time_mix_d_head * n_time_mix_heads; // equal to inner_dim
|
||||||
|
int64_t time_context_dim = context_dim;
|
||||||
|
|
||||||
|
for (int i = 0; i < time_depth; i++) {
|
||||||
|
std::string name = "time_stack." + std::to_string(i);
|
||||||
|
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim,
|
||||||
|
n_time_mix_heads,
|
||||||
|
time_mix_d_head,
|
||||||
|
time_context_dim,
|
||||||
|
true));
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t time_embed_dim = in_channels * 4;
|
||||||
|
blocks["time_pos_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim));
|
||||||
|
// time_pos_embed.1 is nn.SiLU()
|
||||||
|
blocks["time_pos_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, in_channels));
|
||||||
|
|
||||||
|
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||||
|
struct ggml_tensor* x,
|
||||||
|
struct ggml_tensor* context,
|
||||||
|
int timesteps) {
|
||||||
|
// x: [N, in_channels, h, w] aka [b*t, in_channels, h, w], t == timesteps
|
||||||
|
// context: [N, max_position(aka n_context), hidden_size(aka context_dim)] aka [b*t, n_context, context_dim], t == timesteps
|
||||||
|
// t_emb: [N, in_channels] aka [b*t, in_channels]
|
||||||
|
// timesteps is num_frames
|
||||||
|
// time_context is always None
|
||||||
|
// image_only_indicator is always tensor([0.])
|
||||||
|
// transformer_options is not used
|
||||||
|
// GGML_ASSERT(ggml_n_dims(context) == 3);
|
||||||
|
|
||||||
|
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
|
||||||
|
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
|
||||||
|
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
|
||||||
|
auto time_pos_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_pos_embed.0"]);
|
||||||
|
auto time_pos_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_pos_embed.2"]);
|
||||||
|
auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
|
||||||
|
|
||||||
|
auto x_in = x;
|
||||||
|
int64_t n = x->ne[3];
|
||||||
|
int64_t h = x->ne[1];
|
||||||
|
int64_t w = x->ne[0];
|
||||||
|
int64_t inner_dim = n_head * d_head;
|
||||||
|
|
||||||
|
GGML_ASSERT(n == timesteps); // We compute cond and uncond separately, so batch_size==1
|
||||||
|
|
||||||
|
auto time_context = context; // [b*t, n_context, context_dim]
|
||||||
|
auto spatial_context = context;
|
||||||
|
// time_context_first_timestep = time_context[::timesteps]
|
||||||
|
auto time_context_first_timestep = ggml_view_3d(ctx,
|
||||||
|
time_context,
|
||||||
|
time_context->ne[0],
|
||||||
|
time_context->ne[1],
|
||||||
|
1,
|
||||||
|
time_context->nb[1],
|
||||||
|
time_context->nb[2],
|
||||||
|
0); // [b, n_context, context_dim]
|
||||||
|
time_context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32,
|
||||||
|
time_context_first_timestep->ne[0],
|
||||||
|
time_context_first_timestep->ne[1],
|
||||||
|
time_context_first_timestep->ne[2] * h * w);
|
||||||
|
time_context = ggml_repeat(ctx, time_context_first_timestep, time_context); // [b*h*w, n_context, context_dim]
|
||||||
|
|
||||||
|
x = norm->forward(ctx, x);
|
||||||
|
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
|
||||||
|
|
||||||
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
|
||||||
|
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
|
||||||
|
|
||||||
|
auto num_frames = ggml_arange(ctx, 0, timesteps, 1);
|
||||||
|
// since b is 1, no need to do repeat
|
||||||
|
auto t_emb = ggml_nn_timestep_embedding(ctx, num_frames, in_channels, max_time_embed_period); // [N, in_channels]
|
||||||
|
|
||||||
|
auto emb = time_pos_embed_0->forward(ctx, t_emb);
|
||||||
|
emb = ggml_silu_inplace(ctx, emb);
|
||||||
|
emb = time_pos_embed_2->forward(ctx, emb); // [N, in_channels]
|
||||||
|
emb = ggml_reshape_3d(ctx, emb, emb->ne[0], 1, emb->ne[1]); // [N, 1, in_channels]
|
||||||
|
|
||||||
|
for (int i = 0; i < depth; i++) {
|
||||||
|
std::string transformer_name = "transformer_blocks." + std::to_string(i);
|
||||||
|
std::string time_stack_name = "time_stack." + std::to_string(i);
|
||||||
|
|
||||||
|
auto block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[transformer_name]);
|
||||||
|
auto mix_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[time_stack_name]);
|
||||||
|
|
||||||
|
x = block->forward(ctx, x, spatial_context); // [N, h * w, inner_dim]
|
||||||
|
|
||||||
|
// in_channels == inner_dim
|
||||||
|
auto x_mix = x;
|
||||||
|
x_mix = ggml_add(ctx, x_mix, emb); // [N, h * w, inner_dim]
|
||||||
|
|
||||||
|
int64_t N = x_mix->ne[2];
|
||||||
|
int64_t T = timesteps;
|
||||||
|
int64_t B = N / T;
|
||||||
|
int64_t S = x_mix->ne[1];
|
||||||
|
int64_t C = x_mix->ne[0];
|
||||||
|
|
||||||
|
x_mix = ggml_reshape_4d(ctx, x_mix, C, S, T, B); // (b t) s c -> b t s c
|
||||||
|
x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3)); // b t s c -> b s t c
|
||||||
|
x_mix = ggml_reshape_3d(ctx, x_mix, C, T, S * B); // b s t c -> (b s) t c
|
||||||
|
|
||||||
|
x_mix = mix_block->forward(ctx, x_mix, time_context); // [B * h * w, T, inner_dim]
|
||||||
|
|
||||||
|
x_mix = ggml_reshape_4d(ctx, x_mix, C, T, S, B); // (b s) t c -> b s t c
|
||||||
|
x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3)); // b s t c -> b t s c
|
||||||
|
x_mix = ggml_reshape_3d(ctx, x_mix, C, S, T * B); // b t s c -> (b t) s c
|
||||||
|
|
||||||
|
x = time_mixer->forward(ctx, x, x_mix); // [N, h * w, inner_dim]
|
||||||
|
}
|
||||||
|
|
||||||
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
|
||||||
|
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
|
||||||
|
|
||||||
|
// proj_out
|
||||||
|
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
|
||||||
|
|
||||||
|
x = ggml_add(ctx, x, x_in);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ldm.modules.diffusionmodules.openaimodel.UNetModel
// Builds the Stable Diffusion UNet as a tree of named GGMLBlocks whose keys
// ("input_blocks.N.M", "middle_block.N", "output_blocks.N.M", ...) must match
// the checkpoint tensor names exactly, then wires them together in forward().
class UnetModelBlock : public GGMLBlock {
protected:
    SDVersion version = VERSION_1_x;
    // network hparams (SD 1.x defaults; adjusted per version in the ctor)
    int in_channels                        = 4;
    int out_channels                       = 4;
    int num_res_blocks                     = 2;
    std::vector<int> attention_resolutions = {4, 2, 1};
    std::vector<int> channel_mult          = {1, 2, 4, 4};
    std::vector<int> transformer_depth     = {1, 1, 1, 1};
    int time_embed_dim                     = 1280;  // model_channels*4
    int num_heads                          = 8;
    int num_head_channels                  = -1;  // channels // num_heads
    int context_dim                        = 768;  // 1024 for VERSION_2_x, 2048 for VERSION_XL

public:
    int model_channels  = 320;
    int adm_in_channels = 2816;  // only for VERSION_XL/SVD

    // Registers every sub-block; block keys mirror the original PyTorch
    // module names so weights can be loaded by name.
    UnetModelBlock(SDVersion version = VERSION_1_x)
        : version(version) {
        // per-version hyperparameter overrides
        if (version == VERSION_2_x) {
            context_dim       = 1024;
            num_head_channels = 64;
            num_heads         = -1;
        } else if (version == VERSION_XL) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
            channel_mult          = {1, 2, 4};
            transformer_depth     = {1, 2, 10};
            num_head_channels     = 64;
            num_heads             = -1;
        } else if (version == VERSION_SVD) {
            in_channels       = 8;
            out_channels      = 4;
            context_dim       = 1024;
            adm_in_channels   = 768;
            num_head_channels = 64;
            num_heads         = -1;
        }
        // dims is always 2
        // use_temporal_attention is always True for SVD

        // timestep embedding MLP: Linear -> SiLU -> Linear
        blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
        // time_embed_1 is nn.SiLU()
        blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

        // class/conditioning embedding (SDXL/SVD only)
        if (version == VERSION_XL || version == VERSION_SVD) {
            blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
            // label_emb_1 is nn.SiLU()
            blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
        }

        // input_blocks
        blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));

        std::vector<int> input_block_chans;  // skip-connection channel counts, consumed in reverse by output_blocks
        input_block_chans.push_back(model_channels);
        int ch              = model_channels;
        int input_block_idx = 0;
        int ds              = 1;  // current downsample factor

        // SVD swaps in video-aware variants of the res/attention blocks
        auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
            if (version == VERSION_SVD) {
                return new VideoResBlock(channels, emb_channels, out_channels);
            } else {
                return new ResBlock(channels, emb_channels, out_channels);
            }
        };

        auto get_attention_layer = [&](int64_t in_channels,
                                       int64_t n_head,
                                       int64_t d_head,
                                       int64_t depth,
                                       int64_t context_dim) -> SpatialTransformer* {
            if (version == VERSION_SVD) {
                return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim);
            } else {
                return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
            }
        };

        // encoder: per resolution level, num_res_blocks resblocks (+ attention
        // at the configured downsample factors), then a downsample block
        size_t len_mults = channel_mult.size();
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));

                ch = mult * model_channels;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    int n_head = num_heads;
                    int d_head = ch / num_heads;
                    if (num_head_channels != -1) {  // fixed head size takes precedence over fixed head count
                        d_head = num_head_channels;
                        n_head = ch / d_head;
                    }
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                      n_head,
                                                                                      d_head,
                                                                                      transformer_depth[i],
                                                                                      context_dim));
                }
                input_block_chans.push_back(ch);
            }
            if (i != len_mults - 1) {  // no downsample after the last level
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));

                input_block_chans.push_back(ch);
                ds *= 2;
            }
        }

        // middle blocks: resblock -> attention -> resblock at the bottleneck
        int n_head = num_heads;
        int d_head = ch / num_heads;
        if (num_head_channels != -1) {
            d_head = num_head_channels;
            n_head = ch / d_head;
        }
        blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
        blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                  n_head,
                                                                                  d_head,
                                                                                  transformer_depth[transformer_depth.size() - 1],
                                                                                  context_dim));
        blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));

        // output_blocks (decoder): mirrors the encoder in reverse; each
        // resblock consumes a skip connection popped off input_block_chans
        int output_block_idx = 0;
        for (int i = (int)len_mults - 1; i >= 0; i--) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks + 1; j++) {
                int ich = input_block_chans.back();
                input_block_chans.pop_back();

                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));

                ch                = mult * model_channels;
                int up_sample_idx = 1;  // sub-index of the upsample block (shifts to 2 if attention is present)
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    int n_head = num_heads;
                    int d_head = ch / num_heads;
                    if (num_head_channels != -1) {
                        d_head = num_head_channels;
                        n_head = ch / d_head;
                    }
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
                    blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, transformer_depth[i], context_dim));

                    up_sample_idx++;
                }

                if (i > 0 && j == num_res_blocks) {  // upsample at the end of every level except the last
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
                    blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));

                    ds /= 2;
                }

                output_block_idx += 1;
            }
        }

        // out: GroupNorm -> SiLU -> Conv2d back to latent channels
        blocks["out.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(ch));  // ch == model_channels
        // out_1 is nn.SiLU()
        blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
    }

    // Dispatches to VideoResBlock (SVD) or plain ResBlock for the named block.
    struct ggml_tensor* resblock_forward(std::string name,
                                         struct ggml_context* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb,
                                         int num_video_frames) {
        if (version == VERSION_SVD) {
            auto block = std::dynamic_pointer_cast<VideoResBlock>(blocks[name]);

            return block->forward(ctx, x, emb, num_video_frames);
        } else {
            auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);

            return block->forward(ctx, x, emb);
        }
    }

    // Dispatches to SpatialVideoTransformer (SVD) or SpatialTransformer for
    // the named block.
    struct ggml_tensor* attention_layer_forward(std::string name,
                                                struct ggml_context* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context,
                                                int timesteps) {
        if (version == VERSION_SVD) {
            auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]);

            return block->forward(ctx, x, context, timesteps);
        } else {
            auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);

            return block->forward(ctx, x, context);
        }
    }

    // Builds the full UNet forward pass into the ggml graph.
    // Optional ControlNet residuals in `controls` are scaled by
    // control_strength and added to the middle block output and to each skip
    // connection (consumed from the back of the vector).
    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* timesteps,
                                struct ggml_tensor* context,
                                struct ggml_tensor* c_concat              = NULL,
                                struct ggml_tensor* y                     = NULL,
                                int num_video_frames                      = -1,
                                std::vector<struct ggml_tensor*> controls = {},
                                float control_strength                    = 0.f) {
        // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
        // timesteps: [N,]
        // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
        // c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
        // return: [N, out_channels, h, w]

        // broadcast the conditioning tensors to the batch size of x if needed
        if (context != NULL) {
            if (context->ne[2] != x->ne[3]) {
                context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }

        if (c_concat != NULL) {
            if (c_concat->ne[3] != x->ne[3]) {
                c_concat = ggml_repeat(ctx, c_concat, x);
            }
            x = ggml_concat(ctx, x, c_concat);
        }

        if (y != NULL) {
            if (y->ne[1] != x->ne[3]) {
                y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }

        auto time_embed_0     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
        auto time_embed_2     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
        auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);

        auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out.0"]);
        auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);

        auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]

        auto emb = time_embed_0->forward(ctx, t_emb);
        emb      = ggml_silu_inplace(ctx, emb);
        emb      = time_embed_2->forward(ctx, emb);  // [N, time_embed_dim]

        // SDXL/SVD: fold the label/conditioning embedding into the timestep embedding
        if (y != NULL) {
            auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);

            auto label_emb = label_embed_0->forward(ctx, y);
            label_emb      = ggml_silu_inplace(ctx, label_emb);
            label_emb      = label_embed_2->forward(ctx, label_emb);  // [N, time_embed_dim]

            emb = ggml_add(ctx, emb, label_emb);  // [N, time_embed_dim]
        }

        // input_blocks
        std::vector<struct ggml_tensor*> hs;  // skip-connection stack, popped by the decoder

        // input block 0
        auto h = input_blocks_0_0->forward(ctx, x);

        ggml_set_name(h, "bench-start");
        hs.push_back(h);
        // input block 1-11
        size_t len_mults    = channel_mult.size();
        int input_block_idx = 0;
        int ds              = 1;
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                h                = resblock_forward(name, ctx, h, emb, num_video_frames);  // [N, mult*model_channels, h, w]
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    h                = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
                }
                hs.push_back(h);
            }
            if (i != len_mults - 1) {
                ds *= 2;
                input_block_idx += 1;

                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                auto block       = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);

                h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
                hs.push_back(h);
            }
        }
        // [N, 4*model_channels, h/8, w/8]

        // middle_block
        h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
        h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
        h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]

        if (controls.size() > 0) {
            auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
            h       = ggml_add(ctx, h, cs);  // middle control
        }
        // remaining controls are applied to skip connections, last-to-first
        int control_offset = controls.size() - 2;

        // output_blocks
        int output_block_idx = 0;
        for (int i = (int)len_mults - 1; i >= 0; i--) {
            for (int j = 0; j < num_res_blocks + 1; j++) {
                auto h_skip = hs.back();
                hs.pop_back();

                if (controls.size() > 0) {
                    auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
                    h_skip  = ggml_add(ctx, h_skip, cs);  // control net condition
                    control_offset--;
                }

                h = ggml_concat(ctx, h, h_skip);

                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";

                h = resblock_forward(name, ctx, h, emb, num_video_frames);

                int up_sample_idx = 1;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";

                    h = attention_layer_forward(name, ctx, h, context, num_video_frames);

                    up_sample_idx++;
                }

                if (i > 0 && j == num_res_blocks) {
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
                    auto block       = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);

                    h = block->forward(ctx, h);

                    ds /= 2;
                }

                output_block_idx += 1;
            }
        }

        // out
        h = out_0->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);
        h = out_2->forward(ctx, h);
        ggml_set_name(h, "bench-end");
        return h;  // [N, out_channels, h, w]
    }
};
|
||||||
|
|
||||||
|
struct UNetModel : public GGMLModule {
|
||||||
|
SDVersion version = VERSION_1_x;
|
||||||
|
UnetModelBlock unet;
|
||||||
|
|
||||||
|
UNetModel(ggml_backend_t backend,
|
||||||
|
ggml_type wtype,
|
||||||
|
SDVersion version = VERSION_1_x)
|
||||||
|
: GGMLModule(backend, wtype), unet(version) {
|
||||||
|
unet.init(params_ctx, wtype);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() {
|
||||||
|
return "unet";
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_mem_size() {
|
||||||
|
return unet.get_params_mem_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_num() {
|
||||||
|
return unet.get_params_num();
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
|
||||||
|
unet.get_param_tensors(tensors, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
|
||||||
|
struct ggml_tensor* timesteps,
|
||||||
|
struct ggml_tensor* context,
|
||||||
|
struct ggml_tensor* c_concat = NULL,
|
||||||
|
struct ggml_tensor* y = NULL,
|
||||||
|
int num_video_frames = -1,
|
||||||
|
std::vector<struct ggml_tensor*> controls = {},
|
||||||
|
float control_strength = 0.f) {
|
||||||
|
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, UNET_GRAPH_SIZE, false);
|
||||||
|
|
||||||
|
if (num_video_frames == -1) {
|
||||||
|
num_video_frames = x->ne[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
x = to_backend(x);
|
||||||
|
context = to_backend(context);
|
||||||
|
y = to_backend(y);
|
||||||
|
timesteps = to_backend(timesteps);
|
||||||
|
|
||||||
|
for (int i = 0; i < controls.size(); i++) {
|
||||||
|
controls[i] = to_backend(controls[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* out = unet.forward(compute_ctx,
|
||||||
|
x,
|
||||||
|
timesteps,
|
||||||
|
context,
|
||||||
|
c_concat,
|
||||||
|
y,
|
||||||
|
num_video_frames,
|
||||||
|
controls,
|
||||||
|
control_strength);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run one UNet forward pass with `n_threads`; graph construction is deferred
// to a lambda so GGMLModule::compute controls when/where it is built. The
// result is delivered through `output` / `output_ctx` when provided.
void compute(int n_threads,
             struct ggml_tensor* x,
             struct ggml_tensor* timesteps,
             struct ggml_tensor* context,
             struct ggml_tensor* c_concat,
             struct ggml_tensor* y,
             int num_video_frames = -1,
             std::vector<struct ggml_tensor*> controls = {},
             float control_strength = 0.f,
             struct ggml_tensor** output = NULL,
             struct ggml_context* output_ctx = NULL) {
    // x: [N, in_channels, h, w]
    // timesteps: [N, ]
    // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
    // c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
    // y: [N, adm_in_channels] or [1, adm_in_channels]
    auto get_graph = [&]() -> struct ggml_cgraph* {
        return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
    };

    GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
}
|
||||||
|
|
||||||
|
// Smoke test: run one UNet forward pass on constant inputs and print the
// output tensor. Observed outcomes per backend are recorded in the comments
// inside the inner scope.
void test() {
    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
    params.mem_buffer = NULL;
    params.no_alloc = false;

    struct ggml_context* work_ctx = ggml_init(params);
    GGML_ASSERT(work_ctx != NULL);

    {
        // CPU, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass
        // CUDA, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass
        // CPU, num_video_frames = 3, x{num_video_frames, 8, 8, 8}: Wrong result
        // CUDA, num_video_frames = 3, x{num_video_frames, 8, 8, 8}: nan
        int num_video_frames = 3;

        // Constant-filled latent of shape (8, 8, 8, num_video_frames).
        auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 8, num_video_frames);
        std::vector<float> timesteps_vec(num_video_frames, 999.f);
        auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
        ggml_set_f32(x, 0.5f);
        // print_ggml_tensor(x);

        auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 1024, 1, num_video_frames);
        ggml_set_f32(context, 0.5f);
        // print_ggml_tensor(context);

        auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, num_video_frames);
        ggml_set_f32(y, 0.5f);
        // print_ggml_tensor(y);

        struct ggml_tensor* out = NULL;

        // NOTE(review): ggml time functions typically return int64_t; storing
        // the result in int could truncate — confirm ggml_time_ms's signature.
        int t0 = ggml_time_ms();
        compute(8, x, timesteps, context, NULL, y, num_video_frames, {}, 0.f, &out, work_ctx);
        int t1 = ggml_time_ms();

        print_ggml_tensor(out);
        LOG_DEBUG("unet test done in %dms", t1 - t0);
    }
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __UNET_HPP__
|
123
otherarch/sdcpp/upscaler.cpp
Normal file
123
otherarch/sdcpp/upscaler.cpp
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
#include "esrgan.hpp"
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#include "model.h"
|
||||||
|
#include "stable-diffusion.h"
|
||||||
|
|
||||||
|
// ESRGAN-based image upscaler running on a ggml backend (CUDA/Metal when
// compiled in, CPU otherwise).
struct UpscalerGGML {
    ggml_backend_t backend = NULL;  // general backend
    // Weight type requested when loading the ESRGAN model.
    ggml_type model_data_type = GGML_TYPE_F16;
    std::shared_ptr<ESRGAN> esrgan_upscaler;
    // NOTE(review): never assigned here; load_from_file takes a shadowing
    // parameter of the same name — confirm whether this member is needed.
    std::string esrgan_path;
    int n_threads;  // threads used when computing the upscale graph

    UpscalerGGML(int n_threads)
        : n_threads(n_threads) {
    }

    // Initialize the compute backend and load ESRGAN weights from
    // `esrgan_path`. Returns false when the model fails to load.
    bool load_from_file(const std::string& esrgan_path) {
#ifdef SD_USE_CUBLAS
        LOG_DEBUG("Using CUDA backend");
        backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
        LOG_DEBUG("Using Metal backend");
        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
        backend = ggml_backend_metal_init();
#endif

        // Fall back to CPU if no accelerated backend was created.
        if (!backend) {
            LOG_DEBUG("Using CPU backend");
            backend = ggml_backend_cpu_init();
        }
        LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
        esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_data_type);
        if (!esrgan_upscaler->load_from_file(esrgan_path)) {
            return false;
        }
        return true;
    }

    // Upscale `input_image` by the model's built-in scale factor; the
    // `upscale_factor` argument is ignored (see comment below). Returns a
    // zeroed sd_image_t ({0,0,0,NULL}) when the work context cannot be
    // allocated; otherwise the caller owns the returned pixel buffer.
    sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) {
        // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth
        sd_image_t upscaled_image = {0, 0, 0, NULL};
        int output_width = (int)input_image.width * esrgan_upscaler->scale;
        int output_height = (int)input_image.height * esrgan_upscaler->scale;
        LOG_INFO("upscaling from (%i x %i) to (%i x %i)",
                 input_image.width, input_image.height, output_width, output_height);

        // Budget: input + output float tensors plus ggml bookkeeping.
        struct ggml_init_params params;
        params.mem_size = output_width * output_height * 3 * sizeof(float) * 2;
        params.mem_size += 2 * ggml_tensor_overhead();
        params.mem_buffer = NULL;
        params.no_alloc = false;

        // draft context
        struct ggml_context* upscale_ctx = ggml_init(params);
        if (!upscale_ctx) {
            LOG_ERROR("ggml_init() failed");
            return upscaled_image;
        }
        LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
        ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1);
        sd_image_to_tensor(input_image.data, input_image_tensor);

        ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
        // Per-tile callback: run the ESRGAN graph on each tile to bound
        // compute-buffer memory (`init` is unused here).
        auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
            esrgan_upscaler->compute(n_threads, in, &out);
        };
        int64_t t0 = ggml_time_ms();
        sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);
        esrgan_upscaler->free_compute_buffer();
        // Clamp to [0, 1] before converting back to 8-bit pixels.
        ggml_tensor_clamp(upscaled, 0.f, 1.f);
        uint8_t* upscaled_data = sd_tensor_to_image(upscaled);
        ggml_free(upscale_ctx);
        int64_t t3 = ggml_time_ms();
        LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f);
        upscaled_image = {
            (uint32_t)output_width,
            (uint32_t)output_height,
            3,
            upscaled_data,
        };
        return upscaled_image;
    }
};
|
||||||
|
|
||||||
|
// Opaque public handle wrapping the ggml-backed upscaler implementation.
struct upscaler_ctx_t {
    UpscalerGGML* upscaler = NULL;
};
|
||||||
|
|
||||||
|
// Allocate an upscaler context and load the ESRGAN model at
// `esrgan_path_c_str`. Returns NULL on any failure, without leaking the
// partially-constructed context. `wtype` is currently unused (the weight
// type is fixed inside UpscalerGGML).
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                 int n_threads,
                                 enum sd_type_t wtype) {
    upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
    if (upscaler_ctx == NULL) {
        return NULL;
    }
    std::string esrgan_path(esrgan_path_c_str);

    upscaler_ctx->upscaler = new UpscalerGGML(n_threads);
    if (upscaler_ctx->upscaler == NULL) {
        // Effectively unreachable (plain `new` throws on failure), but if it
        // ever returns NULL, the context itself must not leak — the original
        // code returned here without free()ing `upscaler_ctx`.
        free(upscaler_ctx);
        return NULL;
    }

    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path)) {
        delete upscaler_ctx->upscaler;
        upscaler_ctx->upscaler = NULL;
        free(upscaler_ctx);
        return NULL;
    }
    return upscaler_ctx;
}
|
||||||
|
|
||||||
|
// Public C-style entry point: forward to the implementation's upscale().
// `upscaler_ctx` must be a non-NULL handle from new_upscaler_ctx.
sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor) {
    return upscaler_ctx->upscaler->upscale(input_image, upscale_factor);
}
|
||||||
|
|
||||||
|
// Destroy a context created by new_upscaler_ctx. Safe to call with NULL
// (mirrors free()'s contract); the original dereferenced `upscaler_ctx`
// unconditionally and crashed on a NULL handle.
void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx) {
    if (upscaler_ctx == NULL) {
        return;
    }
    if (upscaler_ctx->upscaler != NULL) {
        delete upscaler_ctx->upscaler;
        upscaler_ctx->upscaler = NULL;
    }
    free(upscaler_ctx);
}
|
463
otherarch/sdcpp/util.cpp
Normal file
463
otherarch/sdcpp/util.cpp
Normal file
|
@ -0,0 +1,463 @@
|
||||||
|
#include "util.h"
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <codecvt>
|
||||||
|
#include <fstream>
|
||||||
|
#include <locale>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <vector>
|
||||||
|
#include "preprocessing.hpp"
|
||||||
|
|
||||||
|
#if defined(__APPLE__) && defined(__MACH__)
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(_WIN32)
|
||||||
|
#include <sys/ioctl.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "ggml/ggml.h"
|
||||||
|
#include "stable-diffusion.h"
|
||||||
|
|
||||||
|
// True when `str` is at least as long as `ending` and its final characters
// equal `ending` (an empty suffix always matches).
bool ends_with(const std::string& str, const std::string& ending) {
    if (str.length() < ending.length()) {
        return false;
    }
    return std::equal(ending.rbegin(), ending.rend(), str.rbegin());
}
|
||||||
|
|
||||||
|
// True when `str` begins with `start` (an empty prefix always matches).
// Uses rfind anchored at position 0: it only tests a match at index 0,
// whereas the original find(start) == 0 scanned the entire string when the
// prefix was absent (O(n*m) for a false result).
bool starts_with(const std::string& str, const std::string& start) {
    return str.rfind(start, 0) == 0;
}
|
||||||
|
|
||||||
|
// Replace every occurrence of `target` with `replacement`, in place.
void replace_all_chars(std::string& str, char target, char replacement) {
    std::replace(str.begin(), str.end(), target, replacement);
}
|
||||||
|
|
||||||
|
// printf-style formatting into a std::string.
// Measures the needed size with a NULL-buffer vsnprintf, then formats into
// an exactly-sized buffer using a va_copy of the argument list.
// Fix: vsnprintf returns a negative value on an encoding error; the original
// then constructed std::string(buf.data(), -1), which is undefined behavior.
// We now return an empty string in that case.
std::string format(const char* fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    va_list ap2;
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    va_end(ap);
    if (size < 0) {
        va_end(ap2);
        return std::string();
    }
    std::vector<char> buf(size + 1);
    vsnprintf(buf.data(), buf.size(), fmt, ap2);
    va_end(ap2);
    return std::string(buf.data(), size);
}
|
||||||
|
|
||||||
|
#ifdef _WIN32 // code for windows
|
||||||
|
#include <windows.h>
|
||||||
|
|
||||||
|
// True when `filename` names an existing regular (non-directory) entry.
bool file_exists(const std::string& filename) {
    DWORD attributes = GetFileAttributesA(filename.c_str());
    return (attributes != INVALID_FILE_ATTRIBUTES && !(attributes & FILE_ATTRIBUTE_DIRECTORY));
}

// True when `path` names an existing directory.
bool is_directory(const std::string& path) {
    DWORD attributes = GetFileAttributesA(path.c_str());
    return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
}

// Return "dir\filename" if such an entry exists (FindFirstFile match —
// case-insensitive on Windows), else the empty string.
std::string get_full_path(const std::string& dir, const std::string& filename) {
    std::string full_path = dir + "\\" + filename;

    WIN32_FIND_DATA find_file_data;
    HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);

    if (hFind != INVALID_HANDLE_VALUE) {
        FindClose(hFind);
        return full_path;
    } else {
        return "";
    }
}
|
||||||
|
|
||||||
|
#else // Unix
|
||||||
|
#include <dirent.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
|
||||||
|
// True when `filename` exists and is a regular file (not a directory,
// socket, device, etc.).
bool file_exists(const std::string& filename) {
    struct stat info;
    if (stat(filename.c_str(), &info) != 0) {
        return false;  // missing or inaccessible
    }
    return S_ISREG(info.st_mode) != 0;
}
|
||||||
|
|
||||||
|
// True when `path` exists and is a directory.
bool is_directory(const std::string& path) {
    struct stat info;
    if (stat(path.c_str(), &info) != 0) {
        return false;  // missing or inaccessible
    }
    return S_ISDIR(info.st_mode) != 0;
}
|
||||||
|
|
||||||
|
// Scan `dir` for an entry whose name matches `filename` case-insensitively
// (strcasecmp) and return "dir/<actual name>"; empty string when the
// directory can't be opened or no entry matches. Mirrors the Windows
// version's case-insensitive lookup on case-sensitive filesystems.
std::string get_full_path(const std::string& dir, const std::string& filename) {
    DIR* dp = opendir(dir.c_str());

    if (dp != nullptr) {
        struct dirent* entry;

        while ((entry = readdir(dp)) != nullptr) {
            if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
                closedir(dp);
                return dir + "/" + entry->d_name;
            }
        }

        closedir(dp);
    }

    return "";
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// get_num_physical_cores is copy from
|
||||||
|
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
|
||||||
|
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
|
||||||
|
// Best-effort physical core count.
//   Linux: count distinct thread-sibling sets under sysfs (one per core).
//   macOS: sysctl hw.perflevel0.physicalcpu, then hw.physicalcpu.
//   Fallback: derive from hardware_concurrency() (halved above 4 to
//   approximate physical cores on SMT machines), or 4 if unknown.
int32_t get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break;  // no more cpus
        }
        std::string line;
        if (std::getline(thread_siblings, line)) {
            siblings.insert(line);
        }
    }
    if (siblings.size() > 0) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    // Prefer performance cores on Apple Silicon.
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32)
    // TODO: Implement
#endif
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
|
||||||
|
|
||||||
|
// Decode a UTF-8 byte string into UTF-32 code points.
// NOTE(review): std::wstring_convert/std::codecvt_utf8 are deprecated since
// C++17 (still functional); a replacement would need a hand-rolled decoder.
std::u32string utf8_to_utf32(const std::string& utf8_str) {
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return converter.from_bytes(utf8_str);
}

// Encode a UTF-32 code-point string back into UTF-8 bytes (inverse of
// utf8_to_utf32; same deprecation caveat applies).
std::string utf32_to_utf8(const std::u32string& utf32_str) {
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return converter.to_bytes(utf32_str);
}
|
||||||
|
|
||||||
|
// Wrap a single Unicode code point in a one-character UTF-32 string.
std::u32string unicode_value_to_utf32(int unicode_value) {
    return std::u32string(1, static_cast<char32_t>(unicode_value));
}
|
||||||
|
|
||||||
|
// Return the final path component, treating both '/' and '\\' as separators.
// Fix: the original searched for the last '/' first and only fell back to
// '\\' when no '/' existed, so a mixed path like "a/b\\c" returned "b\\c"
// instead of "c". find_last_of("/\\") takes whichever separator is last.
static std::string sd_basename(const std::string& path) {
    size_t pos = path.find_last_of("/\\");
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    return path;
}
|
||||||
|
|
||||||
|
// Join two path fragments with '/'. An empty side yields the other side
// unchanged; no separator is inserted when p1 already ends in '/' or '\\'.
std::string path_join(const std::string& p1, const std::string& p2) {
    if (p1.empty()) {
        return p2;
    }
    if (p2.empty()) {
        return p1;
    }
    char tail = p1.back();
    if (tail == '/' || tail == '\\') {
        return p1 + p2;
    }
    return p1 + "/" + p2;
}
|
||||||
|
|
||||||
|
// Draw a one-line console progress bar (50 chars wide) plus a rate.
// `time` is seconds per iteration: values > 1s print as s/it, otherwise the
// reciprocal prints as it/s. step == 0 draws nothing; the final step appends
// a newline so later output starts on a fresh line.
// Fix: the render loop hard-coded 50 instead of using max_progress, so
// changing the bar width constant would have desynchronized the two.
void pretty_progress(int step, int steps, float time) {
    if (step == 0) {
        return;
    }
    std::string progress = " |";
    int max_progress = 50;  // bar width in characters
    int32_t current = (int32_t)(step * 1.f * max_progress / steps);
    for (int i = 0; i < max_progress; i++) {
        if (i > current) {
            progress += " ";
        } else if (i == current && i != max_progress - 1) {
            progress += ">";
        } else {
            progress += "=";
        }
    }
    progress += "|";
    printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s",
           progress.c_str(), step, steps,
           time > 1.0f || time == 0 ? time : (1.0f / time));
    fflush(stdout);  // for linux: make the \r-rewritten line visible
    if (step == steps) {
        printf("\n");
    }
}
|
||||||
|
|
||||||
|
// Return a copy of `s` with leading whitespace removed.
// Fix: std::isspace has undefined behavior for values that are not
// representable as unsigned char; with a signed `char` holding e.g. UTF-8
// bytes >= 0x80, the original `int ch` lambda passed negative values.
// Taking the parameter as unsigned char makes the call well-defined.
std::string ltrim(const std::string& s) {
    auto it = std::find_if(s.begin(), s.end(), [](unsigned char ch) {
        return !std::isspace(ch);
    });
    return std::string(it, s.end());
}
|
||||||
|
|
||||||
|
// Return a copy of `s` with trailing whitespace removed.
// Fix: same std::isspace UB as ltrim — the predicate now takes
// unsigned char so negative char values are never passed through.
std::string rtrim(const std::string& s) {
    auto it = std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
        return !std::isspace(ch);
    });
    return std::string(s.begin(), it.base());
}
|
||||||
|
|
||||||
|
std::string trim(const std::string& s) {
|
||||||
|
return rtrim(ltrim(s));
|
||||||
|
}
|
||||||
|
|
||||||
|
static sd_log_cb_t sd_log_cb = NULL;
|
||||||
|
void* sd_log_cb_data = NULL;
|
||||||
|
|
||||||
|
#define LOG_BUFFER_SIZE 1024
|
||||||
|
|
||||||
|
// Format one log line as "[LEVEL] basename:line - message\n" into a shared
// static buffer and deliver it to the registered callback, if any.
// Messages longer than LOG_BUFFER_SIZE are truncated. NOTE(review): the
// static buffer makes this non-reentrant/not thread-safe — confirm callers.
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...) {
    va_list args;
    va_start(args, format);

    const char* level_str = "DEBUG";
    if (level == SD_LOG_INFO) {
        level_str = "INFO ";
    } else if (level == SD_LOG_WARN) {
        level_str = "WARN ";
    } else if (level == SD_LOG_ERROR) {
        level_str = "ERROR";
    }

    static char log_buffer[LOG_BUFFER_SIZE];

    // Prefix, then the user message, then a newline (all bounds-checked).
    int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "[%s] %s:%-4d - ", level_str, sd_basename(file).c_str(), line);

    if (written >= 0 && written < LOG_BUFFER_SIZE) {
        vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
        strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer) - 1);
    }

    if (sd_log_cb) {
        sd_log_cb(level, log_buffer, sd_log_cb_data);
    }

    va_end(args);
}
|
||||||
|
|
||||||
|
// Register the sink used by log_printf (NULL disables logging output);
// `data` is passed through to the callback unchanged.
void sd_set_log_callback(sd_log_cb_t cb, void* data) {
    sd_log_cb = cb;
    sd_log_cb_data = data;
}
|
||||||
|
|
||||||
|
// Build a human-readable summary of the CPU/SIMD features ggml was compiled
// with. Returns a pointer to a static buffer (overwritten on each call; not
// thread-safe) so the C API can hand it out without ownership transfer.
const char* sd_get_system_info() {
    static char buffer[1024];
    std::stringstream ss;
    ss << "System Info: \n";
    ss << " BLAS = " << ggml_cpu_has_blas() << std::endl;
    ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
    ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
    ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
    ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl;
    ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
    ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
    ss << " FMA = " << ggml_cpu_has_fma() << std::endl;
    ss << " NEON = " << ggml_cpu_has_neon() << std::endl;
    ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
    ss << " F16C = " << ggml_cpu_has_f16c() << std::endl;
    ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
    ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
    ss << " VSX = " << ggml_cpu_has_vsx() << std::endl;
    snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
    return buffer;
}
|
||||||
|
|
||||||
|
// Name of a public sd_type_t value. Relies on sd_type_t's enumerators
// mirroring ggml_type's values — presumably kept in sync by definition in
// stable-diffusion.h; confirm when either enum changes.
const char* sd_type_name(enum sd_type_t type) {
    return ggml_type_name((ggml_type)type);
}
|
||||||
|
|
||||||
|
// Convert an 8-bit image to a float image of the same dimensions. Pixel
// values are widened without rescaling (still in [0, 255]). The returned
// buffer is malloc'd and owned by the caller.
// NOTE(review): the malloc result is not checked; a failed allocation would
// crash in the copy loop below.
sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
    sd_image_f32_t converted_image;
    converted_image.width = image.width;
    converted_image.height = image.height;
    converted_image.channel = image.channel;

    // Allocate memory for float data
    converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float));

    for (int i = 0; i < image.width * image.height * image.channel; i++) {
        // Convert uint8_t to float
        converted_image.data[i] = (float)image.data[i];
    }

    return converted_image;
}
|
||||||
|
|
||||||
|
// Function to perform double linear interpolation
|
||||||
|
// Bilinear interpolation of the four corner samples:
// v1 = top-left, v2 = top-right, v3 = bottom-left, v4 = bottom-right;
// x_ratio / y_ratio in [0, 1] select the position inside the cell.
float interpolate(float v1, float v2, float v3, float v4, float x_ratio, float y_ratio) {
    float w_tl = (1 - x_ratio) * (1 - y_ratio);
    float w_tr = x_ratio * (1 - y_ratio);
    float w_bl = (1 - x_ratio) * y_ratio;
    float w_br = x_ratio * y_ratio;
    return v1 * w_tl + v2 * w_tr + v3 * w_bl + v4 * w_br;
}
|
||||||
|
|
||||||
|
// Bilinear resize of an interleaved-channel float image to
// target_width x target_height. Returns a newly malloc'd image owned by
// the caller.
// Fix: the second sample coordinates were unconditionally x1+1 / y1+1, which
// for the rightmost column and bottom row indexed one element past the image
// (out-of-bounds read). They are now clamped to the last valid row/column,
// which degenerates to nearest-edge sampling there.
sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height) {
    sd_image_f32_t resized_image;
    resized_image.width = target_width;
    resized_image.height = target_height;
    resized_image.channel = image.channel;

    // Allocate memory for resized float data
    resized_image.data = (float*)malloc(target_width * target_height * image.channel * sizeof(float));

    for (int y = 0; y < target_height; y++) {
        for (int x = 0; x < target_width; x++) {
            // Source-space position of this destination pixel.
            float original_x = (float)x * image.width / target_width;
            float original_y = (float)y * image.height / target_height;

            int x1 = (int)original_x;
            int y1 = (int)original_y;
            int x2 = std::min(x1 + 1, (int)image.width - 1);   // clamp: no OOB at right edge
            int y2 = std::min(y1 + 1, (int)image.height - 1);  // clamp: no OOB at bottom edge

            for (int k = 0; k < image.channel; k++) {
                float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
                float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
                float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
                float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);

                float x_ratio = original_x - x1;
                float y_ratio = original_y - y1;

                float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);

                *(resized_image.data + y * target_width * image.channel + x * image.channel + k) = value;
            }
        }
    }

    return resized_image;
}
|
||||||
|
|
||||||
|
// Standardize each channel in place: data[..] = (data[..] - mean[k]) / std[k].
// `means` and `stds` hold one value per channel (3 channels expected).
void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) {
    size_t num_pixels = (size_t)image.width * image.height;
    for (size_t px = 0; px < num_pixels; px++) {
        for (uint32_t k = 0; k < image.channel; k++) {
            size_t index = px * image.channel + k;
            image.data[index] = (image.data[index] - means[k]) / stds[k];
        }
    }
}
|
||||||
|
|
||||||
|
// Constants for means and std
|
||||||
|
float means[3] = {0.48145466, 0.4578275, 0.40821073};
|
||||||
|
float stds[3] = {0.26862954, 0.26130258, 0.27577711};
|
||||||
|
|
||||||
|
// Function to clip and preprocess sd_image_f32_t
|
||||||
|
// CLIP-style preprocessing: scale so the short side equals `size`, bilinear
// resize, center-crop to size x size with values clipped to [0,255] and
// rescaled to [0,1], then per-channel normalization with the global
// `means`/`stds`. The returned buffer is malloc'd and owned by the caller.
// Fix: like resize_sd_image_f32_t, the second bilinear sample used x1+1 /
// y1+1 unconditionally, reading one element past the image at the right and
// bottom edges (out-of-bounds); coordinates are now clamped.
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
    float scale = (float)size / fmin(image.width, image.height);

    // Interpolation
    int new_width = (int)(scale * image.width);
    int new_height = (int)(scale * image.height);
    float* resized_data = (float*)malloc(new_width * new_height * image.channel * sizeof(float));

    for (int y = 0; y < new_height; y++) {
        for (int x = 0; x < new_width; x++) {
            float original_x = (float)x * image.width / new_width;
            float original_y = (float)y * image.height / new_height;

            int x1 = (int)original_x;
            int y1 = (int)original_y;
            int x2 = std::min(x1 + 1, (int)image.width - 1);   // clamp: no OOB at right edge
            int y2 = std::min(y1 + 1, (int)image.height - 1);  // clamp: no OOB at bottom edge

            for (int k = 0; k < image.channel; k++) {
                float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
                float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
                float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
                float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);

                float x_ratio = original_x - x1;
                float y_ratio = original_y - y1;

                float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);

                *(resized_data + y * new_width * image.channel + x * image.channel + k) = value;
            }
        }
    }

    // Clip and preprocess: offsets of the centered size x size crop.
    // NOTE(review): float rounding could in principle make new_width/height
    // equal size - 1, giving negative offsets — confirm against callers.
    int h = (new_height - size) / 2;
    int w = (new_width - size) / 2;

    sd_image_f32_t result;
    result.width = size;
    result.height = size;
    result.channel = image.channel;
    result.data = (float*)malloc(size * size * image.channel * sizeof(float));

    for (int k = 0; k < image.channel; k++) {
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                // Clamp to [0, 255] then rescale to [0, 1].
                *(result.data + i * size * image.channel + j * image.channel + k) =
                    fmin(fmax(*(resized_data + (i + h) * new_width * image.channel + (j + w) * image.channel + k), 0.0f), 255.0f) / 255.0f;
            }
        }
    }

    // Free allocated memory
    free(resized_data);

    // Normalize with the CLIP channel statistics (file-level means/stds).
    for (int k = 0; k < image.channel; k++) {
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                int offset = i * size * image.channel + j * image.channel + k;
                float value = *(result.data + offset);
                value = (value - means[k]) / stds[k];
                *(result.data + offset) = value;
            }
        }
    }

    return result;
}
|
51
otherarch/sdcpp/util.h
Normal file
51
otherarch/sdcpp/util.h
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
#ifndef __UTIL_H__
#define __UTIL_H__

#include <cstdint>
#include <string>

#include "stable-diffusion.h"

// ---- string predicates -----------------------------------------------------
bool ends_with(const std::string& str, const std::string& ending);
bool starts_with(const std::string& str, const std::string& start);

// printf-style formatting into a std::string.
std::string format(const char* fmt, ...);

void replace_all_chars(std::string& str, char target, char replacement);

// ---- filesystem helpers (Windows and POSIX implementations) ----------------
bool file_exists(const std::string& filename);
bool is_directory(const std::string& path);
// Case-insensitive lookup of `filename` inside `dir`; "" when not found.
std::string get_full_path(const std::string& dir, const std::string& filename);

// ---- Unicode conversions ---------------------------------------------------
std::u32string utf8_to_utf32(const std::string& utf8_str);
std::string utf32_to_utf8(const std::u32string& utf32_str);
std::u32string unicode_value_to_utf32(int unicode_value);

// Float image with interleaved channels (values typically [0,255] or [0,1]
// depending on the producing function); `data` is malloc'd, caller-owned.
typedef struct {
    uint32_t width;
    uint32_t height;
    uint32_t channel;
    float* data;
} sd_image_f32_t;

void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]);

sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image);

sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height);

// Resize + center-crop + normalize for CLIP image encoders.
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size);

std::string path_join(const std::string& p1, const std::string& p2);

// Console progress bar; `time` is seconds per iteration.
void pretty_progress(int step, int steps, float time);

void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);

std::string trim(const std::string& s);

// Logging macros routing through log_printf with call-site file/line.
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
#endif  // __UTIL_H__
|
613
otherarch/sdcpp/vae.hpp
Normal file
613
otherarch/sdcpp/vae.hpp
Normal file
|
@ -0,0 +1,613 @@
|
||||||
|
#ifndef __VAE_HPP__
|
||||||
|
#define __VAE_HPP__
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
/*================================================== AutoEncoderKL ===================================================*/
|
||||||
|
|
||||||
|
#define VAE_GRAPH_SIZE 10240
|
||||||
|
|
||||||
|
// VAE residual block: two (GroupNorm -> swish -> 3x3 conv) stages with a
// skip connection; a 1x1 "nin_shortcut" conv projects the input when the
// channel count changes. Timestep-embedding conditioning is unused here.
class ResnetBlock : public UnaryBlock {
protected:
    int64_t in_channels;
    int64_t out_channels;

public:
    ResnetBlock(int64_t in_channels,
                int64_t out_channels)
        : in_channels(in_channels),
          out_channels(out_channels) {
        // temb_channels is always 0
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));

        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));

        if (out_channels != in_channels) {
            // 1x1 conv to match channels on the skip path.
            blocks["nin_shortcut"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {1, 1}));
        }
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]
        // t_emb is always None
        auto norm1 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm1"]);
        auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
        auto norm2 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm2"]);
        auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);

        auto h = x;
        h = norm1->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);  // swish
        h = conv1->forward(ctx, h);
        // return h;

        h = norm2->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);  // swish
        // dropout, skip for inference
        h = conv2->forward(ctx, h);

        // skip connection
        if (out_channels != in_channels) {
            auto nin_shortcut = std::dynamic_pointer_cast<Conv2d>(blocks["nin_shortcut"]);

            x = nin_shortcut->forward(ctx, x);  // [N, out_channels, h, w]
        }

        h = ggml_add(ctx, h, x);
        return h;  // [N, out_channels, h, w]
    }
};
|
||||||
|
|
||||||
|
// VAE spatial self-attention block: 1x1 convs produce q/k/v, attention runs
// over the h*w spatial positions, and a 1x1 projection plus residual add
// produce the output. Shapes are noted per line in ggml's reversed-ne order.
class AttnBlock : public UnaryBlock {
protected:
    int64_t in_channels;

public:
    AttnBlock(int64_t in_channels)
        : in_channels(in_channels) {
        blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
        blocks["q"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
        blocks["k"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
        blocks["v"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));

        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]
        auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
        auto q_proj = std::dynamic_pointer_cast<Conv2d>(blocks["q"]);
        auto k_proj = std::dynamic_pointer_cast<Conv2d>(blocks["k"]);
        auto v_proj = std::dynamic_pointer_cast<Conv2d>(blocks["v"]);
        auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);

        auto h_ = norm->forward(ctx, x);

        // ggml stores dims reversed: ne = {w, h, c, n}.
        const int64_t n = h_->ne[3];
        const int64_t c = h_->ne[2];
        const int64_t h = h_->ne[1];
        const int64_t w = h_->ne[0];

        auto q = q_proj->forward(ctx, h_);                      // [N, in_channels, h, w]
        q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3));  // [N, h, w, in_channels]
        q = ggml_reshape_3d(ctx, q, c, h * w, n);              // [N, h * w, in_channels]

        auto k = k_proj->forward(ctx, h_);                      // [N, in_channels, h, w]
        k = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3));  // [N, h, w, in_channels]
        k = ggml_reshape_3d(ctx, k, c, h * w, n);              // [N, h * w, in_channels]

        auto v = v_proj->forward(ctx, h_);          // [N, in_channels, h, w]
        v = ggml_reshape_3d(ctx, v, h * w, c, n);  // [N, in_channels, h * w]

        h_ = ggml_nn_attention(ctx, q, k, v, false);  // [N, h * w, in_channels]

        h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3));  // [N, in_channels, h * w]
        h_ = ggml_reshape_4d(ctx, h_, w, h, c, n);               // [N, in_channels, h, w]

        h_ = proj_out->forward(ctx, h_);  // [N, in_channels, h, w]

        // residual connection
        h_ = ggml_add(ctx, h_, x);
        return h_;
    }
};
|
||||||
|
|
||||||
|
// Spatial 2D conv followed by a temporal nx1x1 conv over the frame axis,
// used by the video VAE decoder (SVD). Extends Conv2d with a
// "time_mix_conv" that mixes information across video frames.
class AE3DConv : public Conv2d {
public:
    AE3DConv(int64_t in_channels,
             int64_t out_channels,
             std::pair<int, int> kernel_size,
             int64_t video_kernel_size       = 3,
             std::pair<int, int> stride      = {1, 1},
             std::pair<int, int> padding     = {0, 0},
             std::pair<int, int> dilation    = {1, 1},
             bool bias                       = true)
        : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
        // "same" padding along the temporal axis for odd video_kernel_size.
        int64_t kernel_padding  = video_kernel_size / 2;
        blocks["time_mix_conv"] = std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(out_channels,
                                                                             out_channels,
                                                                             video_kernel_size,
                                                                             1,
                                                                             kernel_padding));
    }

    // x: [N, IC, IH, IW]  ->  result: [N, OC, OH, OW]
    // (N is interpreted as batch * time for the temporal mixing step.)
    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x) {
        // timesteps always None
        // skip_video always False
        // x: [N, IC, IH, IW]
        // result: [N, OC, OH, OW]
        auto time_mix_conv = std::dynamic_pointer_cast<Conv3dnx1x1>(blocks["time_mix_conv"]);

        // Plain spatial convolution first.
        x = Conv2d::forward(ctx, x);
        // Python reference:
        // timesteps = x.shape[0]
        // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
        // x = conv3d(x)
        // return rearrange(x, "b c t h w -> (b t) c h w")
        // NOTE(review): T is taken as the whole leading dim, so B is always 1
        // here — i.e. the entire batch is treated as one video clip. Confirm
        // this matches how callers pack frames before changing it.
        int64_t T = x->ne[3];
        int64_t B = x->ne[3] / T;
        int64_t C = x->ne[2];
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

        // Fold the spatial dims and move time next to them so the nx1x1 conv
        // slides along the frame axis.
        x = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        x = time_mix_conv->forward(ctx, x);                    // [B, OC, T, OH * OW]
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w
        return x;  // [B*T, OC, OH, OW]
    }
};
|
||||||
|
|
||||||
|
// ResnetBlock variant for the video VAE decoder: runs the plain spatial
// ResnetBlock, then a temporal ResBlock ("time_stack"), and blends the two
// with a learned sigmoid-gated mix factor (merge_strategy = "learned").
class VideoResnetBlock : public ResnetBlock {
protected:
    // Registers the learnable scalar mix factor.
    // NOTE(review): this overrides init_params without visibly invoking the
    // parent's version — confirm the base class params are still initialized
    // elsewhere.
    void init_params(struct ggml_context* ctx, ggml_type wtype) {
        // Stored as F32 regardless of wtype: it is a single scalar gate.
        params["mix_factor"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    }

    // Reads the scalar mix factor from the backend and squashes it to (0, 1).
    float get_alpha() {
        float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
        return sigmoid(alpha);
    }

public:
    VideoResnetBlock(int64_t in_channels,
                     int64_t out_channels,
                     int video_kernel_size = 3)
        : ResnetBlock(in_channels, out_channels) {
        // merge_strategy is always learned
        // Temporal ResBlock: kernel {video_kernel_size, 1}, dims=3, no
        // up/down sampling, operating along the frame axis.
        blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w]
        // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w]
        // t_emb is always None
        // skip_video is always False
        // timesteps is always None
        auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);

        // Spatial path first.
        x = ResnetBlock::forward(ctx, x);  // [N, out_channels, h, w]

        // NOTE(review): as in AE3DConv, T spans the whole leading dim so B is
        // always 1 — the batch is treated as a single clip.
        int64_t T = x->ne[3];
        int64_t B = x->ne[3] / T;
        int64_t C = x->ne[2];
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

        // Rearrange so the temporal ResBlock sees time as a spatial-like axis.
        x = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        auto x_mix = x;  // spatial-only result, kept for the blend below

        x = time_stack->forward(ctx, x);  // b t c (h w)

        // Learned convex blend: alpha * temporal + (1 - alpha) * spatial.
        float alpha = get_alpha();
        x = ggml_add(ctx,
                     ggml_scale(ctx, x, alpha),
                     ggml_scale(ctx, x_mix, 1.0f - alpha));

        // Restore the (b t) c h w layout.
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w

        return x;
    }
};
|
||||||
|
|
||||||
|
// ldm.modules.diffusionmodules.model.Encoder
// VAE encoder: conv_in -> per-resolution ResnetBlocks with downsampling ->
// mid (resnet / attention / resnet) -> norm + swish + conv_out.
// Block names ("down.i.block.j", "mid.block_1", ...) mirror the ldm
// checkpoint layout so weights map 1:1; the lookup strings in forward()
// must stay in sync with the constructor.
class Encoder : public GGMLBlock {
protected:
    int ch                   = 128;
    std::vector<int> ch_mult = {1, 2, 4, 4};  // channel multiplier per resolution level
    int num_res_blocks       = 2;
    int in_channels          = 3;
    int z_channels           = 4;
    bool double_z            = true;  // emit 2*z_channels (mean + logvar) when true

public:
    Encoder(int ch,
            std::vector<int> ch_mult,
            int num_res_blocks,
            int in_channels,
            int z_channels,
            bool double_z = true)
        : ch(ch),
          ch_mult(ch_mult),
          num_res_blocks(num_res_blocks),
          in_channels(in_channels),
          z_channels(z_channels),
          double_z(double_z) {
        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}));

        size_t num_resolutions = ch_mult.size();

        // Downsampling trunk: num_res_blocks ResnetBlocks per level, then a
        // DownSampleBlock between levels (none after the last level).
        int block_in = 1;
        for (int i = 0; i < num_resolutions; i++) {
            if (i == 0) {
                block_in = ch;
            } else {
                block_in = ch * ch_mult[i - 1];
            }
            int block_out = ch * ch_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
                blocks[name]     = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_out));
                block_in         = block_out;
            }
            if (i != num_resolutions - 1) {
                std::string name = "down." + std::to_string(i) + ".downsample";
                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(block_in, block_in, true));
            }
        }

        // Middle stack at the lowest resolution.
        blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
        blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));

        blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
        blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
    }

    // x: [N, in_channels, h, w]
    // returns: [N, z_channels*2, h/2^(levels-1), w/2^(levels-1)] when
    // double_z, else z_channels.
    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]

        auto conv_in     = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
        auto mid_block_1 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_1"]);
        auto mid_attn_1  = std::dynamic_pointer_cast<AttnBlock>(blocks["mid.attn_1"]);
        auto mid_block_2 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_2"]);
        auto norm_out    = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm_out"]);
        auto conv_out    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);

        auto h = conv_in->forward(ctx, x);  // [N, ch, h, w]

        // downsampling
        size_t num_resolutions = ch_mult.size();
        for (int i = 0; i < num_resolutions; i++) {
            for (int j = 0; j < num_res_blocks; j++) {
                std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
                auto down_block  = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);

                h = down_block->forward(ctx, h);
            }
            if (i != num_resolutions - 1) {
                std::string name = "down." + std::to_string(i) + ".downsample";
                auto down_sample = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);

                h = down_sample->forward(ctx, h);
            }
        }

        // middle
        h = mid_block_1->forward(ctx, h);
        h = mid_attn_1->forward(ctx, h);
        h = mid_block_2->forward(ctx, h);  // [N, block_in, h, w]

        // end
        h = norm_out->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);  // nonlinearity/swish
        h = conv_out->forward(ctx, h);  // [N, z_channels*2, h, w]
        return h;
    }
};
|
||||||
|
|
||||||
|
// ldm.modules.diffusionmodules.model.Decoder
// VAE decoder: conv_in -> mid (resnet / attention / resnet) -> per-resolution
// ResnetBlocks with upsampling -> norm + swish + conv_out.
// When video_decoder is true, the resnet blocks become VideoResnetBlocks and
// conv_out becomes an AE3DConv (SVD "conv-only" time mode).
class Decoder : public GGMLBlock {
protected:
    int ch                   = 128;
    int out_ch               = 3;
    std::vector<int> ch_mult = {1, 2, 4, 4};
    int num_res_blocks       = 2;
    int z_channels           = 4;
    bool video_decoder       = false;
    int video_kernel_size    = 3;

    // Factory for the final conv: AE3DConv for video, plain Conv2d otherwise.
    virtual std::shared_ptr<GGMLBlock> get_conv_out(int64_t in_channels,
                                                    int64_t out_channels,
                                                    std::pair<int, int> kernel_size,
                                                    std::pair<int, int> stride  = {1, 1},
                                                    std::pair<int, int> padding = {0, 0}) {
        if (video_decoder) {
            return std::shared_ptr<GGMLBlock>(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding));
        } else {
            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, stride, padding));
        }
    }

    // Factory for resnet blocks: VideoResnetBlock for video, else ResnetBlock.
    virtual std::shared_ptr<GGMLBlock> get_resnet_block(int64_t in_channels,
                                                        int64_t out_channels) {
        if (video_decoder) {
            return std::shared_ptr<GGMLBlock>(new VideoResnetBlock(in_channels, out_channels, video_kernel_size));
        } else {
            return std::shared_ptr<GGMLBlock>(new ResnetBlock(in_channels, out_channels));
        }
    }

public:
    Decoder(int ch,
            int out_ch,
            std::vector<int> ch_mult,
            int num_res_blocks,
            int z_channels,
            bool video_decoder    = false,
            int video_kernel_size = 3)
        : ch(ch),
          out_ch(out_ch),
          ch_mult(ch_mult),
          num_res_blocks(num_res_blocks),
          z_channels(z_channels),
          video_decoder(video_decoder),
          video_kernel_size(video_kernel_size) {
        size_t num_resolutions = ch_mult.size();
        // Decoding starts at the widest channel count (mirror of the encoder).
        int block_in           = ch * ch_mult[num_resolutions - 1];

        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));

        blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
        blocks["mid.block_2"] = get_resnet_block(block_in, block_in);

        // Upsampling trunk, built from the deepest level up. Note the decoder
        // uses num_res_blocks + 1 blocks per level (one more than the encoder).
        for (int i = num_resolutions - 1; i >= 0; i--) {
            int mult      = ch_mult[i];
            int block_out = ch * mult;
            for (int j = 0; j < num_res_blocks + 1; j++) {
                std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
                blocks[name]     = get_resnet_block(block_in, block_out);

                block_in = block_out;
            }
            if (i != 0) {
                std::string name = "up." + std::to_string(i) + ".upsample";
                blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(block_in, block_in));
            }
        }

        blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
        blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
    }

    // z: [N, z_channels, h, w] -> [N, out_ch, h*8, w*8] (for the default
    // 4-level ch_mult).
    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
        // z: [N, z_channels, h, w]
        // alpha is always 0
        // merge_strategy is always learned
        // time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock
        // AttnVideoBlock will not be used
        // Casts to the base types are fine here: VideoResnetBlock derives
        // from ResnetBlock and AE3DConv from Conv2d, and forward() is the
        // only member used.
        auto conv_in     = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
        auto mid_block_1 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_1"]);
        auto mid_attn_1  = std::dynamic_pointer_cast<AttnBlock>(blocks["mid.attn_1"]);
        auto mid_block_2 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_2"]);
        auto norm_out    = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm_out"]);
        auto conv_out    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);

        // conv_in
        auto h = conv_in->forward(ctx, z);  // [N, block_in, h, w]

        // middle
        h = mid_block_1->forward(ctx, h);

        h = mid_attn_1->forward(ctx, h);
        h = mid_block_2->forward(ctx, h);  // [N, block_in, h, w]

        // upsampling
        size_t num_resolutions = ch_mult.size();
        for (int i = num_resolutions - 1; i >= 0; i--) {
            for (int j = 0; j < num_res_blocks + 1; j++) {
                std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
                auto up_block    = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);

                h = up_block->forward(ctx, h);
            }
            if (i != 0) {
                std::string name = "up." + std::to_string(i) + ".upsample";
                auto up_sample   = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);

                h = up_sample->forward(ctx, h);
            }
        }

        h = norm_out->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);  // nonlinearity/swish
        h = conv_out->forward(ctx, h);  // [N, out_ch, h*8, w*8]
        return h;
    }
};
|
||||||
|
|
||||||
|
// ldm.models.autoencoder.AutoencoderKL
|
||||||
|
// ldm.models.autoencoder.AutoencoderKL
// Bundles the Encoder/Decoder with the 1x1 quant/post-quant convolutions.
// decode_only skips building the encoder; use_video_decoder selects the SVD
// video decoder and drops the quant convs entirely.
class AutoencodingEngine : public GGMLBlock {
protected:
    bool decode_only       = true;
    bool use_video_decoder = false;
    int embed_dim          = 4;
    // Mirrors the ldm "ddconfig" dict; resolution/in_channels are kept for
    // completeness even though forward paths here only consume a subset.
    struct {
        int z_channels           = 4;
        int resolution           = 256;
        int in_channels          = 3;
        int out_ch               = 3;
        int ch                   = 128;
        std::vector<int> ch_mult = {1, 2, 4, 4};
        int num_res_blocks       = 2;
        bool double_z            = true;
    } dd_config;

public:
    AutoencodingEngine(bool decode_only       = true,
                       bool use_video_decoder = false)
        : decode_only(decode_only), use_video_decoder(use_video_decoder) {
        blocks["decoder"] = std::shared_ptr<GGMLBlock>(new Decoder(dd_config.ch,
                                                                   dd_config.out_ch,
                                                                   dd_config.ch_mult,
                                                                   dd_config.num_res_blocks,
                                                                   dd_config.z_channels,
                                                                   use_video_decoder));
        if (!use_video_decoder) {
            // NOTE(review): ldm defines post_quant_conv as Conv2d(embed_dim,
            // z_channels); the arguments here look swapped. It is harmless
            // only because embed_dim == z_channels == 4 — confirm before
            // changing either constant.
            blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
                                                                              embed_dim,
                                                                              {1, 1}));
        }
        if (!decode_only) {
            blocks["encoder"] = std::shared_ptr<GGMLBlock>(new Encoder(dd_config.ch,
                                                                       dd_config.ch_mult,
                                                                       dd_config.num_res_blocks,
                                                                       dd_config.in_channels,
                                                                       dd_config.z_channels,
                                                                       dd_config.double_z));
            if (!use_video_decoder) {
                // double_z doubles the encoder output (mean + logvar).
                int factor = dd_config.double_z ? 2 : 1;

                // NOTE(review): same apparent in/out swap as post_quant_conv
                // (ldm uses Conv2d(2*z_channels, 2*embed_dim)); harmless while
                // embed_dim == z_channels.
                blocks["quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(embed_dim * factor,
                                                                             dd_config.z_channels * factor,
                                                                             {1, 1}));
            }
        }
    }

    // z: [N, z_channels, h, w] -> decoded image tensor.
    // "bench-start"/"bench-end" tensor names bracket the decoder for timing.
    struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
        // z: [N, z_channels, h, w]
        if (!use_video_decoder) {
            auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
            z = post_quant_conv->forward(ctx, z);  // [N, z_channels, h, w]
        }
        auto decoder = std::dynamic_pointer_cast<Decoder>(blocks["decoder"]);

        ggml_set_name(z, "bench-start");
        auto h = decoder->forward(ctx, z);
        ggml_set_name(h, "bench-end");
        return h;
    }

    // x: [N, in_channels, h, w] -> latent moments [N, 2*embed_dim, h/8, w/8].
    struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]
        auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);

        auto h = encoder->forward(ctx, x);  // [N, 2*z_channels, h/8, w/8]
        if (!use_video_decoder) {
            auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
            h = quant_conv->forward(ctx, h);  // [N, 2*embed_dim, h/8, w/8]
        }
        return h;
    }
};
|
||||||
|
|
||||||
|
struct AutoEncoderKL : public GGMLModule {
|
||||||
|
bool decode_only = true;
|
||||||
|
AutoencodingEngine ae;
|
||||||
|
|
||||||
|
AutoEncoderKL(ggml_backend_t backend,
|
||||||
|
ggml_type wtype,
|
||||||
|
bool decode_only = false,
|
||||||
|
bool use_video_decoder = false)
|
||||||
|
: decode_only(decode_only), ae(decode_only, use_video_decoder), GGMLModule(backend, wtype) {
|
||||||
|
ae.init(params_ctx, wtype);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() {
|
||||||
|
return "vae";
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_mem_size() {
|
||||||
|
return ae.get_params_mem_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_num() {
|
||||||
|
return ae.get_params_num();
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
|
||||||
|
ae.get_param_tensors(tensors, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
|
||||||
|
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
||||||
|
|
||||||
|
z = to_backend(z);
|
||||||
|
|
||||||
|
struct ggml_tensor* out = decode_graph ? ae.decode(compute_ctx, z) : ae.encode(compute_ctx, z);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute(const int n_threads,
|
||||||
|
struct ggml_tensor* z,
|
||||||
|
bool decode_graph,
|
||||||
|
struct ggml_tensor** output,
|
||||||
|
struct ggml_context* output_ctx = NULL) {
|
||||||
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
|
return build_graph(z, decode_graph);
|
||||||
|
};
|
||||||
|
// ggml_set_f32(z, 0.5f);
|
||||||
|
// print_ggml_tensor(z);
|
||||||
|
GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
void test() {
|
||||||
|
struct ggml_init_params params;
|
||||||
|
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
|
||||||
|
params.mem_buffer = NULL;
|
||||||
|
params.no_alloc = false;
|
||||||
|
|
||||||
|
struct ggml_context* work_ctx = ggml_init(params);
|
||||||
|
GGML_ASSERT(work_ctx != NULL);
|
||||||
|
|
||||||
|
{
|
||||||
|
// CPU, x{1, 3, 64, 64}: Pass
|
||||||
|
// CUDA, x{1, 3, 64, 64}: Pass, but sill get wrong result for some image, may be due to interlnal nan
|
||||||
|
// CPU, x{2, 3, 64, 64}: Wrong result
|
||||||
|
// CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result
|
||||||
|
auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2);
|
||||||
|
ggml_set_f32(x, 0.5f);
|
||||||
|
print_ggml_tensor(x);
|
||||||
|
struct ggml_tensor* out = NULL;
|
||||||
|
|
||||||
|
int t0 = ggml_time_ms();
|
||||||
|
compute(8, x, false, &out, work_ctx);
|
||||||
|
int t1 = ggml_time_ms();
|
||||||
|
|
||||||
|
print_ggml_tensor(out);
|
||||||
|
LOG_DEBUG("encode test done in %dms", t1 - t0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (false) {
|
||||||
|
// CPU, z{1, 4, 8, 8}: Pass
|
||||||
|
// CUDA, z{1, 4, 8, 8}: Pass
|
||||||
|
// CPU, z{3, 4, 8, 8}: Wrong result
|
||||||
|
// CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result
|
||||||
|
auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
|
||||||
|
ggml_set_f32(z, 0.5f);
|
||||||
|
print_ggml_tensor(z);
|
||||||
|
struct ggml_tensor* out = NULL;
|
||||||
|
|
||||||
|
int t0 = ggml_time_ms();
|
||||||
|
compute(8, z, true, &out, work_ctx);
|
||||||
|
int t1 = ggml_time_ms();
|
||||||
|
|
||||||
|
print_ggml_tensor(out);
|
||||||
|
LOG_DEBUG("decode test done in %dms", t1 - t0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
524621
otherarch/sdcpp/vocab.hpp
Normal file
524621
otherarch/sdcpp/vocab.hpp
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue