mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
initial files from sdcpp (not working)
This commit is contained in:
parent
ad638285de
commit
26696970ce
33 changed files with 582497 additions and 4 deletions
8
Makefile
8
Makefile
|
@ -39,8 +39,8 @@ endif
|
||||||
#
|
#
|
||||||
|
|
||||||
# keep standard at C11 and C++11
|
# keep standard at C11 and C++11
|
||||||
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
||||||
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
|
FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
|
||||||
|
|
||||||
|
@ -496,12 +496,14 @@ gpttype_adapter_vulkan_noavx2.o: $(GPTTYPE_ADAPTER)
|
||||||
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
|
rm -vf *.o main sdmain quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf imatrix imatrix.exe gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_clblast_noavx2.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_vulkan.dll koboldcpp_vulkan_noavx2.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_clblast_noavx2.so koboldcpp_cublas.so koboldcpp_hipblas.so koboldcpp_vulkan.so koboldcpp_vulkan_noavx2.so
|
||||||
|
|
||||||
# useful tools
|
# useful tools
|
||||||
main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
@echo '==== Run ./main -h for help. ===='
|
@echo '==== Run ./main -h for help. ===='
|
||||||
|
sdmain: otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
imatrix: examples/imatrix/imatrix.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
imatrix: examples/imatrix/imatrix.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
|
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||||
|
|
2
ggml.h
2
ggml.h
|
@ -229,7 +229,7 @@
|
||||||
#define GGML_MAX_CONTEXTS 64
|
#define GGML_MAX_CONTEXTS 64
|
||||||
#define GGML_MAX_SRC 10
|
#define GGML_MAX_SRC 10
|
||||||
#ifndef GGML_MAX_NAME
|
#ifndef GGML_MAX_NAME
|
||||||
#define GGML_MAX_NAME 64
|
#define GGML_MAX_NAME 128
|
||||||
#endif
|
#endif
|
||||||
#define GGML_MAX_OP_PARAMS 64
|
#define GGML_MAX_OP_PARAMS 64
|
||||||
#define GGML_DEFAULT_N_THREADS 4
|
#define GGML_DEFAULT_N_THREADS 4
|
||||||
|
|
95
otherarch/sdcpp/CMakeLists.txt
Normal file
95
otherarch/sdcpp/CMakeLists.txt
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
cmake_minimum_required(VERSION 3.12)
|
||||||
|
project("stable-diffusion")
|
||||||
|
|
||||||
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
|
||||||
|
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||||
|
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
||||||
|
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
|
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
|
|
||||||
|
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||||
|
set(SD_STANDALONE ON)
|
||||||
|
else()
|
||||||
|
set(SD_STANDALONE OFF)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Option list
|
||||||
|
#
|
||||||
|
|
||||||
|
# general
|
||||||
|
option(SD_CUBLAS "sd: cuda backend" OFF)
|
||||||
|
option(SD_HIPBLAS "sd: rocm backend" OFF)
|
||||||
|
option(SD_METAL "sd: metal backend" OFF)
|
||||||
|
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
|
||||||
|
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
|
||||||
|
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
|
||||||
|
|
||||||
|
if(SD_CUBLAS)
|
||||||
|
message("Use CUBLAS as backend stable-diffusion")
|
||||||
|
set(GGML_CUBLAS ON)
|
||||||
|
add_definitions(-DSD_USE_CUBLAS)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if(SD_METAL)
|
||||||
|
message("Use Metal as backend stable-diffusion")
|
||||||
|
set(GGML_METAL ON)
|
||||||
|
add_definitions(-DSD_USE_METAL)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (SD_HIPBLAS)
|
||||||
|
message("Use HIPBLAS as backend stable-diffusion")
|
||||||
|
set(GGML_HIPBLAS ON)
|
||||||
|
add_definitions(-DSD_USE_CUBLAS)
|
||||||
|
if(SD_FAST_SOFTMAX)
|
||||||
|
set(GGML_CUDA_FAST_SOFTMAX ON)
|
||||||
|
endif()
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if(SD_FLASH_ATTN)
|
||||||
|
message("Use Flash Attention for memory optimization")
|
||||||
|
add_definitions(-DSD_USE_FLASH_ATTENTION)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set(SD_LIB stable-diffusion)
|
||||||
|
|
||||||
|
file(GLOB SD_LIB_SOURCES
|
||||||
|
"*.h"
|
||||||
|
"*.cpp"
|
||||||
|
"*.hpp"
|
||||||
|
)
|
||||||
|
|
||||||
|
# we can get only one share lib
|
||||||
|
if(SD_BUILD_SHARED_LIBS)
|
||||||
|
message("Build shared library")
|
||||||
|
set(BUILD_SHARED_LIBS OFF)
|
||||||
|
message(${SD_LIB_SOURCES})
|
||||||
|
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
|
||||||
|
add_definitions(-DSD_BUILD_SHARED_LIB)
|
||||||
|
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
|
||||||
|
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
|
||||||
|
else()
|
||||||
|
message("Build static library")
|
||||||
|
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||||
|
|
||||||
|
|
||||||
|
# deps
|
||||||
|
add_subdirectory(ggml)
|
||||||
|
|
||||||
|
add_subdirectory(thirdparty)
|
||||||
|
|
||||||
|
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
|
||||||
|
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
|
||||||
|
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
|
||||||
|
|
||||||
|
|
||||||
|
add_subdirectory(examples)
|
||||||
|
|
21
otherarch/sdcpp/SDCPP_LICENSE
Normal file
21
otherarch/sdcpp/SDCPP_LICENSE
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2023 leejet
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
1139
otherarch/sdcpp/clip.hpp
Normal file
1139
otherarch/sdcpp/clip.hpp
Normal file
File diff suppressed because it is too large
Load diff
529
otherarch/sdcpp/common.hpp
Normal file
529
otherarch/sdcpp/common.hpp
Normal file
|
@ -0,0 +1,529 @@
|
||||||
|
#ifndef __COMMON_HPP__
|
||||||
|
#define __COMMON_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
class DownSampleBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int channels;
|
||||||
|
int out_channels;
|
||||||
|
bool vae_downsample;
|
||||||
|
|
||||||
|
public:
|
||||||
|
DownSampleBlock(int channels,
|
||||||
|
int out_channels,
|
||||||
|
bool vae_downsample = false)
|
||||||
|
: channels(channels),
|
||||||
|
out_channels(out_channels),
|
||||||
|
vae_downsample(vae_downsample) {
|
||||||
|
if (vae_downsample) {
|
||||||
|
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}));
|
||||||
|
} else {
|
||||||
|
blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [N, channels, h, w]
|
||||||
|
if (vae_downsample) {
|
||||||
|
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
||||||
|
|
||||||
|
x = ggml_pad(ctx, x, 1, 1, 0, 0);
|
||||||
|
x = conv->forward(ctx, x);
|
||||||
|
} else {
|
||||||
|
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
|
||||||
|
|
||||||
|
x = conv->forward(ctx, x);
|
||||||
|
}
|
||||||
|
return x; // [N, out_channels, h/2, w/2]
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class UpSampleBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int channels;
|
||||||
|
int out_channels;
|
||||||
|
|
||||||
|
public:
|
||||||
|
UpSampleBlock(int channels,
|
||||||
|
int out_channels)
|
||||||
|
: channels(channels),
|
||||||
|
out_channels(out_channels) {
|
||||||
|
blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [N, channels, h, w]
|
||||||
|
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
|
||||||
|
|
||||||
|
x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2]
|
||||||
|
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class ResBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
// network hparams
|
||||||
|
int64_t channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
|
||||||
|
int64_t emb_channels; // time_embed_dim
|
||||||
|
int64_t out_channels; // mult * model_channels
|
||||||
|
std::pair<int, int> kernel_size;
|
||||||
|
int dims;
|
||||||
|
bool skip_t_emb;
|
||||||
|
bool exchange_temb_dims;
|
||||||
|
|
||||||
|
std::shared_ptr<GGMLBlock> conv_nd(int dims,
|
||||||
|
int64_t in_channels,
|
||||||
|
int64_t out_channels,
|
||||||
|
std::pair<int, int> kernel_size,
|
||||||
|
std::pair<int, int> padding) {
|
||||||
|
GGML_ASSERT(dims == 2 || dims == 3);
|
||||||
|
if (dims == 3) {
|
||||||
|
return std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(in_channels, out_channels, kernel_size.first, 1, padding.first));
|
||||||
|
} else {
|
||||||
|
return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
ResBlock(int64_t channels,
|
||||||
|
int64_t emb_channels,
|
||||||
|
int64_t out_channels,
|
||||||
|
std::pair<int, int> kernel_size = {3, 3},
|
||||||
|
int dims = 2,
|
||||||
|
bool exchange_temb_dims = false,
|
||||||
|
bool skip_t_emb = false)
|
||||||
|
: channels(channels),
|
||||||
|
emb_channels(emb_channels),
|
||||||
|
out_channels(out_channels),
|
||||||
|
kernel_size(kernel_size),
|
||||||
|
dims(dims),
|
||||||
|
skip_t_emb(skip_t_emb),
|
||||||
|
exchange_temb_dims(exchange_temb_dims) {
|
||||||
|
std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
|
||||||
|
blocks["in_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
|
||||||
|
// in_layer_1 is nn.SILU()
|
||||||
|
blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
|
||||||
|
|
||||||
|
if (!skip_t_emb) {
|
||||||
|
// emb_layer_0 is nn.SILU()
|
||||||
|
blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
|
||||||
|
// out_layer_1 is nn.SILU()
|
||||||
|
// out_layer_2 is nn.Dropout(), skip for inference
|
||||||
|
blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
|
||||||
|
|
||||||
|
if (out_channels != channels) {
|
||||||
|
blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
|
||||||
|
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
|
||||||
|
// [N, c, t, h, w] => [N, c, t, h * w]
|
||||||
|
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
|
||||||
|
// emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
|
||||||
|
auto in_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
|
||||||
|
auto in_layers_2 = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
|
||||||
|
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
|
||||||
|
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
|
||||||
|
|
||||||
|
if (emb == NULL) {
|
||||||
|
GGML_ASSERT(skip_t_emb);
|
||||||
|
}
|
||||||
|
|
||||||
|
// in_layers
|
||||||
|
auto h = in_layers_0->forward(ctx, x);
|
||||||
|
h = ggml_silu_inplace(ctx, h);
|
||||||
|
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
|
|
||||||
|
// emb_layers
|
||||||
|
if (!skip_t_emb) {
|
||||||
|
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
|
||||||
|
|
||||||
|
auto emb_out = ggml_silu(ctx, emb);
|
||||||
|
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
|
||||||
|
|
||||||
|
if (dims == 2) {
|
||||||
|
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
|
||||||
|
} else {
|
||||||
|
emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
|
||||||
|
if (exchange_temb_dims) {
|
||||||
|
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
|
||||||
|
emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
|
}
|
||||||
|
|
||||||
|
// out_layers
|
||||||
|
h = out_layers_0->forward(ctx, h);
|
||||||
|
h = ggml_silu_inplace(ctx, h);
|
||||||
|
// dropout, skip for inference
|
||||||
|
h = out_layers_3->forward(ctx, h);
|
||||||
|
|
||||||
|
// skip connection
|
||||||
|
if (out_channels != channels) {
|
||||||
|
auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
|
||||||
|
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
|
}
|
||||||
|
|
||||||
|
h = ggml_add(ctx, h, x);
|
||||||
|
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class GEGLU : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int64_t dim_in;
|
||||||
|
int64_t dim_out;
|
||||||
|
|
||||||
|
void init_params(struct ggml_context* ctx, ggml_type wtype) {
|
||||||
|
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
|
||||||
|
params["proj.bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim_out * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
GEGLU(int64_t dim_in, int64_t dim_out)
|
||||||
|
: dim_in(dim_in), dim_out(dim_out) {}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [ne3, ne2, ne1, dim_in]
|
||||||
|
// return: [ne3, ne2, ne1, dim_out]
|
||||||
|
struct ggml_tensor* w = params["proj.weight"];
|
||||||
|
struct ggml_tensor* b = params["proj.bias"];
|
||||||
|
|
||||||
|
auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
|
||||||
|
auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in]
|
||||||
|
auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ]
|
||||||
|
auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
|
||||||
|
|
||||||
|
auto x_in = x;
|
||||||
|
x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
|
||||||
|
auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
|
||||||
|
|
||||||
|
gate = ggml_gelu_inplace(ctx, gate);
|
||||||
|
|
||||||
|
x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
|
||||||
|
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class FeedForward : public GGMLBlock {
|
||||||
|
public:
|
||||||
|
FeedForward(int64_t dim,
|
||||||
|
int64_t dim_out,
|
||||||
|
int64_t mult = 4) {
|
||||||
|
int64_t inner_dim = dim * mult;
|
||||||
|
|
||||||
|
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
|
||||||
|
// net_1 is nn.Dropout(), skip for inference
|
||||||
|
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [ne3, ne2, ne1, dim]
|
||||||
|
// return: [ne3, ne2, ne1, dim_out]
|
||||||
|
|
||||||
|
auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
|
||||||
|
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
|
||||||
|
|
||||||
|
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
|
||||||
|
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class CrossAttention : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int64_t query_dim;
|
||||||
|
int64_t context_dim;
|
||||||
|
int64_t n_head;
|
||||||
|
int64_t d_head;
|
||||||
|
|
||||||
|
public:
|
||||||
|
CrossAttention(int64_t query_dim,
|
||||||
|
int64_t context_dim,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t d_head)
|
||||||
|
: n_head(n_head),
|
||||||
|
d_head(d_head),
|
||||||
|
query_dim(query_dim),
|
||||||
|
context_dim(context_dim) {
|
||||||
|
int64_t inner_dim = d_head * n_head;
|
||||||
|
|
||||||
|
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
|
||||||
|
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||||
|
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
|
||||||
|
|
||||||
|
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
|
||||||
|
// to_out_1 is nn.Dropout(), skip for inference
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
|
||||||
|
// x: [N, n_token, query_dim]
|
||||||
|
// context: [N, n_context, context_dim]
|
||||||
|
// return: [N, n_token, query_dim]
|
||||||
|
auto to_q = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
|
||||||
|
auto to_k = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
|
||||||
|
auto to_v = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
|
||||||
|
auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
|
||||||
|
|
||||||
|
int64_t n = x->ne[2];
|
||||||
|
int64_t n_token = x->ne[1];
|
||||||
|
int64_t n_context = context->ne[1];
|
||||||
|
int64_t inner_dim = d_head * n_head;
|
||||||
|
|
||||||
|
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
|
||||||
|
q = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, n); // [N, n_token, n_head, d_head]
|
||||||
|
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
|
||||||
|
q = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * n); // [N * n_head, n_token, d_head]
|
||||||
|
|
||||||
|
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
|
||||||
|
k = ggml_reshape_4d(ctx, k, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
|
||||||
|
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_context, d_head]
|
||||||
|
k = ggml_reshape_3d(ctx, k, d_head, n_context, n_head * n); // [N * n_head, n_context, d_head]
|
||||||
|
|
||||||
|
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
|
||||||
|
v = ggml_reshape_4d(ctx, v, d_head, n_head, n_context, n); // [N, n_context, n_head, d_head]
|
||||||
|
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_context]
|
||||||
|
v = ggml_reshape_3d(ctx, v, n_context, d_head, n_head * n); // [N * n_head, d_head, n_context]
|
||||||
|
|
||||||
|
auto kqv = ggml_nn_attention(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
|
||||||
|
kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, n);
|
||||||
|
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]
|
||||||
|
|
||||||
|
x = ggml_reshape_3d(ctx, kqv, d_head * n_head, n_token, n); // [N, n_token, inner_dim]
|
||||||
|
|
||||||
|
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class BasicTransformerBlock : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int64_t n_head;
|
||||||
|
int64_t d_head;
|
||||||
|
bool ff_in;
|
||||||
|
|
||||||
|
public:
|
||||||
|
BasicTransformerBlock(int64_t dim,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t d_head,
|
||||||
|
int64_t context_dim,
|
||||||
|
bool ff_in = false)
|
||||||
|
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
|
||||||
|
// disable_self_attn is always False
|
||||||
|
// disable_temporal_crossattention is always False
|
||||||
|
// switch_temporal_ca_to_sa is always False
|
||||||
|
// inner_dim is always None or equal to dim
|
||||||
|
// gated_ff is always True
|
||||||
|
blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
|
||||||
|
blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
|
||||||
|
blocks["ff"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
|
||||||
|
blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
|
||||||
|
if (ff_in) {
|
||||||
|
blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
|
||||||
|
blocks["ff_in"] = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
|
||||||
|
// x: [N, n_token, query_dim]
|
||||||
|
// context: [N, n_context, context_dim]
|
||||||
|
// return: [N, n_token, query_dim]
|
||||||
|
|
||||||
|
auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
|
||||||
|
auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
|
||||||
|
auto ff = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
|
||||||
|
auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
|
||||||
|
auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
|
||||||
|
auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
|
||||||
|
|
||||||
|
if (ff_in) {
|
||||||
|
auto norm_in = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
|
||||||
|
auto ff_in = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);
|
||||||
|
|
||||||
|
auto x_skip = x;
|
||||||
|
x = norm_in->forward(ctx, x);
|
||||||
|
x = ff_in->forward(ctx, x);
|
||||||
|
// self.is_res is always True
|
||||||
|
x = ggml_add(ctx, x, x_skip);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto r = x;
|
||||||
|
x = norm1->forward(ctx, x);
|
||||||
|
x = attn1->forward(ctx, x, x); // self-attention
|
||||||
|
x = ggml_add(ctx, x, r);
|
||||||
|
r = x;
|
||||||
|
x = norm2->forward(ctx, x);
|
||||||
|
x = attn2->forward(ctx, x, context); // cross-attention
|
||||||
|
x = ggml_add(ctx, x, r);
|
||||||
|
r = x;
|
||||||
|
x = norm3->forward(ctx, x);
|
||||||
|
x = ff->forward(ctx, x);
|
||||||
|
x = ggml_add(ctx, x, r);
|
||||||
|
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class SpatialTransformer : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
int64_t in_channels; // mult * model_channels
|
||||||
|
int64_t n_head;
|
||||||
|
int64_t d_head;
|
||||||
|
int64_t depth = 1; // 1
|
||||||
|
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_2_x
|
||||||
|
|
||||||
|
public:
|
||||||
|
SpatialTransformer(int64_t in_channels,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t d_head,
|
||||||
|
int64_t depth,
|
||||||
|
int64_t context_dim)
|
||||||
|
: in_channels(in_channels),
|
||||||
|
n_head(n_head),
|
||||||
|
d_head(d_head),
|
||||||
|
depth(depth),
|
||||||
|
context_dim(context_dim) {
|
||||||
|
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
|
||||||
|
// disable_self_attn is always False
|
||||||
|
int64_t inner_dim = n_head * d_head; // in_channels
|
||||||
|
blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
|
||||||
|
blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
|
||||||
|
|
||||||
|
for (int i = 0; i < depth; i++) {
|
||||||
|
std::string name = "transformer_blocks." + std::to_string(i);
|
||||||
|
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
|
||||||
|
// x: [N, in_channels, h, w]
|
||||||
|
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
|
||||||
|
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
|
||||||
|
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
|
||||||
|
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
|
||||||
|
|
||||||
|
auto x_in = x;
|
||||||
|
int64_t n = x->ne[3];
|
||||||
|
int64_t h = x->ne[1];
|
||||||
|
int64_t w = x->ne[0];
|
||||||
|
int64_t inner_dim = n_head * d_head;
|
||||||
|
|
||||||
|
x = norm->forward(ctx, x);
|
||||||
|
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
|
||||||
|
|
||||||
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
|
||||||
|
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
|
||||||
|
|
||||||
|
for (int i = 0; i < depth; i++) {
|
||||||
|
std::string name = "transformer_blocks." + std::to_string(i);
|
||||||
|
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
|
||||||
|
|
||||||
|
x = transformer_block->forward(ctx, x, context);
|
||||||
|
}
|
||||||
|
|
||||||
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
|
||||||
|
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
|
||||||
|
|
||||||
|
// proj_out
|
||||||
|
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
|
||||||
|
|
||||||
|
x = ggml_add(ctx, x, x_in);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class AlphaBlender : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
void init_params(struct ggml_context* ctx, ggml_type wtype) {
|
||||||
|
params["mix_factor"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
float get_alpha() {
|
||||||
|
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
|
||||||
|
// so learned_with_images is same as learned
|
||||||
|
float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
|
||||||
|
return sigmoid(alpha);
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
AlphaBlender() {
|
||||||
|
// merge_strategy is always learned_with_images
|
||||||
|
// for inference, we don't need to set alpha
|
||||||
|
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||||
|
struct ggml_tensor* x_spatial,
|
||||||
|
struct ggml_tensor* x_temporal) {
|
||||||
|
// image_only_indicator is always tensor([0.])
|
||||||
|
float alpha = get_alpha();
|
||||||
|
auto x = ggml_add(ctx,
|
||||||
|
ggml_scale(ctx, x_spatial, alpha),
|
||||||
|
ggml_scale(ctx, x_temporal, 1.0f - alpha));
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ResBlock variant for video models (SVD): runs the normal spatial ResBlock,
// then a temporal ResBlock over the frame axis, and mixes the two results
// with a learned AlphaBlender gate.
class VideoResBlock : public ResBlock {
public:
    // video_kernel_size is accepted for API parity but is not used in this
    // implementation (the temporal stack is built with fixed dims=3 below).
    VideoResBlock(int channels,
                  int emb_channels,
                  int out_channels,
                  std::pair<int, int> kernel_size = {3, 3},
                  int64_t video_kernel_size       = 3,
                  int dims                        = 2)  // always 2
        : ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
        // temporal ResBlock: dims=3, last arg presumably enables temporal/video
        // mode — TODO confirm against ResBlock's constructor signature.
        blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));
        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
    }

    // x:   [N, channels, h, w] where N = b * num_video_frames
    // emb: [N, emb_channels]
    // Returns [N, out_channels, h, w] (same batch layout as input).
    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* emb,
                                int num_video_frames) {
        // x: [N, channels, h, w] aka [b*t, channels, h, w]
        // emb: [N, emb_channels] aka [b*t, emb_channels]
        // image_only_indicator is always tensor([0.])
        auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
        auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);

        // Spatial pass first (standard ResBlock).
        x = ResBlock::forward(ctx, x, emb);

        // Split the fused batch dimension back into (batch, time).
        int64_t T = num_video_frames;
        int64_t B = x->ne[3] / T;
        int64_t C = x->ne[2];
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

        x          = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
        x          = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        auto x_mix = x;  // spatial result kept for the alpha blend below

        emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...

        // Temporal pass over the frame axis.
        x = time_stack->forward(ctx, x, emb);  // b t c (h w)

        // Learned blend of spatial (x_mix) and temporal (x) results.
        x = time_mixer->forward(ctx, x_mix, x);  // b t c (h w)

        // Restore the original (b t) c h w layout.
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w

        return x;
    }
};
|
||||||
|
|
||||||
|
#endif // __COMMON_HPP__
|
466
otherarch/sdcpp/control.hpp
Normal file
466
otherarch/sdcpp/control.hpp
Normal file
|
@ -0,0 +1,466 @@
|
||||||
|
#ifndef __CONTROL_HPP__
|
||||||
|
#define __CONTROL_HPP__
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#include "model.h"
|
||||||
|
|
||||||
|
#define CONTROL_NET_GRAPH_SIZE 1536
|
||||||
|
|
||||||
|
/*
|
||||||
|
=================================== ControlNet ===================================
|
||||||
|
Reference: https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/cldm/cldm.py
|
||||||
|
|
||||||
|
*/
|
||||||
|
// ControlNet UNet-encoder clone: mirrors the diffusion UNet's input/middle
// blocks, adds a hint-image embedding path, and taps every stage through a
// 1x1 "zero conv" to produce the control residuals that are later added into
// the main UNet. Block names must match the checkpoint tensor names exactly.
class ControlNetBlock : public GGMLBlock {
protected:
    SDVersion version = VERSION_1_x;
    // network hparams
    int in_channels                        = 4;
    int out_channels                       = 4;
    int hint_channels                      = 3;   // RGB control image
    int num_res_blocks                     = 2;
    std::vector<int> attention_resolutions = {4, 2, 1};
    std::vector<int> channel_mult          = {1, 2, 4, 4};
    std::vector<int> transformer_depth     = {1, 1, 1, 1};
    int time_embed_dim                     = 1280;  // model_channels*4
    int num_heads                          = 8;
    int num_head_channels                  = -1;   // channels // num_heads
    int context_dim                        = 768;  // 1024 for VERSION_2_x, 2048 for VERSION_XL

public:
    int model_channels  = 320;
    int adm_in_channels = 2816;  // only for VERSION_XL

    // Adjusts hyperparameters per SD version, then registers every sub-block
    // (keys mirror the Python ControlNet module paths).
    ControlNetBlock(SDVersion version = VERSION_1_x)
        : version(version) {
        if (version == VERSION_2_x) {
            context_dim       = 1024;
            num_head_channels = 64;
            num_heads         = -1;  // derived from num_head_channels instead
        } else if (version == VERSION_XL) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
            channel_mult          = {1, 2, 4};
            transformer_depth     = {1, 2, 10};
            num_head_channels     = 64;
            num_heads             = -1;
        } else if (version == VERSION_SVD) {
            in_channels       = 8;
            out_channels      = 4;
            context_dim       = 1024;
            adm_in_channels   = 768;
            num_head_channels = 64;
            num_heads         = -1;
        }

        // Timestep embedding MLP: Linear -> SiLU -> Linear.
        blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
        // time_embed_1 is nn.SiLU()
        blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

        // Extra conditioning (y) embedding for SDXL/SVD.
        if (version == VERSION_XL || version == VERSION_SVD) {
            blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
            // label_emb_1 is nn.SiLU()
            blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
        }

        // input_blocks
        blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));

        std::vector<int> input_block_chans;
        input_block_chans.push_back(model_channels);
        int ch              = model_channels;  // running channel count through the encoder
        int input_block_idx = 0;
        int ds              = 1;               // running downsample factor

        auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
            return new ResBlock(channels, emb_channels, out_channels);
        };

        auto get_attention_layer = [&](int64_t in_channels,
                                       int64_t n_head,
                                       int64_t d_head,
                                       int64_t depth,
                                       int64_t context_dim) -> SpatialTransformer* {
            return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
        };

        // "zero conv": 1x1 conv whose weights are zero-initialized in the
        // checkpoint; here it is just a plain 1x1 Conv2d that loads those weights.
        auto make_zero_conv = [&](int64_t channels) {
            return new Conv2d(channels, channels, {1, 1});
        };

        blocks["zero_convs.0.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(model_channels));

        // Hint encoder: alternating 3x3 convs and SiLU, downsampling the hint
        // image 8x (three stride-2 convs) up to model_channels.
        blocks["input_hint_block.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.6"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.8"] = std::shared_ptr<GGMLBlock>(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.10"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.12"] = std::shared_ptr<GGMLBlock>(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}));
        // nn.SiLU()
        blocks["input_hint_block.14"] = std::shared_ptr<GGMLBlock>(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1}));

        // Encoder stages: num_res_blocks ResBlocks per channel multiplier,
        // optional attention at the configured resolutions, one zero conv per
        // stage, and a downsampler between stages.
        size_t len_mults = channel_mult.size();
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));

                ch = mult * model_channels;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    int n_head = num_heads;
                    int d_head = ch / num_heads;
                    if (num_head_channels != -1) {
                        d_head = num_head_channels;
                        n_head = ch / d_head;
                    }
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                      n_head,
                                                                                      d_head,
                                                                                      transformer_depth[i],
                                                                                      context_dim));
                }
                blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
                input_block_chans.push_back(ch);
            }
            if (i != len_mults - 1) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));

                blocks["zero_convs." + std::to_string(input_block_idx) + ".0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));

                input_block_chans.push_back(ch);
                ds *= 2;
            }
        }

        // middle blocks
        int n_head = num_heads;
        int d_head = ch / num_heads;
        if (num_head_channels != -1) {
            d_head = num_head_channels;
            n_head = ch / d_head;
        }
        blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
        blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                  n_head,
                                                                                  d_head,
                                                                                  transformer_depth[transformer_depth.size() - 1],
                                                                                  context_dim));
        blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));

        // middle_block_out
        blocks["middle_block_out.0"] = std::shared_ptr<GGMLBlock>(make_zero_conv(ch));
    }

    // Convenience dispatch: runs the ResBlock registered under `name`.
    struct ggml_tensor* resblock_forward(std::string name,
                                         struct ggml_context* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb) {
        auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
        return block->forward(ctx, x, emb);
    }

    // Convenience dispatch: runs the SpatialTransformer registered under `name`.
    struct ggml_tensor* attention_layer_forward(std::string name,
                                                struct ggml_context* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context) {
        auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
        return block->forward(ctx, x, context);
    }

    // Encodes the hint image through input_hint_block.{0..14}: even indices
    // are Conv2d blocks, odd indices are SiLU activations.
    // NOTE(review): emb and context are accepted but never used in this body.
    struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
                                                 struct ggml_tensor* hint,
                                                 struct ggml_tensor* emb,
                                                 struct ggml_tensor* context) {
        int num_input_blocks = 15;
        auto h               = hint;
        for (int i = 0; i < num_input_blocks; i++) {
            if (i % 2 == 0) {
                auto block = std::dynamic_pointer_cast<Conv2d>(blocks["input_hint_block." + std::to_string(i)]);

                h = block->forward(ctx, h);
            } else {
                h = ggml_silu_inplace(ctx, h);
            }
        }
        return h;
    }

    // Builds the full control graph and returns the control residuals:
    // outs[0]   = guided hint (encoded control image),
    // outs[1..] = zero-conv taps of every input block plus the middle block.
    // Pass a non-NULL guided_hint to reuse a cached hint encoding (hint may
    // then be NULL).
    std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
                                             struct ggml_tensor* x,
                                             struct ggml_tensor* hint,
                                             struct ggml_tensor* guided_hint,
                                             struct ggml_tensor* timesteps,
                                             struct ggml_tensor* context,
                                             struct ggml_tensor* y = NULL) {
        // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
        // timesteps: [N,]
        // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
        // Broadcast context/y across the batch if they were given with batch 1.
        if (context != NULL) {
            if (context->ne[2] != x->ne[3]) {
                context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }

        if (y != NULL) {
            if (y->ne[1] != x->ne[3]) {
                y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }

        auto time_embed_0     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
        auto time_embed_2     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
        auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);
        auto zero_convs_0     = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs.0.0"]);

        auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);

        auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]

        auto emb = time_embed_0->forward(ctx, t_emb);
        emb      = ggml_silu_inplace(ctx, emb);
        emb      = time_embed_2->forward(ctx, emb);  // [N, time_embed_dim]

        // SDXL/SVD
        if (y != NULL) {
            auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);

            auto label_emb = label_embed_0->forward(ctx, y);
            label_emb      = ggml_silu_inplace(ctx, label_emb);
            label_emb      = label_embed_2->forward(ctx, label_emb);  // [N, time_embed_dim]

            emb = ggml_add(ctx, emb, label_emb);  // [N, time_embed_dim]
        }

        std::vector<struct ggml_tensor*> outs;

        // Encode the hint image unless the caller supplied a cached encoding.
        if (guided_hint == NULL) {
            guided_hint = input_hint_block_forward(ctx, hint, emb, context);
        }
        outs.push_back(guided_hint);

        // input_blocks

        // input block 0: stem conv plus the hint injection.
        auto h = input_blocks_0_0->forward(ctx, x);
        h      = ggml_add(ctx, h, guided_hint);
        outs.push_back(zero_convs_0->forward(ctx, h));

        // input block 1-11
        size_t len_mults    = channel_mult.size();
        int input_block_idx = 0;
        int ds              = 1;
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                h                = resblock_forward(name, ctx, h, emb);  // [N, mult*model_channels, h, w]
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    h                = attention_layer_forward(name, ctx, h, context);  // [N, mult*model_channels, h, w]
                }

                auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);

                outs.push_back(zero_conv->forward(ctx, h));
            }
            if (i != len_mults - 1) {
                ds *= 2;
                input_block_idx += 1;

                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                auto block       = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);

                h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]

                auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);

                outs.push_back(zero_conv->forward(ctx, h));
            }
        }
        // [N, 4*model_channels, h/8, w/8]

        // middle_block
        h = resblock_forward("middle_block.0", ctx, h, emb);            // [N, 4*model_channels, h/8, w/8]
        h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
        h = resblock_forward("middle_block.2", ctx, h, emb);            // [N, 4*model_channels, h/8, w/8]

        // out
        outs.push_back(middle_block_out->forward(ctx, h));
        return outs;
    }
};
|
||||||
|
|
||||||
|
// Runtime wrapper around ControlNetBlock: owns the parameter/compute contexts
// (via GGMLModule), keeps the control outputs resident in backend memory, and
// caches the encoded hint image across diffusion steps.
struct ControlNet : public GGMLModule {
    SDVersion version = VERSION_1_x;
    ControlNetBlock control_net;

    ggml_backend_buffer_t control_buffer = NULL;  // keep control output tensors in backend memory
    ggml_context* control_ctx            = NULL;
    std::vector<struct ggml_tensor*> controls;    // (12 input block outputs, 1 middle block output) SD 1.5
    struct ggml_tensor* guided_hint = NULL;       // guided_hint cache, for faster inference
    bool guided_hint_cached         = false;

    ControlNet(ggml_backend_t backend,
               ggml_type wtype,
               SDVersion version = VERSION_1_x)
        : GGMLModule(backend, wtype), control_net(version) {
        control_net.init(params_ctx, wtype);
    }

    ~ControlNet() {
        free_control_ctx();
    }

    // Allocates persistent backend tensors mirroring the graph outputs:
    // outs[0] becomes the guided_hint cache, outs[1..] become `controls`.
    // Called lazily from build_graph() on the first forward pass.
    void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
        struct ggml_init_params params;
        params.mem_size   = static_cast<size_t>(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
        params.mem_buffer = NULL;
        params.no_alloc   = true;  // metadata only; data lives in control_buffer
        control_ctx       = ggml_init(params);

        controls.resize(outs.size() - 1);

        size_t control_buffer_size = 0;

        guided_hint = ggml_dup_tensor(control_ctx, outs[0]);
        control_buffer_size += ggml_nbytes(guided_hint);

        for (int i = 0; i < outs.size() - 1; i++) {
            controls[i] = ggml_dup_tensor(control_ctx, outs[i + 1]);
            control_buffer_size += ggml_nbytes(controls[i]);
        }

        control_buffer = ggml_backend_alloc_ctx_tensors(control_ctx, backend);

        LOG_DEBUG("control buffer size %.2fMB", control_buffer_size * 1.f / 1024.f / 1024.f);
    }

    // Releases the control output buffer/context and resets the hint cache.
    // Safe to call repeatedly (idempotent).
    void free_control_ctx() {
        if (control_buffer != NULL) {
            ggml_backend_buffer_free(control_buffer);
            control_buffer = NULL;
        }
        if (control_ctx != NULL) {
            ggml_free(control_ctx);
            control_ctx = NULL;
        }
        guided_hint        = NULL;
        guided_hint_cached = false;
        controls.clear();
    }

    std::string get_desc() {
        return "control_net";
    }

    size_t get_params_mem_size() {
        return control_net.get_params_mem_size();
    }

    size_t get_params_num() {
        return control_net.get_params_num();
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        control_net.get_param_tensors(tensors, prefix);
    }

    // Builds the compute graph for one step. If the hint encoding is already
    // cached, the hint input is dropped and the cached guided_hint tensor is
    // fed into the block instead. Graph outputs are copied into the
    // persistent guided_hint/controls tensors.
    struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                    struct ggml_tensor* hint,
                                    struct ggml_tensor* timesteps,
                                    struct ggml_tensor* context,
                                    struct ggml_tensor* y = NULL) {
        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);

        x = to_backend(x);
        if (guided_hint_cached) {
            hint = NULL;  // skip re-encoding; the cached tensor is used below
        } else {
            hint = to_backend(hint);
        }
        context   = to_backend(context);
        y         = to_backend(y);
        timesteps = to_backend(timesteps);

        auto outs = control_net.forward(compute_ctx,
                                        x,
                                        hint,
                                        guided_hint_cached ? guided_hint : NULL,
                                        timesteps,
                                        context,
                                        y);

        // First pass: size the persistent output tensors from this graph.
        if (control_ctx == NULL) {
            alloc_control_ctx(outs);
        }

        ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[0], guided_hint));
        for (int i = 0; i < outs.size() - 1; i++) {
            ggml_build_forward_expand(gf, ggml_cpy(compute_ctx, outs[i + 1], controls[i]));
        }

        return gf;
    }

    // Runs one ControlNet evaluation. After the first call the hint encoding
    // is marked cached, so later calls with the same hint skip re-encoding —
    // call free_control_ctx() before switching to a different hint image.
    void compute(int n_threads,
                 struct ggml_tensor* x,
                 struct ggml_tensor* hint,
                 struct ggml_tensor* timesteps,
                 struct ggml_tensor* context,
                 struct ggml_tensor* y,
                 struct ggml_tensor** output     = NULL,
                 struct ggml_context* output_ctx = NULL) {
        // x: [N, in_channels, h, w]
        // timesteps: [N, ]
        // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x, hint, timesteps, context, y);
        };

        GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
        guided_hint_cached = true;
    }

    // Loads all ControlNet tensors from a model file into backend memory.
    // Returns false (after logging) if the file cannot be opened or tensors
    // fail to load.
    bool load_from_file(const std::string& file_path) {
        LOG_INFO("loading control net from '%s'", file_path.c_str());
        alloc_params_buffer();
        std::map<std::string, ggml_tensor*> tensors;
        control_net.get_param_tensors(tensors);
        std::set<std::string> ignore_tensors;

        ModelLoader model_loader;
        if (!model_loader.init_from_file(file_path)) {
            LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

        bool success = model_loader.load_tensors(tensors, backend, ignore_tensors);

        if (!success) {
            LOG_ERROR("load control net tensors from model loader failed");
            return false;
        }

        LOG_INFO("control net model loaded");
        return success;
    }
};
|
||||||
|
|
||||||
|
#endif // __CONTROL_HPP__
|
125
otherarch/sdcpp/denoiser.hpp
Normal file
125
otherarch/sdcpp/denoiser.hpp
Normal file
|
@ -0,0 +1,125 @@
|
||||||
|
#ifndef __DENOISER_HPP__
|
||||||
|
#define __DENOISER_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
/*================================================= CompVisDenoiser ==================================================*/
|
||||||
|
|
||||||
|
// Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
|
||||||
|
|
||||||
|
#define TIMESTEPS 1000
|
||||||
|
|
||||||
|
// Base class for noise schedules. Holds per-timestep tables (filled in
// elsewhere — not in this file's visible code) and converts between
// continuous timestep t and noise level sigma by linear interpolation in
// log-sigma space.
struct SigmaSchedule {
    float alphas_cumprod[TIMESTEPS];
    float sigmas[TIMESTEPS];
    float log_sigmas[TIMESTEPS];

    // Returns n+1 sigma values for a sampling run (implementation-defined
    // spacing; the last entry is conventionally 0 in the subclasses below).
    virtual std::vector<float> get_sigmas(uint32_t n) = 0;

    // Maps a sigma to a fractional timestep index by locating the bracketing
    // pair in log_sigmas and interpolating. Assumes log_sigmas is sorted so
    // that (log_sigma - log_sigmas[i]) changes sign exactly once — TODO
    // confirm against the code that fills the table.
    float sigma_to_t(float sigma) {
        float log_sigma = std::log(sigma);
        std::vector<float> dists;
        dists.reserve(TIMESTEPS);
        for (float log_sigma_val : log_sigmas) {
            dists.push_back(log_sigma - log_sigma_val);
        }

        // Count non-negative distances to find the lower bracket index.
        int low_idx = 0;
        for (size_t i = 0; i < TIMESTEPS; i++) {
            if (dists[i] >= 0) {
                low_idx++;
            }
        }
        // Clamp so both low_idx and low_idx+1 are valid table indices.
        low_idx      = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
        int high_idx = low_idx + 1;

        float low  = log_sigmas[low_idx];
        float high = log_sigmas[high_idx];
        float w    = (low - log_sigma) / (low - high);
        w          = std::max(0.f, std::min(1.f, w));  // keep t within the bracket
        float t    = (1.0f - w) * low_idx + w * high_idx;

        return t;
    }

    // Inverse of sigma_to_t: linear interpolation of log_sigmas at the
    // fractional index t, then exponentiate. t must lie in [0, TIMESTEPS-1].
    float t_to_sigma(float t) {
        int low_idx     = static_cast<int>(std::floor(t));
        int high_idx    = static_cast<int>(std::ceil(t));
        float w         = t - static_cast<float>(low_idx);
        float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
        return std::exp(log_sigma);
    }
};
|
||||||
|
|
||||||
|
struct DiscreteSchedule : SigmaSchedule {
|
||||||
|
std::vector<float> get_sigmas(uint32_t n) {
|
||||||
|
std::vector<float> result;
|
||||||
|
|
||||||
|
int t_max = TIMESTEPS - 1;
|
||||||
|
|
||||||
|
if (n == 0) {
|
||||||
|
return result;
|
||||||
|
} else if (n == 1) {
|
||||||
|
result.push_back(t_to_sigma((float)t_max));
|
||||||
|
result.push_back(0);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
float step = static_cast<float>(t_max) / static_cast<float>(n - 1);
|
||||||
|
for (uint32_t i = 0; i < n; ++i) {
|
||||||
|
float t = t_max - step * i;
|
||||||
|
result.push_back(t_to_sigma(t));
|
||||||
|
}
|
||||||
|
result.push_back(0);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Karras et al. (2022) sigma schedule, Eq. (5): sigmas follow a rho-power
// interpolation between sigma_max and sigma_min, with a trailing 0 appended.
struct KarrasSchedule : SigmaSchedule {
    // Returns n+1 sigmas: n Karras-spaced values plus a final 0.
    std::vector<float> get_sigmas(uint32_t n) {
        // These *COULD* be function arguments here,
        // but does anybody ever bother to touch them?
        float sigma_min = 0.1f;
        float sigma_max = 10.f;
        float rho       = 7.f;

        std::vector<float> result(n + 1);

        float min_inv_rho = pow(sigma_min, (1.f / rho));
        float max_inv_rho = pow(sigma_max, (1.f / rho));
        for (uint32_t i = 0; i < n; i++) {
            // Eq. (5) from Karras et al 2022.
            // For n == 1 the original expression i / (n - 1) was 0/0 -> NaN;
            // a single step should start at sigma_max (ramp = 0).
            float ramp = (n > 1) ? (float)i / ((float)n - 1.f) : 0.f;
            result[i]  = pow(max_inv_rho + ramp * (min_inv_rho - max_inv_rho), rho);
        }
        result[n] = 0.;
        return result;
    }
};
|
||||||
|
|
||||||
|
// Base class for k-diffusion denoiser wrappers: owns a sigma schedule
// (DiscreteSchedule by default) and exposes the per-sigma scaling factors
// applied around the model call.
struct Denoiser {
    std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
    // Returns the scaling coefficients for a given sigma; the number and
    // meaning of entries is subclass-defined ({c_out, c_in} or
    // {c_skip, c_out, c_in} in the implementations below).
    virtual std::vector<float> get_scalings(float sigma) = 0;
};
|
||||||
|
|
||||||
|
// Eps-prediction denoiser scalings (k-diffusion CompVisDenoiser):
// returns {c_out, c_in} with c_out = -sigma and c_in = 1/sqrt(sigma^2 + sigma_data^2).
struct CompVisDenoiser : public Denoiser {
    float sigma_data = 1.0f;

    std::vector<float> get_scalings(float sigma) {
        const float total_var = sigma * sigma + sigma_data * sigma_data;
        return {-sigma, 1.0f / std::sqrt(total_var)};
    }
};
|
||||||
|
|
||||||
|
// V-prediction denoiser scalings (k-diffusion CompVisVDenoiser):
// returns {c_skip, c_out, c_in} derived from sigma and sigma_data.
struct CompVisVDenoiser : public Denoiser {
    float sigma_data = 1.0f;

    std::vector<float> get_scalings(float sigma) {
        const float sd2       = sigma_data * sigma_data;
        const float total_var = sigma * sigma + sd2;
        const float c_skip    = sd2 / total_var;
        const float c_out     = -sigma * sigma_data / std::sqrt(total_var);
        const float c_in      = 1.0f / std::sqrt(total_var);
        return {c_skip, c_out, c_in};
    }
};
|
||||||
|
|
||||||
|
#endif // __DENOISER_HPP__
|
206
otherarch/sdcpp/esrgan.hpp
Normal file
206
otherarch/sdcpp/esrgan.hpp
Normal file
|
@ -0,0 +1,206 @@
|
||||||
|
#ifndef __ESRGAN_HPP__
|
||||||
|
#define __ESRGAN_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#include "model.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
=================================== ESRGAN ===================================
|
||||||
|
References:
|
||||||
|
https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
|
||||||
|
https://github.com/XPixelGroup/BasicSR/blob/v1.4.2/basicsr/archs/rrdbnet_arch.py
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ESRGAN Residual Dense Block: five 3x3 convs where each conv sees the
// concatenation of the input and all previous conv outputs (dense
// connectivity), finished by a 0.2-scaled residual connection.
class ResidualDenseBlock : public GGMLBlock {
protected:
    int num_feat;     // channels in/out of the block
    int num_grow_ch;  // growth channels added by each dense conv

public:
    ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32)
        : num_feat(num_feat), num_grow_ch(num_grow_ch) {
        // convK input channels = num_feat + (K-1) * num_grow_ch (dense concat);
        // conv5 projects back to num_feat.
        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv3"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv4"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv5"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
    }

    // LeakyReLU with negative slope 0.2 (in-place), as used throughout ESRGAN.
    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
        return ggml_leaky_relu(ctx, x, 0.2f, true);
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [n, num_feat, h, w]
        // return: [n, num_feat, h, w]

        auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
        auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);
        auto conv3 = std::dynamic_pointer_cast<Conv2d>(blocks["conv3"]);
        auto conv4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv4"]);
        auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);

        // Dense chain: each step concatenates its output onto the running
        // feature stack before feeding the next conv.
        auto x1    = lrelu(ctx, conv1->forward(ctx, x));
        auto x_cat = ggml_concat(ctx, x, x1);
        auto x2    = lrelu(ctx, conv2->forward(ctx, x_cat));
        x_cat      = ggml_concat(ctx, x_cat, x2);
        auto x3    = lrelu(ctx, conv3->forward(ctx, x_cat));
        x_cat      = ggml_concat(ctx, x_cat, x3);
        auto x4    = lrelu(ctx, conv4->forward(ctx, x_cat));
        x_cat      = ggml_concat(ctx, x_cat, x4);
        auto x5    = conv5->forward(ctx, x_cat);

        // Residual scaling by 0.2 (ESRGAN convention).
        x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
        return x5;
    }
};
|
||||||
|
|
||||||
|
class RRDB : public GGMLBlock {
|
||||||
|
public:
|
||||||
|
RRDB(int num_feat, int num_grow_ch = 32) {
|
||||||
|
blocks["rdb1"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
|
||||||
|
blocks["rdb2"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
|
||||||
|
blocks["rdb3"] = std::shared_ptr<GGMLBlock>(new ResidualDenseBlock(num_feat, num_grow_ch));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [n, num_feat, h, w]
|
||||||
|
// return: [n, num_feat, h, w]
|
||||||
|
|
||||||
|
auto rdb1 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb1"]);
|
||||||
|
auto rdb2 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb2"]);
|
||||||
|
auto rdb3 = std::dynamic_pointer_cast<ResidualDenseBlock>(blocks["rdb3"]);
|
||||||
|
|
||||||
|
auto out = rdb1->forward(ctx, x);
|
||||||
|
out = rdb2->forward(ctx, out);
|
||||||
|
out = rdb3->forward(ctx, out);
|
||||||
|
|
||||||
|
out = ggml_add(ctx, ggml_scale(ctx, out, 0.2f), x);
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ESRGAN generator (RRDBNet), configured with the RealESRGAN_x4plus_anime_6B
// defaults: conv_first -> num_block RRDBs -> conv_body (+ long residual) ->
// two 2x nearest-upscale + conv stages -> conv_hr -> conv_last, for 4x
// super-resolution.
class RRDBNet : public GGMLBlock {
protected:
    int scale       = 4;   // default RealESRGAN_x4plus_anime_6B
    int num_block   = 6;   // default RealESRGAN_x4plus_anime_6B
    int num_in_ch   = 3;
    int num_out_ch  = 3;
    int num_feat    = 64;  // default RealESRGAN_x4plus_anime_6B
    int num_grow_ch = 32;  // default RealESRGAN_x4plus_anime_6B

public:
    RRDBNet() {
        blocks["conv_first"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}));
        // Trunk: num_block RRDBs named body.0 .. body.(num_block-1) to match
        // the checkpoint tensor names.
        for (int i = 0; i < num_block; i++) {
            std::string name = "body." + std::to_string(i);
            blocks[name]     = std::shared_ptr<GGMLBlock>(new RRDB(num_feat, num_grow_ch));
        }
        blocks["conv_body"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        // upsample
        blocks["conv_up1"]  = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_up2"]  = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_hr"]   = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}));
        blocks["conv_last"] = std::shared_ptr<GGMLBlock>(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}));
    }

    // LeakyReLU with negative slope 0.2 (in-place), as used throughout ESRGAN.
    struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) {
        return ggml_leaky_relu(ctx, x, 0.2f, true);
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [n, num_in_ch, h, w]
        // return: [n, num_out_ch, h*4, w*4]
        auto conv_first = std::dynamic_pointer_cast<Conv2d>(blocks["conv_first"]);
        auto conv_body  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_body"]);
        auto conv_up1   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up1"]);
        auto conv_up2   = std::dynamic_pointer_cast<Conv2d>(blocks["conv_up2"]);
        auto conv_hr    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_hr"]);
        auto conv_last  = std::dynamic_pointer_cast<Conv2d>(blocks["conv_last"]);

        auto feat      = conv_first->forward(ctx, x);
        auto body_feat = feat;
        for (int i = 0; i < num_block; i++) {
            std::string name = "body." + std::to_string(i);
            auto block       = std::dynamic_pointer_cast<RRDB>(blocks[name]);

            body_feat = block->forward(ctx, body_feat);
        }
        body_feat = conv_body->forward(ctx, body_feat);
        // Long skip connection around the whole RRDB trunk.
        feat = ggml_add(ctx, feat, body_feat);
        // upsample: two 2x stages -> 4x total
        feat     = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
        feat     = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
        auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
        return out;
    }
};
|
||||||
|
|
||||||
|
// ESRGAN: GGMLModule wrapper around RRDBNet for post-generation upscaling.
// Owns the network parameters, loads them from a checkpoint file, and runs
// the forward graph on a chosen backend.
struct ESRGAN : public GGMLModule {
    RRDBNet rrdb_net;
    int scale     = 4;    // upscale factor of the wrapped RRDBNet
    int tile_size = 128;  // avoid cuda OOM for 4gb VRAM

    ESRGAN(ggml_backend_t backend,
           ggml_type wtype)
        : GGMLModule(backend, wtype) {
        // allocate the network's parameter tensors in this module's param context
        rrdb_net.init(params_ctx, wtype);
    }

    std::string get_desc() {
        return "esrgan";
    }

    size_t get_params_mem_size() {
        return rrdb_net.get_params_mem_size();
    }

    size_t get_params_num() {
        return rrdb_net.get_params_num();
    }

    // Loads the RRDBNet weights from `file_path` into the backend parameter
    // buffer. Returns false if the file cannot be parsed or tensors fail to load.
    bool load_from_file(const std::string& file_path) {
        LOG_INFO("loading esrgan from '%s'", file_path.c_str());

        alloc_params_buffer();
        // map of checkpoint tensor name -> destination tensor
        std::map<std::string, ggml_tensor*> esrgan_tensors;
        rrdb_net.get_param_tensors(esrgan_tensors);

        ModelLoader model_loader;
        if (!model_loader.init_from_file(file_path)) {
            LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

        bool success = model_loader.load_tensors(esrgan_tensors, backend);

        if (!success) {
            LOG_ERROR("load esrgan tensors from model loader failed");
            return false;
        }

        LOG_INFO("esrgan model loaded");
        return success;
    }

    // Builds the forward compute graph for a single input tensor x
    // (x is first moved to the module's backend).
    struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
        x = to_backend(x);
        struct ggml_tensor* out = rrdb_net.forward(compute_ctx, x);
        ggml_build_forward_expand(gf, out);
        return gf;
    }

    // Runs the upscaler on x; the result tensor is delivered through *output
    // (allocated from output_ctx when provided) by GGMLModule::compute.
    void compute(const int n_threads,
                 struct ggml_tensor* x,
                 ggml_tensor** output,
                 ggml_context* output_ctx = NULL) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
        GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
    }
};
|
||||||
|
|
||||||
|
#endif // __ESRGAN_HPP__
|
1255
otherarch/sdcpp/ggml_extend.hpp
Normal file
1255
otherarch/sdcpp/ggml_extend.hpp
Normal file
File diff suppressed because it is too large
Load diff
164
otherarch/sdcpp/lora.hpp
Normal file
164
otherarch/sdcpp/lora.hpp
Normal file
|
@ -0,0 +1,164 @@
|
||||||
|
#ifndef __LORA_HPP__
|
||||||
|
#define __LORA_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
#define LORA_GRAPH_SIZE 10240
|
||||||
|
|
||||||
|
struct LoraModel : public GGMLModule {
|
||||||
|
float multiplier = 1.0f;
|
||||||
|
std::map<std::string, struct ggml_tensor*> lora_tensors;
|
||||||
|
std::string file_path;
|
||||||
|
ModelLoader model_loader;
|
||||||
|
bool load_failed = false;
|
||||||
|
|
||||||
|
LoraModel(ggml_backend_t backend,
|
||||||
|
ggml_type wtype,
|
||||||
|
const std::string file_path = "")
|
||||||
|
: file_path(file_path), GGMLModule(backend, wtype) {
|
||||||
|
if (!model_loader.init_from_file(file_path)) {
|
||||||
|
load_failed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() {
|
||||||
|
return "lora";
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_num() {
|
||||||
|
return LORA_GRAPH_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_mem_size() {
|
||||||
|
return model_loader.get_params_mem_size(NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool load_from_file() {
|
||||||
|
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
|
||||||
|
|
||||||
|
if (load_failed) {
|
||||||
|
LOG_ERROR("init lora model loader from file failed: '%s'", file_path.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool dry_run = true;
|
||||||
|
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
|
||||||
|
const std::string& name = tensor_storage.name;
|
||||||
|
|
||||||
|
if (dry_run) {
|
||||||
|
struct ggml_tensor* real = ggml_new_tensor(params_ctx,
|
||||||
|
tensor_storage.type,
|
||||||
|
tensor_storage.n_dims,
|
||||||
|
tensor_storage.ne);
|
||||||
|
lora_tensors[name] = real;
|
||||||
|
} else {
|
||||||
|
auto real = lora_tensors[name];
|
||||||
|
*dst_tensor = real;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
model_loader.load_tensors(on_new_tensor_cb, backend);
|
||||||
|
alloc_params_buffer();
|
||||||
|
|
||||||
|
dry_run = false;
|
||||||
|
model_loader.load_tensors(on_new_tensor_cb, backend);
|
||||||
|
|
||||||
|
LOG_DEBUG("finished loaded lora");
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph* build_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
|
||||||
|
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);
|
||||||
|
|
||||||
|
std::set<std::string> applied_lora_tensors;
|
||||||
|
for (auto it : model_tensors) {
|
||||||
|
std::string k_tensor = it.first;
|
||||||
|
struct ggml_tensor* weight = model_tensors[it.first];
|
||||||
|
|
||||||
|
size_t k_pos = k_tensor.find(".weight");
|
||||||
|
if (k_pos == std::string::npos) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
k_tensor = k_tensor.substr(0, k_pos);
|
||||||
|
replace_all_chars(k_tensor, '.', '_');
|
||||||
|
std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
|
||||||
|
std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
|
||||||
|
std::string alpha_name = "lora." + k_tensor + ".alpha";
|
||||||
|
std::string scale_name = "lora." + k_tensor + ".scale";
|
||||||
|
|
||||||
|
ggml_tensor* lora_up = NULL;
|
||||||
|
ggml_tensor* lora_down = NULL;
|
||||||
|
|
||||||
|
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
|
||||||
|
lora_up = lora_tensors[lora_up_name];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
|
||||||
|
lora_down = lora_tensors[lora_down_name];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lora_up == NULL || lora_down == NULL) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
applied_lora_tensors.insert(lora_up_name);
|
||||||
|
applied_lora_tensors.insert(lora_down_name);
|
||||||
|
applied_lora_tensors.insert(alpha_name);
|
||||||
|
applied_lora_tensors.insert(scale_name);
|
||||||
|
|
||||||
|
// calc_cale
|
||||||
|
int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1];
|
||||||
|
float scale_value = 1.0f;
|
||||||
|
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
|
||||||
|
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
|
||||||
|
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
|
||||||
|
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
|
||||||
|
scale_value = alpha / dim;
|
||||||
|
}
|
||||||
|
scale_value *= multiplier;
|
||||||
|
|
||||||
|
// flat lora tensors to multiply it
|
||||||
|
int64_t lora_up_rows = lora_up->ne[ggml_n_dims(lora_up) - 1];
|
||||||
|
lora_up = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
|
||||||
|
int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1];
|
||||||
|
lora_down = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
|
||||||
|
|
||||||
|
// ggml_mul_mat requires tensor b transposed
|
||||||
|
lora_down = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down));
|
||||||
|
struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down);
|
||||||
|
updown = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown));
|
||||||
|
updown = ggml_reshape(compute_ctx, updown, weight);
|
||||||
|
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
|
||||||
|
updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
|
||||||
|
ggml_tensor* final_weight;
|
||||||
|
// if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
|
||||||
|
// final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, weight->n_dims, weight->ne);
|
||||||
|
// final_weight = ggml_cpy_inplace(compute_ctx, weight, final_weight);
|
||||||
|
// final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
|
||||||
|
// final_weight = ggml_cpy_inplace(compute_ctx, final_weight, weight);
|
||||||
|
// } else {
|
||||||
|
// final_weight = ggml_add_inplace(compute_ctx, weight, updown);
|
||||||
|
// }
|
||||||
|
final_weight = ggml_add_inplace(compute_ctx, weight, updown); // apply directly
|
||||||
|
ggml_build_forward_expand(gf, final_weight);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto& kv : lora_tensors) {
|
||||||
|
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
|
||||||
|
LOG_WARN("unused lora tensor %s", kv.first.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
|
||||||
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
|
return build_graph(model_tensors);
|
||||||
|
};
|
||||||
|
GGMLModule::compute(get_graph, n_threads, true);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __LORA_HPP__
|
778
otherarch/sdcpp/main.cpp
Normal file
778
otherarch/sdcpp/main.cpp
Normal file
|
@ -0,0 +1,778 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <random>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// #include "preprocessing.hpp"
|
||||||
|
#include "stable-diffusion.h"
|
||||||
|
|
||||||
|
#define STB_IMAGE_IMPLEMENTATION
|
||||||
|
#include "stb_image.h"
|
||||||
|
|
||||||
|
#define STB_IMAGE_WRITE_IMPLEMENTATION
|
||||||
|
#define STB_IMAGE_WRITE_STATIC
|
||||||
|
#include "stb_image_write.h"
|
||||||
|
|
||||||
|
// Human-readable names for rng_type_t, same order as the enum in stable-diffusion.h
const char* rng_type_to_str[] = {
    "std_default",
    "cuda",
};

// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
    "euler_a",
    "euler",
    "heun",
    "dpm2",
    "dpm++2s_a",
    "dpm++2m",
    "dpm++2mv2",
    "lcm",
};

// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
const char* schedule_str[] = {
    "default",
    "discrete",
    "karras",
};

// CLI run-mode names; must stay in the same order as enum SDMode below,
// since parse_args maps a matched index straight to the enum.
const char* modes_str[] = {
    "txt2img",
    "img2img",
    "img2vid",
    "convert",
};

// Run mode selected by -M/--mode; values index modes_str.
enum SDMode {
    TXT2IMG,
    IMG2IMG,
    IMG2VID,
    CONVERT,
    MODE_COUNT  // number of real modes; loop bound when matching --mode
};
|
||||||
|
|
||||||
|
// All command-line-configurable settings for the CLI, with their defaults.
// Populated by parse_args(); dumped by print_params() under --verbose.
struct SDParams {
    int n_threads = -1;      // <= 0 means auto-detect the number of physical cores (see parse_args)
    SDMode mode   = TXT2IMG;

    // model / asset paths
    std::string model_path;
    std::string vae_path;
    std::string taesd_path;
    std::string esrgan_path;
    std::string controlnet_path;
    std::string embeddings_path;
    sd_type_t wtype = SD_TYPE_COUNT;  // SD_TYPE_COUNT = unspecified: use the weight file's own type
    std::string lora_model_dir;
    std::string output_path = "output.png";
    std::string input_path;           // init image for img2img / img2vid
    std::string control_image_path;

    // generation settings
    std::string prompt;
    std::string negative_prompt;
    float min_cfg   = 1.0f;
    float cfg_scale = 7.0f;
    int clip_skip   = -1;  // <= 0 represents unspecified
    int width       = 512;
    int height      = 512;
    int batch_count = 1;

    // img2vid (SVD) settings
    int video_frames         = 6;
    int motion_bucket_id     = 127;
    int fps                  = 6;
    float augmentation_level = 0.f;

    sample_method_t sample_method = EULER_A;
    schedule_t schedule           = DEFAULT;
    int sample_steps              = 20;
    float strength                = 0.75f;  // img2img noising strength, in [0, 1]
    float control_strength        = 0.9f;
    rng_type_t rng_type           = CUDA_RNG;
    int64_t seed                  = 42;     // < 0 on the CLI means "randomize" (see parse_args)
    bool verbose                  = false;
    bool vae_tiling               = false;
    bool control_net_cpu          = false;
    bool canny_preprocess         = false;
    int upscale_repeats           = 1;      // times to run the ESRGAN upscaler (must be >= 1)
};
|
||||||
|
|
||||||
|
// Dumps every parsed CLI parameter to stdout (used when --verbose is set).
void print_params(SDParams params) {
    printf("Option: \n");
    printf(" n_threads: %d\n", params.n_threads);
    printf(" mode: %s\n", modes_str[params.mode]);
    printf(" model_path: %s\n", params.model_path.c_str());
    printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
    printf(" vae_path: %s\n", params.vae_path.c_str());
    printf(" taesd_path: %s\n", params.taesd_path.c_str());
    printf(" esrgan_path: %s\n", params.esrgan_path.c_str());
    printf(" controlnet_path: %s\n", params.controlnet_path.c_str());
    printf(" embeddings_path: %s\n", params.embeddings_path.c_str());
    printf(" output_path: %s\n", params.output_path.c_str());
    printf(" init_img: %s\n", params.input_path.c_str());
    printf(" control_image: %s\n", params.control_image_path.c_str());
    printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
    printf(" strength(control): %.2f\n", params.control_strength);
    printf(" prompt: %s\n", params.prompt.c_str());
    printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
    printf(" min_cfg: %.2f\n", params.min_cfg);
    printf(" cfg_scale: %.2f\n", params.cfg_scale);
    printf(" clip_skip: %d\n", params.clip_skip);
    printf(" width: %d\n", params.width);
    printf(" height: %d\n", params.height);
    printf(" sample_method: %s\n", sample_method_str[params.sample_method]);
    printf(" schedule: %s\n", schedule_str[params.schedule]);
    printf(" sample_steps: %d\n", params.sample_steps);
    printf(" strength(img2img): %.2f\n", params.strength);
    printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
    // seed is int64_t: "%ld" only matches it on LP64 platforms and misreads
    // the argument on Windows / 32-bit; cast to long long and use %lld.
    printf(" seed: %lld\n", (long long)params.seed);
    printf(" batch_count: %d\n", params.batch_count);
    printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
    printf(" upscale_repeats: %d\n", params.upscale_repeats);
}
|
||||||
|
|
||||||
|
// Prints the CLI help text (all supported arguments) to stdout.
void print_usage(int argc, const char* argv[]) {
    printf("usage: %s [arguments]\n", argv[0]);
    // The remainder of the help text is static, so emit it as one literal.
    static const char help_text[] =
        "\n"
        "arguments:\n"
        " -h, --help show this help message and exit\n"
        " -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n"
        " -t, --threads N number of threads to use during computation (default: -1).\n"
        " If threads <= 0, then threads will be set to the number of CPU physical cores\n"
        " -m, --model [MODEL] path to model\n"
        " --vae [VAE] path to vae\n"
        " --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"
        " --control-net [CONTROL_PATH] path to control net model\n"
        " --embd-dir [EMBEDDING_PATH] path to embeddings.\n"
        " --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n"
        " --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n"
        " --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n"
        " If not specified, the default is the type of the weight file.\n"
        " --lora-model-dir [DIR] lora model directory\n"
        " -i, --init-img [IMAGE] path to the input image, required by img2img\n"
        " --control-image [IMAGE] path to image condition, control net\n"
        " -o, --output OUTPUT path to write result image to (default: ./output.png)\n"
        " -p, --prompt [PROMPT] the prompt to render\n"
        " -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"
        " --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"
        " --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"
        " --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"
        " 1.0 corresponds to full destruction of information in init image\n"
        " -H, --height H image height, in pixel space (default: 512)\n"
        " -W, --width W image width, in pixel space (default: 512)\n"
        " --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n"
        " sampling method (default: \"euler_a\")\n"
        " --steps STEPS number of sample steps (default: 20)\n"
        " --rng {std_default, cuda} RNG (default: cuda)\n"
        " -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"
        " -b, --batch-count COUNT number of images to generate.\n"
        " --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n"
        " --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"
        " <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"
        " --vae-tiling process vae in tiles to reduce memory usage\n"
        " --control-net-cpu keep controlnet in cpu (for low vram)\n"
        " --canny apply canny preprocessor (edge detection)\n"
        " -v, --verbose print extra info\n";
    fputs(help_text, stdout);
}
|
||||||
|
|
||||||
|
// Parses the CLI arguments into `params`, then validates the combination.
// Exits the process (with usage text) on any unknown option, missing option
// value, or invalid combination; on success all fields of `params` are set.
void parse_args(int argc, const char** argv, SDParams& params) {
    bool invalid_arg = false;  // set when an option is missing its value
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

        if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.n_threads = std::stoi(argv[i]);
        } else if (arg == "-M" || arg == "--mode") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            // match the mode name against modes_str; index == enum value
            const char* mode_selected = argv[i];
            int mode_found            = -1;
            for (int d = 0; d < MODE_COUNT; d++) {
                if (!strcmp(mode_selected, modes_str[d])) {
                    mode_found = d;
                }
            }
            if (mode_found == -1) {
                fprintf(stderr,
                        "error: invalid mode %s, must be one of [txt2img, img2img, img2vid, convert]\n",
                        mode_selected);
                exit(1);
            }
            params.mode = (SDMode)mode_found;
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.model_path = argv[i];
        } else if (arg == "--vae") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.vae_path = argv[i];
        } else if (arg == "--taesd") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.taesd_path = argv[i];
        } else if (arg == "--control-net") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.controlnet_path = argv[i];
        } else if (arg == "--upscale-model") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.esrgan_path = argv[i];
        } else if (arg == "--embd-dir") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.embeddings_path = argv[i];
        } else if (arg == "--type") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            // map the weight-type name onto sd_type_t
            std::string type = argv[i];
            if (type == "f32") {
                params.wtype = SD_TYPE_F32;
            } else if (type == "f16") {
                params.wtype = SD_TYPE_F16;
            } else if (type == "q4_0") {
                params.wtype = SD_TYPE_Q4_0;
            } else if (type == "q4_1") {
                params.wtype = SD_TYPE_Q4_1;
            } else if (type == "q5_0") {
                params.wtype = SD_TYPE_Q5_0;
            } else if (type == "q5_1") {
                params.wtype = SD_TYPE_Q5_1;
            } else if (type == "q8_0") {
                params.wtype = SD_TYPE_Q8_0;
            } else {
                fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
                        type.c_str());
                exit(1);
            }
        } else if (arg == "--lora-model-dir") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.lora_model_dir = argv[i];
        } else if (arg == "-i" || arg == "--init-img") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.input_path = argv[i];
        } else if (arg == "--control-image") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.control_image_path = argv[i];
        } else if (arg == "-o" || arg == "--output") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.output_path = argv[i];
        } else if (arg == "-p" || arg == "--prompt") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.prompt = argv[i];
        } else if (arg == "--upscale-repeats") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.upscale_repeats = std::stoi(argv[i]);
            if (params.upscale_repeats < 1) {
                fprintf(stderr, "error: upscale multiplier must be at least 1\n");
                exit(1);
            }
        } else if (arg == "-n" || arg == "--negative-prompt") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.negative_prompt = argv[i];
        } else if (arg == "--cfg-scale") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.cfg_scale = std::stof(argv[i]);
        } else if (arg == "--strength") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.strength = std::stof(argv[i]);
        } else if (arg == "--control-strength") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.control_strength = std::stof(argv[i]);
        } else if (arg == "-H" || arg == "--height") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.height = std::stoi(argv[i]);
        } else if (arg == "-W" || arg == "--width") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.width = std::stoi(argv[i]);
        } else if (arg == "--steps") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.sample_steps = std::stoi(argv[i]);
        } else if (arg == "--clip-skip") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.clip_skip = std::stoi(argv[i]);
        } else if (arg == "--vae-tiling") {
            params.vae_tiling = true;
        } else if (arg == "--control-net-cpu") {
            params.control_net_cpu = true;
        } else if (arg == "--canny") {
            params.canny_preprocess = true;
        } else if (arg == "-b" || arg == "--batch-count") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.batch_count = std::stoi(argv[i]);
        } else if (arg == "--rng") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            std::string rng_type_str = argv[i];
            if (rng_type_str == "std_default") {
                params.rng_type = STD_DEFAULT_RNG;
            } else if (rng_type_str == "cuda") {
                params.rng_type = CUDA_RNG;
            } else {
                invalid_arg = true;
                break;
            }
        } else if (arg == "--schedule") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            // match against schedule_str; index == enum value
            const char* schedule_selected = argv[i];
            int schedule_found            = -1;
            for (int d = 0; d < N_SCHEDULES; d++) {
                if (!strcmp(schedule_selected, schedule_str[d])) {
                    schedule_found = d;
                }
            }
            if (schedule_found == -1) {
                invalid_arg = true;
                break;
            }
            params.schedule = (schedule_t)schedule_found;
        } else if (arg == "-s" || arg == "--seed") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            params.seed = std::stoll(argv[i]);
        } else if (arg == "--sampling-method") {
            if (++i >= argc) {
                invalid_arg = true;
                break;
            }
            // match against sample_method_str; index == enum value
            const char* sample_method_selected = argv[i];
            int sample_method_found            = -1;
            for (int m = 0; m < N_SAMPLE_METHODS; m++) {
                if (!strcmp(sample_method_selected, sample_method_str[m])) {
                    sample_method_found = m;
                }
            }
            if (sample_method_found == -1) {
                invalid_arg = true;
                break;
            }
            params.sample_method = (sample_method_t)sample_method_found;
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv);
            exit(0);
        } else if (arg == "-v" || arg == "--verbose") {
            params.verbose = true;
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            print_usage(argc, argv);
            exit(1);
        }
    }
    if (invalid_arg) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv);
        exit(1);
    }
    // post-parse defaults and validation
    if (params.n_threads <= 0) {
        params.n_threads = get_num_physical_cores();
    }

    // a prompt is required except when converting weights or running img2vid
    if (params.mode != CONVERT && params.mode != IMG2VID && params.prompt.length() == 0) {
        fprintf(stderr, "error: the following arguments are required: prompt\n");
        print_usage(argc, argv);
        exit(1);
    }

    if (params.model_path.length() == 0) {
        fprintf(stderr, "error: the following arguments are required: model_path\n");
        print_usage(argc, argv);
        exit(1);
    }

    if ((params.mode == IMG2IMG || params.mode == IMG2VID) && params.input_path.length() == 0) {
        fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
        print_usage(argc, argv);
        exit(1);
    }

    if (params.output_path.length() == 0) {
        fprintf(stderr, "error: the following arguments are required: output_path\n");
        print_usage(argc, argv);
        exit(1);
    }

    if (params.width <= 0 || params.width % 64 != 0) {
        fprintf(stderr, "error: the width must be a multiple of 64\n");
        exit(1);
    }

    if (params.height <= 0 || params.height % 64 != 0) {
        fprintf(stderr, "error: the height must be a multiple of 64\n");
        exit(1);
    }

    if (params.sample_steps <= 0) {
        fprintf(stderr, "error: the sample_steps must be greater than 0\n");
        exit(1);
    }

    if (params.strength < 0.f || params.strength > 1.f) {
        fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
        exit(1);
    }

    // seed < 0 requests a random seed
    if (params.seed < 0) {
        srand((int)time(NULL));
        params.seed = rand();
    }

    // converting weights: default output name is .gguf rather than .png
    if (params.mode == CONVERT) {
        if (params.output_path == "output.png") {
            params.output_path = "output.gguf";
        }
    }
}
|
||||||
|
|
||||||
|
// Returns the file-name component of `path`: everything after the last path
// separator ('/' or '\\'), or the whole string if no separator is present.
// The original looked for '/' first and only fell back to '\\', so a mixed
// path like "a/b\\c" wrongly returned "b\\c"; scanning both separators at
// once with find_last_of("/\\") fixes that and is otherwise equivalent.
static std::string sd_basename(const std::string& path) {
    size_t pos = path.find_last_of("/\\");
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    return path;
}
|
||||||
|
|
||||||
|
// Builds the A1111-style parameter string embedded in output images:
// prompt, optional negative prompt, then the generation settings.
std::string get_image_params(SDParams params, int64_t seed) {
    std::string result = params.prompt + "\n";
    if (!params.negative_prompt.empty()) {
        result += "Negative prompt: " + params.negative_prompt + "\n";
    }
    result.append("Steps: " + std::to_string(params.sample_steps) + ", ");
    result.append("CFG scale: " + std::to_string(params.cfg_scale) + ", ");
    result.append("Seed: " + std::to_string(seed) + ", ");
    result.append("Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ");
    result.append("Model: " + sd_basename(params.model_path) + ", ");
    result.append("RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ");
    result.append("Sampler: " + std::string(sample_method_str[params.sample_method]));
    if (params.schedule == KARRAS) {
        result += " karras";
    }
    result += ", ";
    result += "Version: stable-diffusion.cpp";
    return result;
}
|
||||||
|
|
||||||
|
void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
|
||||||
|
SDParams* params = (SDParams*)data;
|
||||||
|
if (!params->verbose && level <= SD_LOG_DEBUG) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (level <= SD_LOG_INFO) {
|
||||||
|
fputs(log, stdout);
|
||||||
|
fflush(stdout);
|
||||||
|
} else {
|
||||||
|
fputs(log, stderr);
|
||||||
|
fflush(stderr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// CLI entry point: parse arguments, then run one of the modes
// (CONVERT / TXT2IMG / IMG2IMG / IMG2VID), optionally upscale, and save PNGs.
int main(int argc, const char* argv[]) {
    SDParams params;
    parse_args(argc, argv, params);

    // Route library logging through sd_log_cb; params supplies the verbosity flag.
    sd_set_log_callback(sd_log_cb, (void*)&params);

    if (params.verbose) {
        print_params(params);
        printf("%s", sd_get_system_info());
    }

    // CONVERT mode: rewrite the checkpoint as gguf and exit — no generation.
    if (params.mode == CONVERT) {
        bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype);
        if (!success) {
            fprintf(stderr,
                    "convert '%s'/'%s' to '%s' failed\n",
                    params.model_path.c_str(),
                    params.vae_path.c_str(),
                    params.output_path.c_str());
            return 1;
        } else {
            printf("convert '%s'/'%s' to '%s' success\n",
                   params.model_path.c_str(),
                   params.vae_path.c_str(),
                   params.output_path.c_str());
            return 0;
        }
    }

    // img2vid is refused outright here, so the IMG2VID branches below are
    // currently unreachable dead code kept for when SVD is fixed.
    if (params.mode == IMG2VID) {
        fprintf(stderr, "SVD support is broken, do not use it!!!\n");
        return 1;
    }

    bool vae_decode_only        = true;
    uint8_t* input_image_buffer = NULL;
    if (params.mode == IMG2IMG || params.mode == IMG2VID) {
        vae_decode_only = false;  // img2img also needs the VAE encoder

        // Load the init image; stbi overwrites params.width/height with the
        // actual image dimensions (channels forced to 3).
        int c              = 0;
        input_image_buffer = stbi_load(params.input_path.c_str(), &params.width, &params.height, &c, 3);
        if (input_image_buffer == NULL) {
            fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str());
            return 1;
        }
        if (c != 3) {
            fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c);
            free(input_image_buffer);
            return 1;
        }
        // Latent-space downscaling requires dimensions divisible by 64.
        if (params.width <= 0 || params.width % 64 != 0) {
            fprintf(stderr, "error: the width of image must be a multiple of 64\n");
            free(input_image_buffer);
            return 1;
        }
        if (params.height <= 0 || params.height % 64 != 0) {
            fprintf(stderr, "error: the height of image must be a multiple of 64\n");
            free(input_image_buffer);
            return 1;
        }
    }

    // Load the full pipeline (model + optional VAE/TAESD/ControlNet/LoRA/embeddings).
    sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
                                  params.vae_path.c_str(),
                                  params.taesd_path.c_str(),
                                  params.controlnet_path.c_str(),
                                  params.lora_model_dir.c_str(),
                                  params.embeddings_path.c_str(),
                                  vae_decode_only,
                                  params.vae_tiling,
                                  true,
                                  params.n_threads,
                                  params.wtype,
                                  params.rng_type,
                                  params.schedule,
                                  params.control_net_cpu);

    if (sd_ctx == NULL) {
        printf("new_sd_ctx_t failed\n");
        return 1;
    }

    sd_image_t* results;
    if (params.mode == TXT2IMG) {
        sd_image_t* control_image = NULL;
        if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) {
            // NOTE(review): this also overwrites params.width/height with the
            // control image size, and neither control_image nor its buffer is
            // freed before exit — confirm intentional (process exits shortly).
            int c              = 0;
            input_image_buffer = stbi_load(params.control_image_path.c_str(), &params.width, &params.height, &c, 3);
            if (input_image_buffer == NULL) {
                fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str());
                return 1;
            }
            control_image = new sd_image_t{(uint32_t)params.width,
                                           (uint32_t)params.height,
                                           3,
                                           input_image_buffer};
            if (params.canny_preprocess) {  // apply preprocessor
                // preprocess_canny frees the old buffer and returns a new one.
                control_image->data = preprocess_canny(control_image->data,
                                                       control_image->width,
                                                       control_image->height,
                                                       0.08f,
                                                       0.08f,
                                                       0.8f,
                                                       1.0f,
                                                       false);
            }
        }
        results = txt2img(sd_ctx,
                          params.prompt.c_str(),
                          params.negative_prompt.c_str(),
                          params.clip_skip,
                          params.cfg_scale,
                          params.width,
                          params.height,
                          params.sample_method,
                          params.sample_steps,
                          params.seed,
                          params.batch_count,
                          control_image,
                          params.control_strength);
    } else {
        // IMG2IMG / IMG2VID: wrap the already-loaded init image buffer.
        sd_image_t input_image = {(uint32_t)params.width,
                                  (uint32_t)params.height,
                                  3,
                                  input_image_buffer};

        if (params.mode == IMG2VID) {
            results = img2vid(sd_ctx,
                              input_image,
                              params.width,
                              params.height,
                              params.video_frames,
                              params.motion_bucket_id,
                              params.fps,
                              params.augmentation_level,
                              params.min_cfg,
                              params.cfg_scale,
                              params.sample_method,
                              params.sample_steps,
                              params.strength,
                              params.seed);
            if (results == NULL) {
                printf("generate failed\n");
                free_sd_ctx(sd_ctx);
                return 1;
            }
            // Save each frame as <output-stem>.png, <output-stem>_2.png, ...
            size_t last            = params.output_path.find_last_of(".");
            std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
            for (int i = 0; i < params.video_frames; i++) {
                if (results[i].data == NULL) {
                    continue;
                }
                std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
                stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
                               results[i].data, 0, get_image_params(params, params.seed + i).c_str());
                printf("save result image to '%s'\n", final_image_path.c_str());
                free(results[i].data);
                results[i].data = NULL;
            }
            free(results);
            free_sd_ctx(sd_ctx);
            return 0;  // video path returns early; common save loop below is for images
        } else {
            results = img2img(sd_ctx,
                              input_image,
                              params.prompt.c_str(),
                              params.negative_prompt.c_str(),
                              params.clip_skip,
                              params.cfg_scale,
                              params.width,
                              params.height,
                              params.sample_method,
                              params.sample_steps,
                              params.strength,
                              params.seed,
                              params.batch_count);
        }
    }

    if (results == NULL) {
        printf("generate failed\n");
        free_sd_ctx(sd_ctx);
        return 1;
    }

    // Optional ESRGAN upscaling pass, applied `upscale_repeats` times per image.
    int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
    if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) {
        upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
                                                        params.n_threads,
                                                        params.wtype);

        if (upscaler_ctx == NULL) {
            printf("new_upscaler_ctx failed\n");
        } else {
            for (int i = 0; i < params.batch_count; i++) {
                if (results[i].data == NULL) {
                    continue;
                }
                sd_image_t current_image = results[i];
                for (int u = 0; u < params.upscale_repeats; ++u) {
                    sd_image_t upscaled_image = upscale(upscaler_ctx, current_image, upscale_factor);
                    if (upscaled_image.data == NULL) {
                        printf("upscale failed\n");
                        break;
                    }
                    // The previous stage's buffer is no longer needed.
                    free(current_image.data);
                    current_image = upscaled_image;
                }
                results[i] = current_image;  // Set the final upscaled image as the result
            }
        }
    }

    // Save batch results as <output-stem>.png, <output-stem>_2.png, ... with
    // generation metadata embedded.
    size_t last            = params.output_path.find_last_of(".");
    std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
    for (int i = 0; i < params.batch_count; i++) {
        if (results[i].data == NULL) {
            continue;
        }
        std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
        stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
                       results[i].data, 0, get_image_params(params, params.seed + i).c_str());
        printf("save result image to '%s'\n", final_image_path.c_str());
        free(results[i].data);
        results[i].data = NULL;
    }
    free(results);
    free_sd_ctx(sd_ctx);

    return 0;
}
|
1609
otherarch/sdcpp/model.cpp
Normal file
1609
otherarch/sdcpp/model.cpp
Normal file
File diff suppressed because it is too large
Load diff
154
otherarch/sdcpp/model.h
Normal file
154
otherarch/sdcpp/model.h
Normal file
|
@ -0,0 +1,154 @@
|
||||||
|
#ifndef __MODEL_H__
|
||||||
|
#define __MODEL_H__
|
||||||
|
|
||||||
|
#include <functional>
|
||||||
|
#include <map>
|
||||||
|
#include <memory>
|
||||||
|
#include <set>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "ggml-backend.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "json.hpp"
|
||||||
|
#include "zip.h"
|
||||||
|
|
||||||
|
#define SD_MAX_DIMS 5
|
||||||
|
|
||||||
|
// Stable Diffusion model family of a loaded checkpoint, as reported by
// ModelLoader::get_sd_version().
enum SDVersion {
    VERSION_1_x,    // SD 1.x
    VERSION_2_x,    // SD 2.x
    VERSION_XL,     // SDXL
    VERSION_SVD,    // Stable Video Diffusion
    VERSION_COUNT,  // number of entries above; not a real version
};
|
||||||
|
|
||||||
|
// Metadata describing where a single tensor lives on disk (name, dtype, shape
// and byte offset) before its data is actually read. ModelLoader collects one
// of these per tensor while scanning a checkpoint.
struct TensorStorage {
    std::string name;
    ggml_type type = GGML_TYPE_F32;  // storage dtype on disk
    bool is_bf16 = false;            // bf16 is stored as such but counted as f32 (see nbytes_to_read)
    int64_t ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};  // dimensions; unused dims stay 1
    int n_dims = 0;

    size_t file_index = 0;   // index into the loader's file list
    int index_in_zip = -1;   // >= 0 means stored in a zip file
    size_t offset = 0;       // offset in file

    TensorStorage() = default;

    // Copies the first n_dims entries of `ne`; remaining dims keep their
    // default of 1.
    TensorStorage(const std::string& name, ggml_type type, int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
        : name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
        for (int i = 0; i < n_dims; i++) {
            this->ne[i] = ne[i];
        }
    }

    // Total element count (product over all SD_MAX_DIMS; trailing 1s are harmless).
    int64_t nelements() const {
        int64_t n = 1;
        for (int i = 0; i < SD_MAX_DIMS; i++) {
            n *= ne[i];
        }
        return n;
    }

    // Size in bytes of the tensor at `type`, accounting for block quantization.
    int64_t nbytes() const {
        return nelements() * ggml_type_size(type) / ggml_blck_size(type);
    }

    // Bytes actually occupied on disk: bf16 data is half the size of the f32
    // it will be expanded into.
    int64_t nbytes_to_read() const {
        if (is_bf16) {
            return nbytes() / 2;
        } else {
            return nbytes();
        }
    }

    // Reshape a 2D tensor (a, b) into 4D (1, 1, a, b) in place; no-op for
    // other ranks.
    void unsqueeze() {
        if (n_dims == 2) {
            n_dims = 4;
            ne[3] = ne[1];
            ne[2] = ne[0];
            ne[1] = 1;
            ne[0] = 1;
        }
    }

    // Split this storage into `n` equal chunks along the outermost logical
    // dimension (reverse_ne is applied so ne[0] is that dimension, then
    // restored). NOTE(review): assumes the dimension and byte size divide
    // evenly by n — confirm at call sites.
    std::vector<TensorStorage> chunk(size_t n) {
        std::vector<TensorStorage> chunks;
        size_t chunk_size = nbytes_to_read() / n;
        // printf("%d/%d\n", chunk_size, nbytes_to_read());
        reverse_ne();
        for (int i = 0; i < n; i++) {
            TensorStorage chunk_i = *this;
            chunk_i.ne[0] = ne[0] / n;
            chunk_i.offset = offset + i * chunk_size;
            chunk_i.reverse_ne();
            chunks.push_back(chunk_i);
        }
        reverse_ne();
        return chunks;
    }

    // Reverse the order of the first n_dims dimensions in place (converts
    // between row-major file order and ggml's dimension order).
    void reverse_ne() {
        int64_t new_ne[SD_MAX_DIMS] = {1, 1, 1, 1, 1};
        for (int i = 0; i < n_dims; i++) {
            new_ne[i] = ne[n_dims - 1 - i];
        }
        for (int i = 0; i < n_dims; i++) {
            ne[i] = new_ne[i];
        }
    }

    // Human-readable summary: "name | type | n_dims [d0, d1, d2, d3, d4]".
    std::string to_string() const {
        std::stringstream ss;
        const char* type_name = ggml_type_name(type);
        if (is_bf16) {
            type_name = "bf16";  // ggml has no bf16 type name here; report it explicitly
        }
        ss << name << " | " << type_name << " | ";
        ss << n_dims << " [";
        for (int i = 0; i < SD_MAX_DIMS; i++) {
            ss << ne[i];
            if (i != SD_MAX_DIMS - 1) {
                ss << ", ";
            }
        }
        ss << "]";
        return ss.str();
    }
};
|
||||||
|
|
||||||
|
// Callback invoked for each tensor discovered during loading; the callee
// supplies the destination ggml tensor through the out-parameter.
// NOTE(review): presumably returning false aborts the load — confirm in model.cpp.
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;

// Reads model checkpoints in several on-disk formats (gguf, safetensors,
// pytorch .ckpt/pickle, diffusers) and exposes a uniform tensor-loading API.
// Implementation lives in model.cpp.
class ModelLoader {
protected:
    std::vector<std::string> file_paths_;       // path(s) of the opened checkpoint file(s)
    std::vector<TensorStorage> tensor_storages; // metadata for every tensor found

    // Parse a pickled stream (pytorch checkpoint) read from a zip archive.
    bool parse_data_pkl(uint8_t* buffer,
                        size_t buffer_size,
                        zip_t* zip,
                        std::string dir,
                        size_t file_index,
                        const std::string& prefix);

    // Per-format initializers; `prefix` is prepended to tensor names.
    bool init_from_gguf_file(const std::string& file_path, const std::string& prefix = "");
    bool init_from_safetensors_file(const std::string& file_path, const std::string& prefix = "");
    bool init_from_ckpt_file(const std::string& file_path, const std::string& prefix = "");
    bool init_from_diffusers_file(const std::string& file_path, const std::string& prefix = "");

public:
    // Entry point: scan `file_path` and populate tensor_storages (format
    // dispatch happens in model.cpp).
    bool init_from_file(const std::string& file_path, const std::string& prefix = "");
    SDVersion get_sd_version();
    ggml_type get_sd_wtype();
    std::string load_merges();
    // Stream every tensor's data through the callback / into the given map.
    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);
    bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                      ggml_backend_t backend,
                      std::set<std::string> ignore_tensors = {});
    // Re-serialize the model as gguf, converting weights to `type`.
    bool save_to_gguf_file(const std::string& file_path, ggml_type type);
    // Estimated memory needed for the parameters at `type` (GGML_TYPE_COUNT = keep stored types).
    int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
    ~ModelLoader() = default;
};
|
||||||
|
#endif // __MODEL_H__
|
227
otherarch/sdcpp/preprocessing.hpp
Normal file
227
otherarch/sdcpp/preprocessing.hpp
Normal file
|
@ -0,0 +1,227 @@
|
||||||
|
#ifndef __PREPROCESSING_HPP__
|
||||||
|
#define __PREPROCESSING_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#define M_PI_ 3.14159265358979323846
|
||||||
|
|
||||||
|
// Convolve `input` with `kernel` (stride 1, given symmetric padding) and copy
// the result into `output`. The f32 kernel is converted to f16 first —
// presumably because ggml_conv_2d expects an f16 kernel. The graph is built
// and executed synchronously in a throwaway ggml context.
void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
    struct ggml_init_params params;
    params.mem_size = 20 * 1024 * 1024;  // 20 MiB scratch for tensors + graph
    params.mem_buffer = NULL;
    params.no_alloc = false;
    struct ggml_context* ctx0 = ggml_init(params);
    struct ggml_tensor* kernel_fp16 = ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, kernel->ne[0], kernel->ne[1], 1, 1);
    ggml_fp32_to_fp16_row((float*)kernel->data, (ggml_fp16_t*)kernel_fp16->data, ggml_nelements(kernel));
    ggml_tensor* h = ggml_conv_2d(ctx0, kernel_fp16, input, 1, 1, padding, padding, 1, 1);
    ggml_cgraph* gf = ggml_new_graph(ctx0);
    // Copy into caller-owned `output` so the result survives ggml_free(ctx0).
    ggml_build_forward_expand(gf, ggml_cpy(ctx0, h, output));
    ggml_graph_compute_with_ctx(ctx0, gf, 1);  // final arg: 1 thread
    ggml_free(ctx0);
}
|
||||||
|
|
||||||
|
// Fill a square kernel tensor with a 2D Gaussian (sigma = 1.4) centred on the
// middle cell, used for smoothing before edge detection.
// NOTE(review): gx is derived from the y index and gy from the x index — the
// Gaussian is radially symmetric so the output is unaffected, but the naming
// is swapped.
void gaussian_kernel(struct ggml_tensor* kernel) {
    int ks_mid = kernel->ne[0] / 2;  // centre index; assumes odd kernel size
    float sigma = 1.4f;
    float normal = 1.f / (2.0f * M_PI_ * powf(sigma, 2.0f));  // normalization factor
    for (int y = 0; y < kernel->ne[0]; y++) {
        float gx = -ks_mid + y;
        for (int x = 0; x < kernel->ne[1]; x++) {
            float gy = -ks_mid + x;
            float k_ = expf(-((gx * gx + gy * gy) / (2.0f * powf(sigma, 2.0f)))) * normal;
            ggml_tensor_set_f32(kernel, k_, x, y);
        }
    }
}
|
||||||
|
|
||||||
|
void grayscale(struct ggml_tensor* rgb_img, struct ggml_tensor* grayscale) {
|
||||||
|
for (int iy = 0; iy < rgb_img->ne[1]; iy++) {
|
||||||
|
for (int ix = 0; ix < rgb_img->ne[0]; ix++) {
|
||||||
|
float r = ggml_tensor_get_f32(rgb_img, ix, iy);
|
||||||
|
float g = ggml_tensor_get_f32(rgb_img, ix, iy, 1);
|
||||||
|
float b = ggml_tensor_get_f32(rgb_img, ix, iy, 2);
|
||||||
|
float gray = 0.2989f * r + 0.5870f * g + 0.1140f * b;
|
||||||
|
ggml_tensor_set_f32(grayscale, gray, ix, iy);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void prop_hypot(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
|
||||||
|
int n_elements = ggml_nelements(h);
|
||||||
|
float* dx = (float*)x->data;
|
||||||
|
float* dy = (float*)y->data;
|
||||||
|
float* dh = (float*)h->data;
|
||||||
|
for (int i = 0; i < n_elements; i++) {
|
||||||
|
dh[i] = sqrtf(dx[i] * dx[i] + dy[i] * dy[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void prop_arctan2(struct ggml_tensor* x, struct ggml_tensor* y, struct ggml_tensor* h) {
|
||||||
|
int n_elements = ggml_nelements(h);
|
||||||
|
float* dx = (float*)x->data;
|
||||||
|
float* dy = (float*)y->data;
|
||||||
|
float* dh = (float*)h->data;
|
||||||
|
for (int i = 0; i < n_elements; i++) {
|
||||||
|
dh[i] = atan2f(dy[i], dx[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void normalize_tensor(struct ggml_tensor* g) {
|
||||||
|
int n_elements = ggml_nelements(g);
|
||||||
|
float* dg = (float*)g->data;
|
||||||
|
float max = -INFINITY;
|
||||||
|
for (int i = 0; i < n_elements; i++) {
|
||||||
|
max = dg[i] > max ? dg[i] : max;
|
||||||
|
}
|
||||||
|
max = 1.0f / max;
|
||||||
|
for (int i = 0; i < n_elements; i++) {
|
||||||
|
dg[i] *= max;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void non_max_supression(struct ggml_tensor* result, struct ggml_tensor* G, struct ggml_tensor* D) {
|
||||||
|
for (int iy = 1; iy < result->ne[1] - 1; iy++) {
|
||||||
|
for (int ix = 1; ix < result->ne[0] - 1; ix++) {
|
||||||
|
float angle = ggml_tensor_get_f32(D, ix, iy) * 180.0f / M_PI_;
|
||||||
|
angle = angle < 0.0f ? angle += 180.0f : angle;
|
||||||
|
float q = 1.0f;
|
||||||
|
float r = 1.0f;
|
||||||
|
|
||||||
|
// angle 0
|
||||||
|
if ((0 >= angle && angle < 22.5f) || (157.5f >= angle && angle <= 180)) {
|
||||||
|
q = ggml_tensor_get_f32(G, ix, iy + 1);
|
||||||
|
r = ggml_tensor_get_f32(G, ix, iy - 1);
|
||||||
|
}
|
||||||
|
// angle 45
|
||||||
|
else if (22.5f >= angle && angle < 67.5f) {
|
||||||
|
q = ggml_tensor_get_f32(G, ix + 1, iy - 1);
|
||||||
|
r = ggml_tensor_get_f32(G, ix - 1, iy + 1);
|
||||||
|
}
|
||||||
|
// angle 90
|
||||||
|
else if (67.5f >= angle && angle < 112.5) {
|
||||||
|
q = ggml_tensor_get_f32(G, ix + 1, iy);
|
||||||
|
r = ggml_tensor_get_f32(G, ix - 1, iy);
|
||||||
|
}
|
||||||
|
// angle 135
|
||||||
|
else if (112.5 >= angle && angle < 157.5f) {
|
||||||
|
q = ggml_tensor_get_f32(G, ix - 1, iy - 1);
|
||||||
|
r = ggml_tensor_get_f32(G, ix + 1, iy + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
float cur = ggml_tensor_get_f32(G, ix, iy);
|
||||||
|
if ((cur >= q) && (cur >= r)) {
|
||||||
|
ggml_tensor_set_f32(result, cur, ix, iy);
|
||||||
|
} else {
|
||||||
|
ggml_tensor_set_f32(result, 0.0f, ix, iy);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Canny double-threshold + hysteresis, in place on `img`:
//  1. classify pixels as strong (>= max*high_threshold) or weak
//     (>= high*low_threshold), leaving the rest unchanged;
//  2. zero a 3-pixel border;
//  3. promote weak pixels adjacent to a strong pixel, drop the rest.
void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float low_threshold, float weak, float strong) {
    int n_elements = ggml_nelements(img);
    float* imd = (float*)img->data;
    float max = -INFINITY;
    for (int i = 0; i < n_elements; i++) {
        max = imd[i] > max ? imd[i] : max;
    }
    // Thresholds are relative: ht to the image max, lt to ht.
    float ht = max * high_threshold;
    float lt = ht * low_threshold;
    for (int i = 0; i < n_elements; i++) {
        float img_v = imd[i];
        if (img_v >= ht) {  // strong pixel
            imd[i] = strong;
        } else if (img_v <= ht && img_v >= lt) {  // weak pixel
            imd[i] = weak;
        }
    }

    // Zero a 3-pixel border; interior pixels are rewritten with their own
    // value (a no-op).
    for (int iy = 0; iy < img->ne[1]; iy++) {
        for (int ix = 0; ix < img->ne[0]; ix++) {
            if (ix >= 3 && ix <= img->ne[0] - 3 && iy >= 3 && iy <= img->ne[1] - 3) {
                ggml_tensor_set_f32(img, ggml_tensor_get_f32(img, ix, iy), ix, iy);
            } else {
                ggml_tensor_set_f32(img, 0.0f, ix, iy);
            }
        }
    }

    // hysteresis
    // NOTE(review): only 6 of the 8 neighbours are checked — (ix+1, iy+1) and
    // (ix-1, iy+1) are missing; confirm whether that is intentional.
    for (int iy = 1; iy < img->ne[1] - 1; iy++) {
        for (int ix = 1; ix < img->ne[0] - 1; ix++) {
            float imd_v = ggml_tensor_get_f32(img, ix, iy);
            if (imd_v == weak) {
                if (ggml_tensor_get_f32(img, ix + 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix + 1, iy) == strong ||
                    ggml_tensor_get_f32(img, ix, iy - 1) == strong || ggml_tensor_get_f32(img, ix, iy + 1) == strong ||
                    ggml_tensor_get_f32(img, ix - 1, iy - 1) == strong || ggml_tensor_get_f32(img, ix - 1, iy) == strong) {
                    ggml_tensor_set_f32(img, strong, ix, iy);
                } else {
                    ggml_tensor_set_f32(img, 0.0f, ix, iy);
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
// Full Canny edge-detection preprocessor for ControlNet input.
// Takes ownership of `img` (an RGB8 buffer, width*height*3), frees it, and
// returns a newly allocated RGB8 edge image (grayscale replicated to 3
// channels, optionally inverted). Returns NULL if the ggml scratch context
// cannot be created.
// NOTE(review): on that failure path `img` is NOT freed — caller beware.
uint8_t* preprocess_canny(uint8_t* img, int width, int height, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MiB scratch
    params.mem_buffer = NULL;
    params.no_alloc = false;
    struct ggml_context* work_ctx = ggml_init(params);

    if (!work_ctx) {
        LOG_ERROR("ggml_init() failed");
        return NULL;
    }

    // Sobel kernels for horizontal / vertical gradients.
    float kX[9] = {
        -1, 0, 1,
        -2, 0, 2,
        -1, 0, 1};

    float kY[9] = {
        1, 2, 1,
        0, 0, 0,
        -1, -2, -1};

    // generate kernel (5x5 Gaussian for the initial smoothing pass)
    int kernel_size = 5;
    struct ggml_tensor* gkernel = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, kernel_size, kernel_size, 1, 1);
    struct ggml_tensor* sf_kx = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
    memcpy(sf_kx->data, kX, ggml_nbytes(sf_kx));
    struct ggml_tensor* sf_ky = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 3, 3, 1, 1);
    memcpy(sf_ky->data, kY, ggml_nbytes(sf_ky));
    gaussian_kernel(gkernel);
    struct ggml_tensor* image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
    struct ggml_tensor* image_gray = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
    struct ggml_tensor* iX = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* iY = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* G = ggml_dup_tensor(work_ctx, image_gray);
    struct ggml_tensor* tetha = ggml_dup_tensor(work_ctx, image_gray);
    // Pipeline: RGB -> gray -> Gaussian blur -> Sobel gradients ->
    // magnitude/direction -> non-max suppression -> hysteresis thresholding.
    sd_image_to_tensor(img, image);
    grayscale(image, image_gray);
    convolve(image_gray, image_gray, gkernel, 2);
    convolve(image_gray, iX, sf_kx, 1);
    convolve(image_gray, iY, sf_ky, 1);
    prop_hypot(iX, iY, G);
    normalize_tensor(G);
    prop_arctan2(iX, iY, tetha);
    non_max_supression(image_gray, G, tetha);
    threshold_hystersis(image_gray, high_threshold, low_threshold, weak, strong);
    // to RGB channels: replicate the edge map into all three channels,
    // optionally inverted.
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
            float gray = ggml_tensor_get_f32(image_gray, ix, iy);
            gray = inverse ? 1.0f - gray : gray;
            ggml_tensor_set_f32(image, gray, ix, iy);
            ggml_tensor_set_f32(image, gray, ix, iy, 1);
            ggml_tensor_set_f32(image, gray, ix, iy, 2);
        }
    }
    free(img);  // input buffer is consumed; caller must use the returned one
    uint8_t* output = sd_tensor_to_image(image);
    ggml_free(work_ctx);
    return output;
}
|
||||||
|
|
||||||
|
#endif // __PREPROCESSING_HPP__
|
35
otherarch/sdcpp/rng.hpp
Normal file
35
otherarch/sdcpp/rng.hpp
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
#ifndef __RNG_H__
|
||||||
|
#define __RNG_H__
|
||||||
|
|
||||||
|
#include <random>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// Abstract interface for the noise generators used during sampling: seedable,
// produces n samples from a standard normal distribution per call.
class RNG {
public:
    // Fix: a polymorphic base class needs a virtual destructor so deleting a
    // derived generator through an RNG* is well-defined.
    virtual ~RNG() = default;
    virtual void manual_seed(uint64_t seed) = 0;
    virtual std::vector<float> randn(uint32_t n) = 0;
};

// RNG backed by std::default_random_engine / std::normal_distribution.
// Sequences are deterministic for a given seed (on a given standard library).
class STDDefaultRNG : public RNG {
private:
    std::default_random_engine generator;

public:
    void manual_seed(uint64_t seed) {
        // The engine takes an unsigned int; upper seed bits are discarded.
        generator.seed((unsigned int)seed);
    }

    // Return n samples drawn from N(0, 1).
    std::vector<float> randn(uint32_t n) {
        std::vector<float> result;
        float mean = 0.0;
        float stddev = 1.0;
        std::normal_distribution<float> distribution(mean, stddev);
        for (uint32_t i = 0; i < n; i++) {
            float random_number = distribution(generator);
            result.push_back(random_number);
        }
        return result;
    }
};
|
||||||
|
|
||||||
|
#endif // __RNG_H__
|
125
otherarch/sdcpp/rng_philox.hpp
Normal file
125
otherarch/sdcpp/rng_philox.hpp
Normal file
|
@ -0,0 +1,125 @@
|
||||||
|
#ifndef __RNG_PHILOX_H__
|
||||||
|
#define __RNG_PHILOX_H__
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "rng.hpp"
|
||||||
|
|
||||||
|
// RNG imitating torch CUDA randn on CPU, so CPU runs can reproduce seeds from
// GPU-based Stable Diffusion tools.
// Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
class PhiloxRNG : public RNG {
private:
    uint64_t seed;    // current seed (forms the Philox key)
    uint32_t offset;  // incremented once per randn() call; part of the counter

private:
    // Philox 4x32 round multipliers and Weyl key increments (standard constants).
    std::vector<uint32_t> philox_m = {0xD2511F53, 0xCD9E8D57};
    std::vector<uint32_t> philox_w = {0x9E3779B9, 0xBB67AE85};
    float two_pow32_inv = 2.3283064e-10f;                    // 1 / 2^32
    float two_pow32_inv_2pi = 2.3283064e-10f * 6.2831855f;   // 2*pi / 2^32

    // Split a 64-bit value into {low 32 bits, high 32 bits}.
    std::vector<uint32_t> uint32(uint64_t x) {
        std::vector<uint32_t> result(2);
        result[0] = static_cast<uint32_t>(x & 0xFFFFFFFF);
        result[1] = static_cast<uint32_t>(x >> 32);
        return result;
    }

    // Vectorized overload: split N 64-bit values into a 2xN array of 32-bit halves.
    std::vector<std::vector<uint32_t>> uint32(const std::vector<uint64_t>& x) {
        uint32_t N = (uint32_t)x.size();
        std::vector<std::vector<uint32_t>> result(2, std::vector<uint32_t>(N));

        for (uint32_t i = 0; i < N; ++i) {
            result[0][i] = static_cast<uint32_t>(x[i] & 0xFFFFFFFF);
            result[1][i] = static_cast<uint32_t>(x[i] >> 32);
        }

        return result;
    }

    // A single round of the Philox 4x32 random number generator.
    void philox4_round(std::vector<std::vector<uint32_t>>& counter,
                       const std::vector<std::vector<uint32_t>>& key) {
        uint32_t N = (uint32_t)counter[0].size();
        for (uint32_t i = 0; i < N; i++) {
            std::vector<uint32_t> v1 = uint32(static_cast<uint64_t>(counter[0][i]) * static_cast<uint64_t>(philox_m[0]));
            std::vector<uint32_t> v2 = uint32(static_cast<uint64_t>(counter[2][i]) * static_cast<uint64_t>(philox_m[1]));

            counter[0][i] = v2[1] ^ counter[1][i] ^ key[0][i];
            counter[1][i] = v2[0];
            counter[2][i] = v1[1] ^ counter[3][i] ^ key[1][i];
            counter[3][i] = v1[0];
        }
    }

    // Generates 32-bit random numbers using the Philox 4x32 random number generator.
    // Parameters:
    //     counter : A 4xN array of 32-bit integers representing the counter values (offset into generation).
    //     key     : A 2xN array of 32-bit integers representing the key values (seed).
    //               Mutated in place: bumped by the Weyl constants each round.
    //     rounds  : The number of rounds to perform.
    // Returns:
    //     std::vector<std::vector<uint32_t>>: A 4xN array of 32-bit integers containing the generated random numbers.
    std::vector<std::vector<uint32_t>> philox4_32(std::vector<std::vector<uint32_t>>& counter,
                                                  std::vector<std::vector<uint32_t>>& key,
                                                  int rounds = 10) {
        uint32_t N = (uint32_t)counter[0].size();
        // rounds - 1 rounds with a key bump after each; final round has no bump.
        for (int i = 0; i < rounds - 1; ++i) {
            philox4_round(counter, key);

            for (uint32_t j = 0; j < N; ++j) {
                key[0][j] += philox_w[0];
                key[1][j] += philox_w[1];
            }
        }

        philox4_round(counter, key);
        return counter;
    }

    // Box-Muller transform: map two uniform 32-bit samples to one N(0,1)
    // sample (only the sine branch is used, matching the Python original).
    float box_muller(float x, float y) {
        float u = x * two_pow32_inv + two_pow32_inv / 2;
        float v = y * two_pow32_inv_2pi + two_pow32_inv_2pi / 2;

        float s = sqrt(-2.0f * log(u));

        float r1 = s * sin(v);
        return r1;
    }

public:
    PhiloxRNG(uint64_t seed = 0) {
        this->seed = seed;
        this->offset = 0;
    }

    // Reset to a new seed; restarts the counter stream.
    void manual_seed(uint64_t seed) {
        this->seed = seed;
        this->offset = 0;
    }

    // Return n samples from N(0, 1), deterministic in (seed, call index).
    std::vector<float> randn(uint32_t n) {
        // Counter row 0 carries the per-call offset, row 2 the lane index.
        std::vector<std::vector<uint32_t>> counter(4, std::vector<uint32_t>(n, 0));
        for (uint32_t i = 0; i < n; i++) {
            counter[0][i] = this->offset;
        }

        for (uint32_t i = 0; i < n; i++) {
            counter[2][i] = i;
        }
        this->offset += 1;  // next call produces a fresh stream

        // Key is the seed, replicated per lane and split into 32-bit halves.
        std::vector<uint64_t> key(n, this->seed);
        std::vector<std::vector<uint32_t>> key_uint32 = uint32(key);

        std::vector<std::vector<uint32_t>> g = philox4_32(counter, key_uint32);

        std::vector<float> result;
        for (uint32_t i = 0; i < n; ++i) {
            result.push_back(box_muller((float)g[0][i], (float)g[1][i]));
        }
        return result;
    }
};
|
||||||
|
|
||||||
|
#endif // __RNG_PHILOX_H__
|
1778
otherarch/sdcpp/stable-diffusion.cpp
Normal file
1778
otherarch/sdcpp/stable-diffusion.cpp
Normal file
File diff suppressed because it is too large
Load diff
193
otherarch/sdcpp/stable-diffusion.h
Normal file
193
otherarch/sdcpp/stable-diffusion.h
Normal file
|
@ -0,0 +1,193 @@
|
||||||
|
#ifndef __STABLE_DIFFUSION_H__
|
||||||
|
#define __STABLE_DIFFUSION_H__
|
||||||
|
|
||||||
|
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||||
|
#ifndef SD_BUILD_SHARED_LIB
|
||||||
|
#define SD_API
|
||||||
|
#else
|
||||||
|
#ifdef SD_BUILD_DLL
|
||||||
|
#define SD_API __declspec(dllexport)
|
||||||
|
#else
|
||||||
|
#define SD_API __declspec(dllimport)
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#if __GNUC__ >= 4
|
||||||
|
#define SD_API __attribute__((visibility("default")))
|
||||||
|
#else
|
||||||
|
#define SD_API
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
// Noise generator selection (see rng.hpp / rng_philox.hpp).
enum rng_type_t {
    STD_DEFAULT_RNG,  // std::default_random_engine based
    CUDA_RNG          // Philox, imitates torch CUDA randn
};

// Diffusion sampler selection.
enum sample_method_t {
    EULER_A,
    EULER,
    HEUN,
    DPM2,
    DPMPP2S_A,
    DPMPP2M,
    DPMPP2Mv2,
    LCM,
    N_SAMPLE_METHODS  // number of entries above; not a sampler
};

// Noise schedule selection.
enum schedule_t {
    DEFAULT,
    DISCRETE,
    KARRAS,
    N_SCHEDULES  // number of entries above; not a schedule
};

// same as enum ggml_type — keep numeric values in sync with ggml.h.
enum sd_type_t {
    SD_TYPE_F32 = 0,
    SD_TYPE_F16 = 1,
    SD_TYPE_Q4_0 = 2,
    SD_TYPE_Q4_1 = 3,
    // SD_TYPE_Q4_2 = 4, support has been removed
    // SD_TYPE_Q4_3 (5) support has been removed
    SD_TYPE_Q5_0 = 6,
    SD_TYPE_Q5_1 = 7,
    SD_TYPE_Q8_0 = 8,
    SD_TYPE_Q8_1 = 9,
    // k-quantizations
    SD_TYPE_Q2_K = 10,
    SD_TYPE_Q3_K = 11,
    SD_TYPE_Q4_K = 12,
    SD_TYPE_Q5_K = 13,
    SD_TYPE_Q6_K = 14,
    SD_TYPE_Q8_K = 15,
    SD_TYPE_IQ2_XXS = 16,
    SD_TYPE_IQ2_XS = 17,
    SD_TYPE_IQ3_XXS = 18,
    SD_TYPE_IQ1_S = 19,
    SD_TYPE_IQ4_NL = 20,
    SD_TYPE_I8,
    SD_TYPE_I16,
    SD_TYPE_I32,
    SD_TYPE_COUNT,
};

// Human-readable name of a weight type.
SD_API const char* sd_type_name(enum sd_type_t type);

// Severity passed to the log callback, lowest to highest.
enum sd_log_level_t {
    SD_LOG_DEBUG,
    SD_LOG_INFO,
    SD_LOG_WARN,
    SD_LOG_ERROR
};

// Log sink; `data` is the opaque pointer given to sd_set_log_callback.
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);

// Install a process-wide log callback.
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
SD_API int32_t get_num_physical_cores();
SD_API const char* sd_get_system_info();

// Raw interleaved 8-bit image buffer.
typedef struct {
    uint32_t width;
    uint32_t height;
    uint32_t channel;  // channels per pixel (3 = RGB)
    uint8_t* data;     // width*height*channel bytes; caller is expected to free()
} sd_image_t;

// Opaque handle to a loaded Stable Diffusion pipeline.
typedef struct sd_ctx_t sd_ctx_t;
|
||||||
|
|
||||||
|
SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
|
||||||
|
const char* vae_path,
|
||||||
|
const char* taesd_path,
|
||||||
|
const char* control_net_path_c_str,
|
||||||
|
const char* lora_model_dir,
|
||||||
|
const char* embed_dir_c_str,
|
||||||
|
bool vae_decode_only,
|
||||||
|
bool vae_tiling,
|
||||||
|
bool free_params_immediately,
|
||||||
|
int n_threads,
|
||||||
|
enum sd_type_t wtype,
|
||||||
|
enum rng_type_t rng_type,
|
||||||
|
enum schedule_t s,
|
||||||
|
bool keep_control_net_cpu);
|
||||||
|
|
||||||
|
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
|
||||||
|
|
||||||
|
SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
|
||||||
|
const char* prompt,
|
||||||
|
const char* negative_prompt,
|
||||||
|
int clip_skip,
|
||||||
|
float cfg_scale,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
enum sample_method_t sample_method,
|
||||||
|
int sample_steps,
|
||||||
|
int64_t seed,
|
||||||
|
int batch_count,
|
||||||
|
const sd_image_t* control_cond,
|
||||||
|
float control_strength);
|
||||||
|
|
||||||
|
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
|
||||||
|
sd_image_t init_image,
|
||||||
|
const char* prompt,
|
||||||
|
const char* negative_prompt,
|
||||||
|
int clip_skip,
|
||||||
|
float cfg_scale,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
enum sample_method_t sample_method,
|
||||||
|
int sample_steps,
|
||||||
|
float strength,
|
||||||
|
int64_t seed,
|
||||||
|
int batch_count);
|
||||||
|
|
||||||
|
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
|
||||||
|
sd_image_t init_image,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
int video_frames,
|
||||||
|
int motion_bucket_id,
|
||||||
|
int fps,
|
||||||
|
float augmentation_level,
|
||||||
|
float min_cfg,
|
||||||
|
float cfg_scale,
|
||||||
|
enum sample_method_t sample_method,
|
||||||
|
int sample_steps,
|
||||||
|
float strength,
|
||||||
|
int64_t seed);
|
||||||
|
|
||||||
|
typedef struct upscaler_ctx_t upscaler_ctx_t;
|
||||||
|
|
||||||
|
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
|
||||||
|
int n_threads,
|
||||||
|
enum sd_type_t wtype);
|
||||||
|
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
|
||||||
|
|
||||||
|
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
|
||||||
|
|
||||||
|
SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type);
|
||||||
|
|
||||||
|
SD_API uint8_t* preprocess_canny(uint8_t* img,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
float high_threshold,
|
||||||
|
float low_threshold,
|
||||||
|
float weak,
|
||||||
|
float strong,
|
||||||
|
bool inverse);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // __STABLE_DIFFUSION_H__
|
259
otherarch/sdcpp/tae.hpp
Normal file
259
otherarch/sdcpp/tae.hpp
Normal file
|
@ -0,0 +1,259 @@
|
||||||
|
#ifndef __TAE_HPP__
|
||||||
|
#define __TAE_HPP__
|
||||||
|
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
#include "model.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
=================================== TinyAutoEncoder ===================================
|
||||||
|
References:
|
||||||
|
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoders/vae.py
|
||||||
|
https://github.com/madebyollin/taesd/blob/main/taesd.py
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
class TAEBlock : public UnaryBlock {
|
||||||
|
protected:
|
||||||
|
int n_in;
|
||||||
|
int n_out;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TAEBlock(int n_in, int n_out)
|
||||||
|
: n_in(n_in), n_out(n_out) {
|
||||||
|
blocks["conv.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks["conv.4"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
if (n_in != n_out) {
|
||||||
|
blocks["skip"] = std::shared_ptr<GGMLBlock>(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [n, n_in, h, w]
|
||||||
|
// return: [n, n_out, h, w]
|
||||||
|
|
||||||
|
auto conv_0 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.0"]);
|
||||||
|
auto conv_2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.2"]);
|
||||||
|
auto conv_4 = std::dynamic_pointer_cast<Conv2d>(blocks["conv.4"]);
|
||||||
|
|
||||||
|
auto h = conv_0->forward(ctx, x);
|
||||||
|
h = ggml_relu_inplace(ctx, h);
|
||||||
|
h = conv_2->forward(ctx, h);
|
||||||
|
h = ggml_relu_inplace(ctx, h);
|
||||||
|
h = conv_4->forward(ctx, h);
|
||||||
|
|
||||||
|
if (n_in != n_out) {
|
||||||
|
auto skip = std::dynamic_pointer_cast<Conv2d>(blocks["skip"]);
|
||||||
|
LOG_DEBUG("skip");
|
||||||
|
x = skip->forward(ctx, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
h = ggml_add(ctx, h, x);
|
||||||
|
h = ggml_relu_inplace(ctx, h);
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class TinyEncoder : public UnaryBlock {
|
||||||
|
int in_channels = 3;
|
||||||
|
int channels = 64;
|
||||||
|
int z_channels = 4;
|
||||||
|
int num_blocks = 3;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TinyEncoder() {
|
||||||
|
int index = 0;
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false));
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
// x: [n, in_channels, h, w]
|
||||||
|
// return: [n, z_channels, h/8, w/8]
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks * 3 + 6; i++) {
|
||||||
|
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
|
||||||
|
|
||||||
|
x = block->forward(ctx, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class TinyDecoder : public UnaryBlock {
|
||||||
|
int z_channels = 4;
|
||||||
|
int channels = 64;
|
||||||
|
int out_channels = 3;
|
||||||
|
int num_blocks = 3;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TinyDecoder(int index = 0) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
index++; // nn.ReLU()
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
index++; // nn.Upsample()
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
index++; // nn.Upsample()
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks; i++) {
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
}
|
||||||
|
index++; // nn.Upsample()
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false));
|
||||||
|
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new TAEBlock(channels, channels));
|
||||||
|
blocks[std::to_string(index++)] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
|
||||||
|
// z: [n, z_channels, h, w]
|
||||||
|
// return: [n, out_channels, h*8, w*8]
|
||||||
|
|
||||||
|
auto h = ggml_scale(ctx, z, 1.0f / 3.0f);
|
||||||
|
h = ggml_tanh_inplace(ctx, h);
|
||||||
|
h = ggml_scale(ctx, h, 3.0f);
|
||||||
|
|
||||||
|
for (int i = 0; i < num_blocks * 3 + 10; i++) {
|
||||||
|
if (blocks.find(std::to_string(i)) == blocks.end()) {
|
||||||
|
if (i == 1) {
|
||||||
|
h = ggml_relu_inplace(ctx, h);
|
||||||
|
} else {
|
||||||
|
h = ggml_upscale(ctx, h, 2);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
auto block = std::dynamic_pointer_cast<UnaryBlock>(blocks[std::to_string(i)]);
|
||||||
|
|
||||||
|
h = block->forward(ctx, h);
|
||||||
|
}
|
||||||
|
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class TAESD : public GGMLBlock {
|
||||||
|
protected:
|
||||||
|
bool decode_only;
|
||||||
|
|
||||||
|
public:
|
||||||
|
TAESD(bool decode_only = true)
|
||||||
|
: decode_only(decode_only) {
|
||||||
|
blocks["decoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyDecoder());
|
||||||
|
|
||||||
|
if (!decode_only) {
|
||||||
|
blocks["encoder.layers"] = std::shared_ptr<GGMLBlock>(new TinyEncoder());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
|
||||||
|
auto decoder = std::dynamic_pointer_cast<TinyDecoder>(blocks["decoder.layers"]);
|
||||||
|
return decoder->forward(ctx, z);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||||
|
auto encoder = std::dynamic_pointer_cast<TinyEncoder>(blocks["encoder.layers"]);
|
||||||
|
return encoder->forward(ctx, x);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct TinyAutoEncoder : public GGMLModule {
|
||||||
|
TAESD taesd;
|
||||||
|
bool decode_only = false;
|
||||||
|
|
||||||
|
TinyAutoEncoder(ggml_backend_t backend,
|
||||||
|
ggml_type wtype,
|
||||||
|
bool decoder_only = true)
|
||||||
|
: decode_only(decoder_only),
|
||||||
|
taesd(decode_only),
|
||||||
|
GGMLModule(backend, wtype) {
|
||||||
|
taesd.init(params_ctx, wtype);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() {
|
||||||
|
return "taesd";
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_mem_size() {
|
||||||
|
return taesd.get_params_mem_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_num() {
|
||||||
|
return taesd.get_params_num();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool load_from_file(const std::string& file_path) {
|
||||||
|
LOG_INFO("loading taesd from '%s'", file_path.c_str());
|
||||||
|
alloc_params_buffer();
|
||||||
|
std::map<std::string, ggml_tensor*> taesd_tensors;
|
||||||
|
taesd.get_param_tensors(taesd_tensors);
|
||||||
|
std::set<std::string> ignore_tensors;
|
||||||
|
if (decode_only) {
|
||||||
|
ignore_tensors.insert("encoder.");
|
||||||
|
}
|
||||||
|
|
||||||
|
ModelLoader model_loader;
|
||||||
|
if (!model_loader.init_from_file(file_path)) {
|
||||||
|
LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool success = model_loader.load_tensors(taesd_tensors, backend, ignore_tensors);
|
||||||
|
|
||||||
|
if (!success) {
|
||||||
|
LOG_ERROR("load tae tensors from model loader failed");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_INFO("taesd model loaded");
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
|
||||||
|
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
||||||
|
z = to_backend(z);
|
||||||
|
struct ggml_tensor* out = decode_graph ? taesd.decode(compute_ctx, z) : taesd.encode(compute_ctx, z);
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute(const int n_threads,
|
||||||
|
struct ggml_tensor* z,
|
||||||
|
bool decode_graph,
|
||||||
|
struct ggml_tensor** output,
|
||||||
|
struct ggml_context* output_ctx = NULL) {
|
||||||
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
|
return build_graph(z, decode_graph);
|
||||||
|
};
|
||||||
|
|
||||||
|
GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __TAE_HPP__
|
2
otherarch/sdcpp/thirdparty/README.md
vendored
Normal file
2
otherarch/sdcpp/thirdparty/README.md
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
- json.hpp library from: https://github.com/nlohmann/json
|
||||||
|
- ZIP Library from: https://github.com/kuba--/zip
|
24596
otherarch/sdcpp/thirdparty/json.hpp
vendored
Normal file
24596
otherarch/sdcpp/thirdparty/json.hpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
10130
otherarch/sdcpp/thirdparty/miniz.h
vendored
Normal file
10130
otherarch/sdcpp/thirdparty/miniz.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
7987
otherarch/sdcpp/thirdparty/stb_image.h
vendored
Normal file
7987
otherarch/sdcpp/thirdparty/stb_image.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
1741
otherarch/sdcpp/thirdparty/stb_image_write.h
vendored
Normal file
1741
otherarch/sdcpp/thirdparty/stb_image_write.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
1836
otherarch/sdcpp/thirdparty/zip.c
vendored
Normal file
1836
otherarch/sdcpp/thirdparty/zip.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
509
otherarch/sdcpp/thirdparty/zip.h
vendored
Normal file
509
otherarch/sdcpp/thirdparty/zip.h
vendored
Normal file
|
@ -0,0 +1,509 @@
|
||||||
|
/*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#ifndef ZIP_H
|
||||||
|
#define ZIP_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
|
||||||
|
#ifndef ZIP_SHARED
|
||||||
|
#define ZIP_EXPORT
|
||||||
|
#else
|
||||||
|
#ifdef _WIN32
|
||||||
|
#ifdef ZIP_BUILD_SHARED
|
||||||
|
#define ZIP_EXPORT __declspec(dllexport)
|
||||||
|
#else
|
||||||
|
#define ZIP_EXPORT __declspec(dllimport)
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define ZIP_EXPORT __attribute__((visibility("default")))
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(_POSIX_C_SOURCE) && defined(_MSC_VER)
|
||||||
|
// 64-bit Windows is the only mainstream platform
|
||||||
|
// where sizeof(long) != sizeof(void*)
|
||||||
|
#ifdef _WIN64
|
||||||
|
typedef long long ssize_t; /* byte count or error */
|
||||||
|
#else
|
||||||
|
typedef long ssize_t; /* byte count or error */
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @mainpage
|
||||||
|
*
|
||||||
|
* Documentation for @ref zip.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @addtogroup zip
|
||||||
|
* @{
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default zip compression level.
|
||||||
|
*/
|
||||||
|
#define ZIP_DEFAULT_COMPRESSION_LEVEL 6
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Error codes
|
||||||
|
*/
|
||||||
|
#define ZIP_ENOINIT -1 // not initialized
|
||||||
|
#define ZIP_EINVENTNAME -2 // invalid entry name
|
||||||
|
#define ZIP_ENOENT -3 // entry not found
|
||||||
|
#define ZIP_EINVMODE -4 // invalid zip mode
|
||||||
|
#define ZIP_EINVLVL -5 // invalid compression level
|
||||||
|
#define ZIP_ENOSUP64 -6 // no zip 64 support
|
||||||
|
#define ZIP_EMEMSET -7 // memset error
|
||||||
|
#define ZIP_EWRTENT -8 // cannot write data to entry
|
||||||
|
#define ZIP_ETDEFLINIT -9 // cannot initialize tdefl compressor
|
||||||
|
#define ZIP_EINVIDX -10 // invalid index
|
||||||
|
#define ZIP_ENOHDR -11 // header not found
|
||||||
|
#define ZIP_ETDEFLBUF -12 // cannot flush tdefl buffer
|
||||||
|
#define ZIP_ECRTHDR -13 // cannot create entry header
|
||||||
|
#define ZIP_EWRTHDR -14 // cannot write entry header
|
||||||
|
#define ZIP_EWRTDIR -15 // cannot write to central dir
|
||||||
|
#define ZIP_EOPNFILE -16 // cannot open file
|
||||||
|
#define ZIP_EINVENTTYPE -17 // invalid entry type
|
||||||
|
#define ZIP_EMEMNOALLOC -18 // extracting data using no memory allocation
|
||||||
|
#define ZIP_ENOFILE -19 // file not found
|
||||||
|
#define ZIP_ENOPERM -20 // no permission
|
||||||
|
#define ZIP_EOOMEM -21 // out of memory
|
||||||
|
#define ZIP_EINVZIPNAME -22 // invalid zip archive name
|
||||||
|
#define ZIP_EMKDIR -23 // make dir error
|
||||||
|
#define ZIP_ESYMLINK -24 // symlink error
|
||||||
|
#define ZIP_ECLSZIP -25 // close archive error
|
||||||
|
#define ZIP_ECAPSIZE -26 // capacity size too small
|
||||||
|
#define ZIP_EFSEEK -27 // fseek error
|
||||||
|
#define ZIP_EFREAD -28 // fread error
|
||||||
|
#define ZIP_EFWRITE -29 // fwrite error
|
||||||
|
#define ZIP_ERINIT -30 // cannot initialize reader
|
||||||
|
#define ZIP_EWINIT -31 // cannot initialize writer
|
||||||
|
#define ZIP_EWRINIT -32 // cannot initialize writer from reader
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Looks up the error message string corresponding to an error number.
|
||||||
|
* @param errnum error number
|
||||||
|
* @return error message string corresponding to errnum or NULL if error is not
|
||||||
|
* found.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT const char *zip_strerror(int errnum);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @struct zip_t
|
||||||
|
*
|
||||||
|
* This data structure is used throughout the library to represent zip archive -
|
||||||
|
* forward declaration.
|
||||||
|
*/
|
||||||
|
struct zip_t;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens zip archive with compression level using the given mode.
|
||||||
|
*
|
||||||
|
* @param zipname zip archive file name.
|
||||||
|
* @param level compression level (0-9 are the standard zlib-style levels).
|
||||||
|
* @param mode file access mode.
|
||||||
|
* - 'r': opens a file for reading/extracting (the file must exists).
|
||||||
|
* - 'w': creates an empty file for writing.
|
||||||
|
* - 'a': appends to an existing archive.
|
||||||
|
*
|
||||||
|
* @return the zip archive handler or NULL on error
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT struct zip_t *zip_open(const char *zipname, int level,
|
||||||
|
char mode);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens zip archive with compression level using the given mode.
|
||||||
|
* The function additionally returns @param errnum -
|
||||||
|
*
|
||||||
|
* @param zipname zip archive file name.
|
||||||
|
* @param level compression level (0-9 are the standard zlib-style levels).
|
||||||
|
* @param mode file access mode.
|
||||||
|
* - 'r': opens a file for reading/extracting (the file must exists).
|
||||||
|
* - 'w': creates an empty file for writing.
|
||||||
|
* - 'a': appends to an existing archive.
|
||||||
|
* @param errnum 0 on success, negative number (< 0) on error.
|
||||||
|
*
|
||||||
|
* @return the zip archive handler or NULL on error
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT struct zip_t *
|
||||||
|
zip_openwitherror(const char *zipname, int level, char mode, int *errnum);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes the zip archive, releases resources - always finalize.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT void zip_close(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines if the archive has a zip64 end of central directory headers.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_is64(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens an entry by name in the zip archive.
|
||||||
|
*
|
||||||
|
* For zip archive opened in 'w' or 'a' mode the function will append
|
||||||
|
* a new entry. In readonly mode the function tries to locate the entry
|
||||||
|
* in global dictionary.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param entryname an entry name in local dictionary.
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_open(struct zip_t *zip, const char *entryname);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens an entry by name in the zip archive.
|
||||||
|
*
|
||||||
|
* For zip archive opened in 'w' or 'a' mode the function will append
|
||||||
|
* a new entry. In readonly mode the function tries to locate the entry
|
||||||
|
* in global dictionary (case sensitive).
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param entryname an entry name in local dictionary (case sensitive).
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_opencasesensitive(struct zip_t *zip,
|
||||||
|
const char *entryname);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Opens a new entry by index in the zip archive.
|
||||||
|
*
|
||||||
|
* This function is only valid if zip archive was opened in 'r' (readonly) mode.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param index index in local dictionary.
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_openbyindex(struct zip_t *zip, size_t index);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Closes a zip entry, flushes buffer and releases resources.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_close(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a local name of the current zip entry.
|
||||||
|
*
|
||||||
|
* The main difference between user's entry name and local entry name
|
||||||
|
* is optional relative path.
|
||||||
|
* Following .ZIP File Format Specification - the path stored MUST not contain
|
||||||
|
* a drive or device letter, or a leading slash.
|
||||||
|
* All slashes MUST be forward slashes '/' as opposed to backwards slashes '\'
|
||||||
|
* for compatibility with Amiga and UNIX file systems etc.
|
||||||
|
*
|
||||||
|
* @param zip: zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the pointer to the current zip entry name, or NULL on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT const char *zip_entry_name(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an index of the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the index on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT ssize_t zip_entry_index(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Determines if the current zip entry is a directory entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
|
||||||
|
* error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_isdir(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the uncompressed size of the current zip entry.
|
||||||
|
* Alias for zip_entry_uncomp_size (for backward compatibility).
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the uncompressed size in bytes.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT unsigned long long zip_entry_size(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the uncompressed size of the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the uncompressed size in bytes.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT unsigned long long zip_entry_uncomp_size(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the compressed size of the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the compressed size in bytes.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT unsigned long long zip_entry_comp_size(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns CRC-32 checksum of the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
*
|
||||||
|
* @return the CRC-32 checksum.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT unsigned int zip_entry_crc32(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compresses an input buffer for the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param buf input buffer.
|
||||||
|
* @param bufsize input buffer size (in bytes).
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_write(struct zip_t *zip, const void *buf,
|
||||||
|
size_t bufsize);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compresses a file for the current zip entry.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param filename input file.
|
||||||
|
*
|
||||||
|
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT int zip_entry_fwrite(struct zip_t *zip, const char *filename);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts the current zip entry into output buffer.
|
||||||
|
*
|
||||||
|
* The function allocates sufficient memory for a output buffer.
|
||||||
|
*
|
||||||
|
* @param zip zip archive handler.
|
||||||
|
* @param buf output buffer.
|
||||||
|
* @param bufsize output buffer size (in bytes).
|
||||||
|
*
|
||||||
|
* @note remember to release memory allocated for a output buffer.
|
||||||
|
* for large entries, please take a look at zip_entry_extract function.
|
||||||
|
*
|
||||||
|
* @return the return code - the number of bytes actually read on success.
|
||||||
|
* Otherwise a negative number (< 0) on error.
|
||||||
|
*/
|
||||||
|
extern ZIP_EXPORT ssize_t zip_entry_read(struct zip_t *zip, void **buf,
|
||||||
|
size_t *bufsize);
|
||||||
|
|
||||||
|
/**
 * Extracts the current zip entry into a memory buffer using no memory
 * allocation.
 *
 * @param zip zip archive handler.
 * @param buf preallocated output buffer.
 * @param bufsize output buffer size (in bytes).
 *
 * @note ensure the supplied output buffer is large enough.
 *       The zip_entry_size function (returns uncompressed size for the current
 *       entry) can be handy to estimate how big a buffer is needed.
 *       For large entries, please take a look at the zip_entry_extract function.
 *
 * @return the return code - the number of bytes actually read on success.
 *         Otherwise a negative number (< 0) on error (e.g. bufsize is not large
 *         enough).
 */
extern ZIP_EXPORT ssize_t zip_entry_noallocread(struct zip_t *zip, void *buf,
                                                size_t bufsize);

/**
 * Extracts the current zip entry into an output file.
 *
 * @param zip zip archive handler.
 * @param filename output file.
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int zip_entry_fread(struct zip_t *zip, const char *filename);

/**
 * Extracts the current zip entry using a callback function (on_extract).
 *
 * @param zip zip archive handler.
 * @param on_extract callback function.
 * @param arg opaque pointer (optional argument, which you can pass to the
 *        on_extract callback)
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int
zip_entry_extract(struct zip_t *zip,
                  size_t (*on_extract)(void *arg, uint64_t offset,
                                       const void *data, size_t size),
                  void *arg);
|
||||||
|
|
||||||
|
/**
 * Returns the number of all entries (files and directories) in the zip archive.
 *
 * @param zip zip archive handler.
 *
 * @return the return code - the number of entries on success, negative number
 *         (< 0) on error.
 */
extern ZIP_EXPORT ssize_t zip_entries_total(struct zip_t *zip);

/**
 * Deletes zip archive entries.
 *
 * @param zip zip archive handler.
 * @param entries array of zip archive entries to be deleted.
 * @param len the number of entries to be deleted.
 *
 * @return the number of deleted entries, or negative number (< 0) on error.
 */
extern ZIP_EXPORT ssize_t zip_entries_delete(struct zip_t *zip,
                                             char *const entries[], size_t len);

/**
 * Extracts a zip archive stream into a directory.
 *
 * If on_extract is not NULL, the callback will be called after
 * each zip entry is successfully extracted.
 * Returning a negative value from the callback will cause abort and return an
 * error. The last argument (void *arg) is optional, which you can use to pass
 * data to the on_extract callback.
 *
 * @param stream zip archive stream.
 * @param size stream size.
 * @param dir output directory.
 * @param on_extract on extract callback.
 * @param arg opaque pointer.
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int
zip_stream_extract(const char *stream, size_t size, const char *dir,
                   int (*on_extract)(const char *filename, void *arg),
                   void *arg);
|
||||||
|
|
||||||
|
/**
 * Opens a zip archive stream into memory.
 *
 * @param stream zip archive stream.
 * @param size stream size.
 * @param level compression level (0-9 are the standard zlib-style levels).
 * @param mode file access mode.
 *        - 'r': opens a file for reading/extracting (the file must exist).
 *        - 'w': creates an empty file for writing.
 *        - 'a': appends to an existing archive.
 *
 * @return the zip archive handler or NULL on error
 */
extern ZIP_EXPORT struct zip_t *zip_stream_open(const char *stream, size_t size,
                                                int level, char mode);

/**
 * Opens a zip archive stream into memory.
 * In addition to zip_stream_open, reports the failure reason via errnum.
 *
 * @param stream zip archive stream.
 * @param size stream size.
 * @param level compression level (0-9 are the standard zlib-style levels).
 * @param mode file access mode.
 *        - 'r': opens a file for reading/extracting (the file must exist).
 *        - 'w': creates an empty file for writing.
 *        - 'a': appends to an existing archive.
 * @param errnum 0 on success, negative number (< 0) on error.
 *
 * @return the zip archive handler or NULL on error
 */
extern ZIP_EXPORT struct zip_t *zip_stream_openwitherror(const char *stream,
                                                         size_t size, int level,
                                                         char mode,
                                                         int *errnum);

/**
 * Copies the zip archive stream output buffer.
 *
 * @param zip zip archive handler.
 * @param buf output buffer. The caller should free buf.
 * @param bufsize output buffer size (in bytes).
 *
 * @return copy size
 */
extern ZIP_EXPORT ssize_t zip_stream_copy(struct zip_t *zip, void **buf,
                                          size_t *bufsize);

/**
 * Closes the zip archive and releases resources.
 *
 * @param zip zip archive handler.
 */
extern ZIP_EXPORT void zip_stream_close(struct zip_t *zip);
|
||||||
|
|
||||||
|
/**
 * Creates a new archive and puts files into a single zip archive.
 *
 * @param zipname zip archive file.
 * @param filenames input files.
 * @param len number of input files.
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int zip_create(const char *zipname, const char *filenames[],
                                 size_t len);

/**
 * Extracts a zip archive file into a directory.
 *
 * If on_extract_entry is not NULL, the callback will be called after
 * each zip entry is successfully extracted.
 * Returning a negative value from the callback will cause abort and return an
 * error. The last argument (void *arg) is optional, which you can use to pass
 * data to the on_extract_entry callback.
 *
 * @param zipname zip archive file.
 * @param dir output directory.
 * @param on_extract_entry on extract callback.
 * @param arg opaque pointer.
 *
 * @return the return code - 0 on success, negative number (< 0) on error.
 */
extern ZIP_EXPORT int zip_extract(const char *zipname, const char *dir,
                                  int (*on_extract_entry)(const char *filename,
                                                          void *arg),
                                  void *arg);
/** @} */
#ifdef __cplusplus
} // extern "C"
#endif

#endif // end of include guard
|
661
otherarch/sdcpp/unet.hpp
Normal file
661
otherarch/sdcpp/unet.hpp
Normal file
|
@ -0,0 +1,661 @@
|
||||||
|
#ifndef __UNET_HPP__
|
||||||
|
#define __UNET_HPP__
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#include "model.h"
|
||||||
|
|
||||||
|
/*==================================================== UnetModel =====================================================*/
|
||||||
|
|
||||||
|
#define UNET_GRAPH_SIZE 10240
|
||||||
|
|
||||||
|
class SpatialVideoTransformer : public SpatialTransformer {
|
||||||
|
protected:
|
||||||
|
int64_t time_depth;
|
||||||
|
int64_t max_time_embed_period;
|
||||||
|
|
||||||
|
public:
|
||||||
|
SpatialVideoTransformer(int64_t in_channels,
|
||||||
|
int64_t n_head,
|
||||||
|
int64_t d_head,
|
||||||
|
int64_t depth,
|
||||||
|
int64_t context_dim,
|
||||||
|
int64_t time_depth = 1,
|
||||||
|
int64_t max_time_embed_period = 10000)
|
||||||
|
: SpatialTransformer(in_channels, n_head, d_head, depth, context_dim),
|
||||||
|
max_time_embed_period(max_time_embed_period) {
|
||||||
|
// We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
|
||||||
|
// use_spatial_context is always True
|
||||||
|
// merge_strategy is always learned_with_images
|
||||||
|
// merge_factor is loaded from weights
|
||||||
|
// time_context_dim is always None
|
||||||
|
// ff_in is always True
|
||||||
|
// disable_self_attn is always False
|
||||||
|
// disable_temporal_crossattention is always False
|
||||||
|
|
||||||
|
int64_t inner_dim = n_head * d_head;
|
||||||
|
|
||||||
|
GGML_ASSERT(depth == time_depth);
|
||||||
|
GGML_ASSERT(in_channels == inner_dim);
|
||||||
|
|
||||||
|
int64_t time_mix_d_head = d_head;
|
||||||
|
int64_t n_time_mix_heads = n_head;
|
||||||
|
int64_t time_mix_inner_dim = time_mix_d_head * n_time_mix_heads; // equal to inner_dim
|
||||||
|
int64_t time_context_dim = context_dim;
|
||||||
|
|
||||||
|
for (int i = 0; i < time_depth; i++) {
|
||||||
|
std::string name = "time_stack." + std::to_string(i);
|
||||||
|
blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim,
|
||||||
|
n_time_mix_heads,
|
||||||
|
time_mix_d_head,
|
||||||
|
time_context_dim,
|
||||||
|
true));
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t time_embed_dim = in_channels * 4;
|
||||||
|
blocks["time_pos_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, time_embed_dim));
|
||||||
|
// time_pos_embed.1 is nn.SiLU()
|
||||||
|
blocks["time_pos_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, in_channels));
|
||||||
|
|
||||||
|
blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* forward(struct ggml_context* ctx,
|
||||||
|
struct ggml_tensor* x,
|
||||||
|
struct ggml_tensor* context,
|
||||||
|
int timesteps) {
|
||||||
|
// x: [N, in_channels, h, w] aka [b*t, in_channels, h, w], t == timesteps
|
||||||
|
// context: [N, max_position(aka n_context), hidden_size(aka context_dim)] aka [b*t, n_context, context_dim], t == timesteps
|
||||||
|
// t_emb: [N, in_channels] aka [b*t, in_channels]
|
||||||
|
// timesteps is num_frames
|
||||||
|
// time_context is always None
|
||||||
|
// image_only_indicator is always tensor([0.])
|
||||||
|
// transformer_options is not used
|
||||||
|
// GGML_ASSERT(ggml_n_dims(context) == 3);
|
||||||
|
|
||||||
|
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
|
||||||
|
auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
|
||||||
|
auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
|
||||||
|
auto time_pos_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["time_pos_embed.0"]);
|
||||||
|
auto time_pos_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["time_pos_embed.2"]);
|
||||||
|
auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
|
||||||
|
|
||||||
|
auto x_in = x;
|
||||||
|
int64_t n = x->ne[3];
|
||||||
|
int64_t h = x->ne[1];
|
||||||
|
int64_t w = x->ne[0];
|
||||||
|
int64_t inner_dim = n_head * d_head;
|
||||||
|
|
||||||
|
GGML_ASSERT(n == timesteps); // We compute cond and uncond separately, so batch_size==1
|
||||||
|
|
||||||
|
auto time_context = context; // [b*t, n_context, context_dim]
|
||||||
|
auto spatial_context = context;
|
||||||
|
// time_context_first_timestep = time_context[::timesteps]
|
||||||
|
auto time_context_first_timestep = ggml_view_3d(ctx,
|
||||||
|
time_context,
|
||||||
|
time_context->ne[0],
|
||||||
|
time_context->ne[1],
|
||||||
|
1,
|
||||||
|
time_context->nb[1],
|
||||||
|
time_context->nb[2],
|
||||||
|
0); // [b, n_context, context_dim]
|
||||||
|
time_context = ggml_new_tensor_3d(ctx, GGML_TYPE_F32,
|
||||||
|
time_context_first_timestep->ne[0],
|
||||||
|
time_context_first_timestep->ne[1],
|
||||||
|
time_context_first_timestep->ne[2] * h * w);
|
||||||
|
time_context = ggml_repeat(ctx, time_context_first_timestep, time_context); // [b*h*w, n_context, context_dim]
|
||||||
|
|
||||||
|
x = norm->forward(ctx, x);
|
||||||
|
x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
|
||||||
|
|
||||||
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
|
||||||
|
x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
|
||||||
|
|
||||||
|
auto num_frames = ggml_arange(ctx, 0, timesteps, 1);
|
||||||
|
// since b is 1, no need to do repeat
|
||||||
|
auto t_emb = ggml_nn_timestep_embedding(ctx, num_frames, in_channels, max_time_embed_period); // [N, in_channels]
|
||||||
|
|
||||||
|
auto emb = time_pos_embed_0->forward(ctx, t_emb);
|
||||||
|
emb = ggml_silu_inplace(ctx, emb);
|
||||||
|
emb = time_pos_embed_2->forward(ctx, emb); // [N, in_channels]
|
||||||
|
emb = ggml_reshape_3d(ctx, emb, emb->ne[0], 1, emb->ne[1]); // [N, 1, in_channels]
|
||||||
|
|
||||||
|
for (int i = 0; i < depth; i++) {
|
||||||
|
std::string transformer_name = "transformer_blocks." + std::to_string(i);
|
||||||
|
std::string time_stack_name = "time_stack." + std::to_string(i);
|
||||||
|
|
||||||
|
auto block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[transformer_name]);
|
||||||
|
auto mix_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[time_stack_name]);
|
||||||
|
|
||||||
|
x = block->forward(ctx, x, spatial_context); // [N, h * w, inner_dim]
|
||||||
|
|
||||||
|
// in_channels == inner_dim
|
||||||
|
auto x_mix = x;
|
||||||
|
x_mix = ggml_add(ctx, x_mix, emb); // [N, h * w, inner_dim]
|
||||||
|
|
||||||
|
int64_t N = x_mix->ne[2];
|
||||||
|
int64_t T = timesteps;
|
||||||
|
int64_t B = N / T;
|
||||||
|
int64_t S = x_mix->ne[1];
|
||||||
|
int64_t C = x_mix->ne[0];
|
||||||
|
|
||||||
|
x_mix = ggml_reshape_4d(ctx, x_mix, C, S, T, B); // (b t) s c -> b t s c
|
||||||
|
x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3)); // b t s c -> b s t c
|
||||||
|
x_mix = ggml_reshape_3d(ctx, x_mix, C, T, S * B); // b s t c -> (b s) t c
|
||||||
|
|
||||||
|
x_mix = mix_block->forward(ctx, x_mix, time_context); // [B * h * w, T, inner_dim]
|
||||||
|
|
||||||
|
x_mix = ggml_reshape_4d(ctx, x_mix, C, T, S, B); // (b s) t c -> b s t c
|
||||||
|
x_mix = ggml_cont(ctx, ggml_permute(ctx, x_mix, 0, 2, 1, 3)); // b s t c -> b t s c
|
||||||
|
x_mix = ggml_reshape_3d(ctx, x_mix, C, S, T * B); // b t s c -> (b t) s c
|
||||||
|
|
||||||
|
x = time_mixer->forward(ctx, x, x_mix); // [N, h * w, inner_dim]
|
||||||
|
}
|
||||||
|
|
||||||
|
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
|
||||||
|
x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
|
||||||
|
|
||||||
|
// proj_out
|
||||||
|
x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
|
||||||
|
|
||||||
|
x = ggml_add(ctx, x, x_in);
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ldm.modules.diffusionmodules.openaimodel.UNetModel
// Builds the Stable Diffusion UNet as a tree of named GGMLBlocks whose keys
// ("input_blocks.N.M", "middle_block.N", "output_blocks.N.M", ...) must match
// the checkpoint tensor names exactly, then wires them together in forward().
class UnetModelBlock : public GGMLBlock {
protected:
    SDVersion version = VERSION_1_x;
    // network hparams (SD 1.x defaults; adjusted per version in the ctor)
    int in_channels                        = 4;
    int out_channels                       = 4;
    int num_res_blocks                     = 2;
    std::vector<int> attention_resolutions = {4, 2, 1};
    std::vector<int> channel_mult          = {1, 2, 4, 4};
    std::vector<int> transformer_depth     = {1, 1, 1, 1};
    int time_embed_dim                     = 1280;  // model_channels*4
    int num_heads                          = 8;
    int num_head_channels                  = -1;  // channels // num_heads
    int context_dim                        = 768;  // 1024 for VERSION_2_x, 2048 for VERSION_XL

public:
    int model_channels  = 320;
    int adm_in_channels = 2816;  // only for VERSION_XL/SVD

    // Registers every sub-block; block keys mirror the original PyTorch
    // module names so weights can be loaded by name.
    UnetModelBlock(SDVersion version = VERSION_1_x)
        : version(version) {
        // per-version hyperparameter overrides
        if (version == VERSION_2_x) {
            context_dim       = 1024;
            num_head_channels = 64;
            num_heads         = -1;
        } else if (version == VERSION_XL) {
            context_dim           = 2048;
            attention_resolutions = {4, 2};
            channel_mult          = {1, 2, 4};
            transformer_depth     = {1, 2, 10};
            num_head_channels     = 64;
            num_heads             = -1;
        } else if (version == VERSION_SVD) {
            in_channels       = 8;
            out_channels      = 4;
            context_dim       = 1024;
            adm_in_channels   = 768;
            num_head_channels = 64;
            num_heads         = -1;
        }
        // dims is always 2
        // use_temporal_attention is always True for SVD

        // timestep embedding MLP: Linear -> SiLU -> Linear
        blocks["time_embed.0"] = std::shared_ptr<GGMLBlock>(new Linear(model_channels, time_embed_dim));
        // time_embed_1 is nn.SiLU()
        blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

        // class/conditioning embedding (SDXL/SVD only)
        if (version == VERSION_XL || version == VERSION_SVD) {
            blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
            // label_emb_1 is nn.SiLU()
            blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
        }

        // input_blocks
        blocks["input_blocks.0.0"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}));

        std::vector<int> input_block_chans;  // skip-connection channel counts, consumed in reverse by output_blocks
        input_block_chans.push_back(model_channels);
        int ch              = model_channels;
        int input_block_idx = 0;
        int ds              = 1;  // current downsample factor

        // SVD swaps in video-aware variants of the res/attention blocks
        auto get_resblock = [&](int64_t channels, int64_t emb_channels, int64_t out_channels) -> ResBlock* {
            if (version == VERSION_SVD) {
                return new VideoResBlock(channels, emb_channels, out_channels);
            } else {
                return new ResBlock(channels, emb_channels, out_channels);
            }
        };

        auto get_attention_layer = [&](int64_t in_channels,
                                       int64_t n_head,
                                       int64_t d_head,
                                       int64_t depth,
                                       int64_t context_dim) -> SpatialTransformer* {
            if (version == VERSION_SVD) {
                return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim);
            } else {
                return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
            }
        };

        // encoder: per resolution level, num_res_blocks resblocks (+ attention
        // at the configured downsample factors), then a downsample block
        size_t len_mults = channel_mult.size();
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));

                ch = mult * model_channels;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    int n_head = num_heads;
                    int d_head = ch / num_heads;
                    if (num_head_channels != -1) {  // fixed head size takes precedence over fixed head count
                        d_head = num_head_channels;
                        n_head = ch / d_head;
                    }
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                      n_head,
                                                                                      d_head,
                                                                                      transformer_depth[i],
                                                                                      context_dim));
                }
                input_block_chans.push_back(ch);
            }
            if (i != len_mults - 1) {  // no downsample after the last level
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));

                input_block_chans.push_back(ch);
                ds *= 2;
            }
        }

        // middle blocks: resblock -> attention -> resblock at the bottleneck
        int n_head = num_heads;
        int d_head = ch / num_heads;
        if (num_head_channels != -1) {
            d_head = num_head_channels;
            n_head = ch / d_head;
        }
        blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
        blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
                                                                                  n_head,
                                                                                  d_head,
                                                                                  transformer_depth[transformer_depth.size() - 1],
                                                                                  context_dim));
        blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));

        // output_blocks (decoder): mirrors the encoder in reverse; each
        // resblock consumes a skip connection popped off input_block_chans
        int output_block_idx = 0;
        for (int i = (int)len_mults - 1; i >= 0; i--) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks + 1; j++) {
                int ich = input_block_chans.back();
                input_block_chans.pop_back();

                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
                blocks[name]     = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));

                ch                = mult * model_channels;
                int up_sample_idx = 1;  // sub-index of the upsample block (shifts to 2 if attention is present)
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    int n_head = num_heads;
                    int d_head = ch / num_heads;
                    if (num_head_channels != -1) {
                        d_head = num_head_channels;
                        n_head = ch / d_head;
                    }
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
                    blocks[name]     = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, transformer_depth[i], context_dim));

                    up_sample_idx++;
                }

                if (i > 0 && j == num_res_blocks) {  // upsample at the end of every level except the last
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
                    blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));

                    ds /= 2;
                }

                output_block_idx += 1;
            }
        }

        // out: GroupNorm -> SiLU -> Conv2d back to latent channels
        blocks["out.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(ch));  // ch == model_channels
        // out_1 is nn.SiLU()
        blocks["out.2"] = std::shared_ptr<GGMLBlock>(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
    }

    // Dispatches to VideoResBlock (SVD) or plain ResBlock for the named block.
    struct ggml_tensor* resblock_forward(std::string name,
                                         struct ggml_context* ctx,
                                         struct ggml_tensor* x,
                                         struct ggml_tensor* emb,
                                         int num_video_frames) {
        if (version == VERSION_SVD) {
            auto block = std::dynamic_pointer_cast<VideoResBlock>(blocks[name]);

            return block->forward(ctx, x, emb, num_video_frames);
        } else {
            auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);

            return block->forward(ctx, x, emb);
        }
    }

    // Dispatches to SpatialVideoTransformer (SVD) or SpatialTransformer for
    // the named block.
    struct ggml_tensor* attention_layer_forward(std::string name,
                                                struct ggml_context* ctx,
                                                struct ggml_tensor* x,
                                                struct ggml_tensor* context,
                                                int timesteps) {
        if (version == VERSION_SVD) {
            auto block = std::dynamic_pointer_cast<SpatialVideoTransformer>(blocks[name]);

            return block->forward(ctx, x, context, timesteps);
        } else {
            auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);

            return block->forward(ctx, x, context);
        }
    }

    // Builds the full UNet forward pass into the ggml graph.
    // Optional ControlNet residuals in `controls` are scaled by
    // control_strength and added to the middle block output and to each skip
    // connection (consumed from the back of the vector).
    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* timesteps,
                                struct ggml_tensor* context,
                                struct ggml_tensor* c_concat              = NULL,
                                struct ggml_tensor* y                     = NULL,
                                int num_video_frames                      = -1,
                                std::vector<struct ggml_tensor*> controls = {},
                                float control_strength                    = 0.f) {
        // x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
        // timesteps: [N,]
        // context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
        // c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
        // y: [N, adm_in_channels] or [1, adm_in_channels]
        // return: [N, out_channels, h, w]

        // broadcast the conditioning tensors to the batch size of x if needed
        if (context != NULL) {
            if (context->ne[2] != x->ne[3]) {
                context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
            }
        }

        if (c_concat != NULL) {
            if (c_concat->ne[3] != x->ne[3]) {
                c_concat = ggml_repeat(ctx, c_concat, x);
            }
            x = ggml_concat(ctx, x, c_concat);
        }

        if (y != NULL) {
            if (y->ne[1] != x->ne[3]) {
                y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
            }
        }

        auto time_embed_0     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.0"]);
        auto time_embed_2     = std::dynamic_pointer_cast<Linear>(blocks["time_embed.2"]);
        auto input_blocks_0_0 = std::dynamic_pointer_cast<Conv2d>(blocks["input_blocks.0.0"]);

        auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out.0"]);
        auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks["out.2"]);

        auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels);  // [N, model_channels]

        auto emb = time_embed_0->forward(ctx, t_emb);
        emb      = ggml_silu_inplace(ctx, emb);
        emb      = time_embed_2->forward(ctx, emb);  // [N, time_embed_dim]

        // SDXL/SVD: fold the label/conditioning embedding into the timestep embedding
        if (y != NULL) {
            auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
            auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);

            auto label_emb = label_embed_0->forward(ctx, y);
            label_emb      = ggml_silu_inplace(ctx, label_emb);
            label_emb      = label_embed_2->forward(ctx, label_emb);  // [N, time_embed_dim]

            emb = ggml_add(ctx, emb, label_emb);  // [N, time_embed_dim]
        }

        // input_blocks
        std::vector<struct ggml_tensor*> hs;  // skip-connection stack, popped by the decoder

        // input block 0
        auto h = input_blocks_0_0->forward(ctx, x);

        ggml_set_name(h, "bench-start");
        hs.push_back(h);
        // input block 1-11
        size_t len_mults    = channel_mult.size();
        int input_block_idx = 0;
        int ds              = 1;
        for (int i = 0; i < len_mults; i++) {
            int mult = channel_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                input_block_idx += 1;
                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                h                = resblock_forward(name, ctx, h, emb, num_video_frames);  // [N, mult*model_channels, h, w]
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
                    h                = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
                }
                hs.push_back(h);
            }
            if (i != len_mults - 1) {
                ds *= 2;
                input_block_idx += 1;

                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
                auto block       = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);

                h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
                hs.push_back(h);
            }
        }
        // [N, 4*model_channels, h/8, w/8]

        // middle_block
        h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
        h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
        h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]

        if (controls.size() > 0) {
            auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
            h       = ggml_add(ctx, h, cs);  // middle control
        }
        // remaining controls are applied to skip connections, last-to-first
        int control_offset = controls.size() - 2;

        // output_blocks
        int output_block_idx = 0;
        for (int i = (int)len_mults - 1; i >= 0; i--) {
            for (int j = 0; j < num_res_blocks + 1; j++) {
                auto h_skip = hs.back();
                hs.pop_back();

                if (controls.size() > 0) {
                    auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
                    h_skip  = ggml_add(ctx, h_skip, cs);  // control net condition
                    control_offset--;
                }

                h = ggml_concat(ctx, h, h_skip);

                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";

                h = resblock_forward(name, ctx, h, emb, num_video_frames);

                int up_sample_idx = 1;
                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";

                    h = attention_layer_forward(name, ctx, h, context, num_video_frames);

                    up_sample_idx++;
                }

                if (i > 0 && j == num_res_blocks) {
                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
                    auto block       = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);

                    h = block->forward(ctx, h);

                    ds /= 2;
                }

                output_block_idx += 1;
            }
        }

        // out
        h = out_0->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);
        h = out_2->forward(ctx, h);
        ggml_set_name(h, "bench-end");
        return h;  // [N, out_channels, h, w]
    }
};
|
||||||
|
|
||||||
|
struct UNetModel : public GGMLModule {
|
||||||
|
SDVersion version = VERSION_1_x;
|
||||||
|
UnetModelBlock unet;
|
||||||
|
|
||||||
|
UNetModel(ggml_backend_t backend,
|
||||||
|
ggml_type wtype,
|
||||||
|
SDVersion version = VERSION_1_x)
|
||||||
|
: GGMLModule(backend, wtype), unet(version) {
|
||||||
|
unet.init(params_ctx, wtype);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() {
|
||||||
|
return "unet";
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_mem_size() {
|
||||||
|
return unet.get_params_mem_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_num() {
|
||||||
|
return unet.get_params_num();
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
|
||||||
|
unet.get_param_tensors(tensors, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
|
||||||
|
struct ggml_tensor* timesteps,
|
||||||
|
struct ggml_tensor* context,
|
||||||
|
struct ggml_tensor* c_concat = NULL,
|
||||||
|
struct ggml_tensor* y = NULL,
|
||||||
|
int num_video_frames = -1,
|
||||||
|
std::vector<struct ggml_tensor*> controls = {},
|
||||||
|
float control_strength = 0.f) {
|
||||||
|
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, UNET_GRAPH_SIZE, false);
|
||||||
|
|
||||||
|
if (num_video_frames == -1) {
|
||||||
|
num_video_frames = x->ne[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
x = to_backend(x);
|
||||||
|
context = to_backend(context);
|
||||||
|
y = to_backend(y);
|
||||||
|
timesteps = to_backend(timesteps);
|
||||||
|
|
||||||
|
for (int i = 0; i < controls.size(); i++) {
|
||||||
|
controls[i] = to_backend(controls[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor* out = unet.forward(compute_ctx,
|
||||||
|
x,
|
||||||
|
timesteps,
|
||||||
|
context,
|
||||||
|
c_concat,
|
||||||
|
y,
|
||||||
|
num_video_frames,
|
||||||
|
controls,
|
||||||
|
control_strength);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run one UNet forward pass with `n_threads`; graph construction is deferred
// to a lambda so GGMLModule::compute controls when/where it is built. The
// result is delivered through `output` / `output_ctx` when provided.
void compute(int n_threads,
             struct ggml_tensor* x,
             struct ggml_tensor* timesteps,
             struct ggml_tensor* context,
             struct ggml_tensor* c_concat,
             struct ggml_tensor* y,
             int num_video_frames = -1,
             std::vector<struct ggml_tensor*> controls = {},
             float control_strength = 0.f,
             struct ggml_tensor** output = NULL,
             struct ggml_context* output_ctx = NULL) {
    // x: [N, in_channels, h, w]
    // timesteps: [N, ]
    // context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
    // c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
    // y: [N, adm_in_channels] or [1, adm_in_channels]
    auto get_graph = [&]() -> struct ggml_cgraph* {
        return build_graph(x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength);
    };

    GGMLModule::compute(get_graph, n_threads, false, output, output_ctx);
}
|
||||||
|
|
||||||
|
// Smoke test: run one UNet forward pass on constant inputs and print the
// output tensor. Observed outcomes per backend are recorded in the comments
// inside the inner scope.
void test() {
    struct ggml_init_params params;
    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
    params.mem_buffer = NULL;
    params.no_alloc = false;

    struct ggml_context* work_ctx = ggml_init(params);
    GGML_ASSERT(work_ctx != NULL);

    {
        // CPU, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass
        // CUDA, num_video_frames = 1, x{num_video_frames, 8, 8, 8}: Pass
        // CPU, num_video_frames = 3, x{num_video_frames, 8, 8, 8}: Wrong result
        // CUDA, num_video_frames = 3, x{num_video_frames, 8, 8, 8}: nan
        int num_video_frames = 3;

        // Constant-filled latent of shape (8, 8, 8, num_video_frames).
        auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 8, num_video_frames);
        std::vector<float> timesteps_vec(num_video_frames, 999.f);
        auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
        ggml_set_f32(x, 0.5f);
        // print_ggml_tensor(x);

        auto context = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 1024, 1, num_video_frames);
        ggml_set_f32(context, 0.5f);
        // print_ggml_tensor(context);

        auto y = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, num_video_frames);
        ggml_set_f32(y, 0.5f);
        // print_ggml_tensor(y);

        struct ggml_tensor* out = NULL;

        // NOTE(review): ggml time functions typically return int64_t; storing
        // the result in int could truncate — confirm ggml_time_ms's signature.
        int t0 = ggml_time_ms();
        compute(8, x, timesteps, context, NULL, y, num_video_frames, {}, 0.f, &out, work_ctx);
        int t1 = ggml_time_ms();

        print_ggml_tensor(out);
        LOG_DEBUG("unet test done in %dms", t1 - t0);
    }
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // __UNET_HPP__
|
123
otherarch/sdcpp/upscaler.cpp
Normal file
123
otherarch/sdcpp/upscaler.cpp
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
#include "esrgan.hpp"
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
#include "model.h"
|
||||||
|
#include "stable-diffusion.h"
|
||||||
|
|
||||||
|
// ESRGAN-based image upscaler running on a ggml backend (CUDA/Metal when
// compiled in, CPU otherwise).
struct UpscalerGGML {
    ggml_backend_t backend = NULL;  // general backend
    // Weight type requested when loading the ESRGAN model.
    ggml_type model_data_type = GGML_TYPE_F16;
    std::shared_ptr<ESRGAN> esrgan_upscaler;
    // NOTE(review): never assigned here; load_from_file takes a shadowing
    // parameter of the same name — confirm whether this member is needed.
    std::string esrgan_path;
    int n_threads;  // threads used when computing the upscale graph

    UpscalerGGML(int n_threads)
        : n_threads(n_threads) {
    }

    // Initialize the compute backend and load ESRGAN weights from
    // `esrgan_path`. Returns false when the model fails to load.
    bool load_from_file(const std::string& esrgan_path) {
#ifdef SD_USE_CUBLAS
        LOG_DEBUG("Using CUDA backend");
        backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
        LOG_DEBUG("Using Metal backend");
        ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
        backend = ggml_backend_metal_init();
#endif

        // Fall back to CPU if no accelerated backend was created.
        if (!backend) {
            LOG_DEBUG("Using CPU backend");
            backend = ggml_backend_cpu_init();
        }
        LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type));
        esrgan_upscaler = std::make_shared<ESRGAN>(backend, model_data_type);
        if (!esrgan_upscaler->load_from_file(esrgan_path)) {
            return false;
        }
        return true;
    }

    // Upscale `input_image` by the model's built-in scale factor; the
    // `upscale_factor` argument is ignored (see comment below). Returns a
    // zeroed sd_image_t ({0,0,0,NULL}) when the work context cannot be
    // allocated; otherwise the caller owns the returned pixel buffer.
    sd_image_t upscale(sd_image_t input_image, uint32_t upscale_factor) {
        // upscale_factor, unused for RealESRGAN_x4plus_anime_6B.pth
        sd_image_t upscaled_image = {0, 0, 0, NULL};
        int output_width = (int)input_image.width * esrgan_upscaler->scale;
        int output_height = (int)input_image.height * esrgan_upscaler->scale;
        LOG_INFO("upscaling from (%i x %i) to (%i x %i)",
                 input_image.width, input_image.height, output_width, output_height);

        // Budget: input + output float tensors plus ggml bookkeeping.
        struct ggml_init_params params;
        params.mem_size = output_width * output_height * 3 * sizeof(float) * 2;
        params.mem_size += 2 * ggml_tensor_overhead();
        params.mem_buffer = NULL;
        params.no_alloc = false;

        // draft context
        struct ggml_context* upscale_ctx = ggml_init(params);
        if (!upscale_ctx) {
            LOG_ERROR("ggml_init() failed");
            return upscaled_image;
        }
        LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
        ggml_tensor* input_image_tensor = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, input_image.width, input_image.height, 3, 1);
        sd_image_to_tensor(input_image.data, input_image_tensor);

        ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
        // Per-tile callback: run the ESRGAN graph on each tile to bound
        // compute-buffer memory (`init` is unused here).
        auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
            esrgan_upscaler->compute(n_threads, in, &out);
        };
        int64_t t0 = ggml_time_ms();
        sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);
        esrgan_upscaler->free_compute_buffer();
        // Clamp to [0, 1] before converting back to 8-bit pixels.
        ggml_tensor_clamp(upscaled, 0.f, 1.f);
        uint8_t* upscaled_data = sd_tensor_to_image(upscaled);
        ggml_free(upscale_ctx);
        int64_t t3 = ggml_time_ms();
        LOG_INFO("input_image_tensor upscaled, taking %.2fs", (t3 - t0) / 1000.0f);
        upscaled_image = {
            (uint32_t)output_width,
            (uint32_t)output_height,
            3,
            upscaled_data,
        };
        return upscaled_image;
    }
};
|
||||||
|
|
||||||
|
// Opaque public handle wrapping the ggml-backed upscaler implementation.
struct upscaler_ctx_t {
    UpscalerGGML* upscaler = NULL;
};
|
||||||
|
|
||||||
|
// Allocate an upscaler context and load the ESRGAN model at
// `esrgan_path_c_str`. Returns NULL on any failure, without leaking the
// partially-constructed context. `wtype` is currently unused (the weight
// type is fixed inside UpscalerGGML).
upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str,
                                 int n_threads,
                                 enum sd_type_t wtype) {
    upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t));
    if (upscaler_ctx == NULL) {
        return NULL;
    }
    std::string esrgan_path(esrgan_path_c_str);

    upscaler_ctx->upscaler = new UpscalerGGML(n_threads);
    if (upscaler_ctx->upscaler == NULL) {
        // Effectively unreachable (plain `new` throws on failure), but if it
        // ever returns NULL, the context itself must not leak — the original
        // code returned here without free()ing `upscaler_ctx`.
        free(upscaler_ctx);
        return NULL;
    }

    if (!upscaler_ctx->upscaler->load_from_file(esrgan_path)) {
        delete upscaler_ctx->upscaler;
        upscaler_ctx->upscaler = NULL;
        free(upscaler_ctx);
        return NULL;
    }
    return upscaler_ctx;
}
|
||||||
|
|
||||||
|
// Public C-style entry point: forward to the implementation's upscale().
// `upscaler_ctx` must be a non-NULL handle from new_upscaler_ctx.
sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor) {
    return upscaler_ctx->upscaler->upscale(input_image, upscale_factor);
}
|
||||||
|
|
||||||
|
// Destroy a context created by new_upscaler_ctx. Safe to call with NULL
// (mirrors free()'s contract); the original dereferenced `upscaler_ctx`
// unconditionally and crashed on a NULL handle.
void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx) {
    if (upscaler_ctx == NULL) {
        return;
    }
    if (upscaler_ctx->upscaler != NULL) {
        delete upscaler_ctx->upscaler;
        upscaler_ctx->upscaler = NULL;
    }
    free(upscaler_ctx);
}
|
463
otherarch/sdcpp/util.cpp
Normal file
463
otherarch/sdcpp/util.cpp
Normal file
|
@ -0,0 +1,463 @@
|
||||||
|
#include "util.h"
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <codecvt>
|
||||||
|
#include <fstream>
|
||||||
|
#include <locale>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <vector>
|
||||||
|
#include "preprocessing.hpp"
|
||||||
|
|
||||||
|
#if defined(__APPLE__) && defined(__MACH__)
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(_WIN32)
|
||||||
|
#include <sys/ioctl.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "ggml/ggml.h"
|
||||||
|
#include "stable-diffusion.h"
|
||||||
|
|
||||||
|
// True when `str` is at least as long as `ending` and its final characters
// equal `ending` (an empty suffix always matches).
bool ends_with(const std::string& str, const std::string& ending) {
    if (str.length() < ending.length()) {
        return false;
    }
    return std::equal(ending.rbegin(), ending.rend(), str.rbegin());
}
|
||||||
|
|
||||||
|
// True when `str` begins with `start` (an empty prefix always matches).
// Uses rfind anchored at position 0: it only tests a match at index 0,
// whereas the original find(start) == 0 scanned the entire string when the
// prefix was absent (O(n*m) for a false result).
bool starts_with(const std::string& str, const std::string& start) {
    return str.rfind(start, 0) == 0;
}
|
||||||
|
|
||||||
|
// Replace every occurrence of `target` with `replacement`, in place.
void replace_all_chars(std::string& str, char target, char replacement) {
    std::replace(str.begin(), str.end(), target, replacement);
}
|
||||||
|
|
||||||
|
// printf-style formatting into a std::string.
// Measures the needed size with a NULL-buffer vsnprintf, then formats into
// an exactly-sized buffer using a va_copy of the argument list.
// Fix: vsnprintf returns a negative value on an encoding error; the original
// then constructed std::string(buf.data(), -1), which is undefined behavior.
// We now return an empty string in that case.
std::string format(const char* fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    va_list ap2;
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    va_end(ap);
    if (size < 0) {
        va_end(ap2);
        return std::string();
    }
    std::vector<char> buf(size + 1);
    vsnprintf(buf.data(), buf.size(), fmt, ap2);
    va_end(ap2);
    return std::string(buf.data(), size);
}
|
||||||
|
|
||||||
|
#ifdef _WIN32 // code for windows
|
||||||
|
#include <windows.h>
|
||||||
|
|
||||||
|
// True when `filename` names an existing regular (non-directory) entry.
bool file_exists(const std::string& filename) {
    DWORD attributes = GetFileAttributesA(filename.c_str());
    return (attributes != INVALID_FILE_ATTRIBUTES && !(attributes & FILE_ATTRIBUTE_DIRECTORY));
}

// True when `path` names an existing directory.
bool is_directory(const std::string& path) {
    DWORD attributes = GetFileAttributesA(path.c_str());
    return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
}

// Return "dir\filename" if such an entry exists (FindFirstFile match —
// case-insensitive on Windows), else the empty string.
std::string get_full_path(const std::string& dir, const std::string& filename) {
    std::string full_path = dir + "\\" + filename;

    WIN32_FIND_DATA find_file_data;
    HANDLE hFind = FindFirstFile(full_path.c_str(), &find_file_data);

    if (hFind != INVALID_HANDLE_VALUE) {
        FindClose(hFind);
        return full_path;
    } else {
        return "";
    }
}
|
||||||
|
|
||||||
|
#else // Unix
|
||||||
|
#include <dirent.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
|
||||||
|
// True when `filename` exists and is a regular file (not a directory,
// socket, device, etc.).
bool file_exists(const std::string& filename) {
    struct stat info;
    if (stat(filename.c_str(), &info) != 0) {
        return false;  // missing or inaccessible
    }
    return S_ISREG(info.st_mode) != 0;
}
|
||||||
|
|
||||||
|
// True when `path` exists and is a directory.
bool is_directory(const std::string& path) {
    struct stat info;
    if (stat(path.c_str(), &info) != 0) {
        return false;  // missing or inaccessible
    }
    return S_ISDIR(info.st_mode) != 0;
}
|
||||||
|
|
||||||
|
// Scan `dir` for an entry whose name matches `filename` case-insensitively
// (strcasecmp) and return "dir/<actual name>"; empty string when the
// directory can't be opened or no entry matches. Mirrors the Windows
// version's case-insensitive lookup on case-sensitive filesystems.
std::string get_full_path(const std::string& dir, const std::string& filename) {
    DIR* dp = opendir(dir.c_str());

    if (dp != nullptr) {
        struct dirent* entry;

        while ((entry = readdir(dp)) != nullptr) {
            if (strcasecmp(entry->d_name, filename.c_str()) == 0) {
                closedir(dp);
                return dir + "/" + entry->d_name;
            }
        }

        closedir(dp);
    }

    return "";
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// get_num_physical_cores is copy from
|
||||||
|
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
|
||||||
|
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
|
||||||
|
// Best-effort physical core count.
//   Linux: count distinct thread-sibling sets under sysfs (one per core).
//   macOS: sysctl hw.perflevel0.physicalcpu, then hw.physicalcpu.
//   Fallback: derive from hardware_concurrency() (halved above 4 to
//   approximate physical cores on SMT machines), or 4 if unknown.
int32_t get_num_physical_cores() {
#ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break;  // no more cpus
        }
        std::string line;
        if (std::getline(thread_siblings, line)) {
            siblings.insert(line);
        }
    }
    if (siblings.size() > 0) {
        return static_cast<int32_t>(siblings.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
    size_t len = sizeof(num_physical_cores);
    // Prefer performance cores on Apple Silicon.
    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
    if (result == 0) {
        return num_physical_cores;
    }
#elif defined(_WIN32)
    // TODO: Implement
#endif
    unsigned int n_threads = std::thread::hardware_concurrency();
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
|
||||||
|
|
||||||
|
// Decode a UTF-8 byte string into UTF-32 code points.
// NOTE(review): std::wstring_convert/std::codecvt_utf8 are deprecated since
// C++17 (still functional); a replacement would need a hand-rolled decoder.
std::u32string utf8_to_utf32(const std::string& utf8_str) {
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return converter.from_bytes(utf8_str);
}

// Encode a UTF-32 code-point string back into UTF-8 bytes (inverse of
// utf8_to_utf32; same deprecation caveat applies).
std::string utf32_to_utf8(const std::u32string& utf32_str) {
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
    return converter.to_bytes(utf32_str);
}
|
||||||
|
|
||||||
|
// Wrap a single Unicode code point in a one-character UTF-32 string.
std::u32string unicode_value_to_utf32(int unicode_value) {
    return std::u32string(1, static_cast<char32_t>(unicode_value));
}
|
||||||
|
|
||||||
|
// Return the final path component, treating both '/' and '\\' as separators.
// Fix: the original searched for the last '/' first and only fell back to
// '\\' when no '/' existed, so a mixed path like "a/b\\c" returned "b\\c"
// instead of "c". find_last_of("/\\") takes whichever separator is last.
static std::string sd_basename(const std::string& path) {
    size_t pos = path.find_last_of("/\\");
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    return path;
}
|
||||||
|
|
||||||
|
// Join two path fragments with '/'. An empty side yields the other side
// unchanged; no separator is inserted when p1 already ends in '/' or '\\'.
std::string path_join(const std::string& p1, const std::string& p2) {
    if (p1.empty()) {
        return p2;
    }
    if (p2.empty()) {
        return p1;
    }
    char tail = p1.back();
    if (tail == '/' || tail == '\\') {
        return p1 + p2;
    }
    return p1 + "/" + p2;
}
|
||||||
|
|
||||||
|
// Draw a one-line console progress bar (50 chars wide) plus a rate.
// `time` is seconds per iteration: values > 1s print as s/it, otherwise the
// reciprocal prints as it/s. step == 0 draws nothing; the final step appends
// a newline so later output starts on a fresh line.
// Fix: the render loop hard-coded 50 instead of using max_progress, so
// changing the bar width constant would have desynchronized the two.
void pretty_progress(int step, int steps, float time) {
    if (step == 0) {
        return;
    }
    std::string progress = " |";
    int max_progress = 50;  // bar width in characters
    int32_t current = (int32_t)(step * 1.f * max_progress / steps);
    for (int i = 0; i < max_progress; i++) {
        if (i > current) {
            progress += " ";
        } else if (i == current && i != max_progress - 1) {
            progress += ">";
        } else {
            progress += "=";
        }
    }
    progress += "|";
    printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s",
           progress.c_str(), step, steps,
           time > 1.0f || time == 0 ? time : (1.0f / time));
    fflush(stdout);  // for linux: make the \r-rewritten line visible
    if (step == steps) {
        printf("\n");
    }
}
|
||||||
|
|
||||||
|
// Return a copy of `s` with leading whitespace removed.
// Fix: std::isspace has undefined behavior for values that are not
// representable as unsigned char; with a signed `char` holding e.g. UTF-8
// bytes >= 0x80, the original `int ch` lambda passed negative values.
// Taking the parameter as unsigned char makes the call well-defined.
std::string ltrim(const std::string& s) {
    auto it = std::find_if(s.begin(), s.end(), [](unsigned char ch) {
        return !std::isspace(ch);
    });
    return std::string(it, s.end());
}
|
||||||
|
|
||||||
|
// Return a copy of `s` with trailing whitespace removed.
// Fix: same std::isspace UB as ltrim — the predicate now takes
// unsigned char so negative char values are never passed through.
std::string rtrim(const std::string& s) {
    auto it = std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
        return !std::isspace(ch);
    });
    return std::string(s.begin(), it.base());
}
|
||||||
|
|
||||||
|
std::string trim(const std::string& s) {
|
||||||
|
return rtrim(ltrim(s));
|
||||||
|
}
|
||||||
|
|
||||||
|
static sd_log_cb_t sd_log_cb = NULL;
|
||||||
|
void* sd_log_cb_data = NULL;
|
||||||
|
|
||||||
|
#define LOG_BUFFER_SIZE 1024
|
||||||
|
|
||||||
|
// Format one log line as "[LEVEL] basename:line - message\n" into a shared
// static buffer and deliver it to the registered callback, if any.
// Messages longer than LOG_BUFFER_SIZE are truncated. NOTE(review): the
// static buffer makes this non-reentrant/not thread-safe — confirm callers.
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...) {
    va_list args;
    va_start(args, format);

    const char* level_str = "DEBUG";
    if (level == SD_LOG_INFO) {
        level_str = "INFO ";
    } else if (level == SD_LOG_WARN) {
        level_str = "WARN ";
    } else if (level == SD_LOG_ERROR) {
        level_str = "ERROR";
    }

    static char log_buffer[LOG_BUFFER_SIZE];

    // Prefix, then the user message, then a newline (all bounds-checked).
    int written = snprintf(log_buffer, LOG_BUFFER_SIZE, "[%s] %s:%-4d - ", level_str, sd_basename(file).c_str(), line);

    if (written >= 0 && written < LOG_BUFFER_SIZE) {
        vsnprintf(log_buffer + written, LOG_BUFFER_SIZE - written, format, args);
        strncat(log_buffer, "\n", LOG_BUFFER_SIZE - strlen(log_buffer) - 1);
    }

    if (sd_log_cb) {
        sd_log_cb(level, log_buffer, sd_log_cb_data);
    }

    va_end(args);
}
|
||||||
|
|
||||||
|
// Register the sink used by log_printf (NULL disables logging output);
// `data` is passed through to the callback unchanged.
void sd_set_log_callback(sd_log_cb_t cb, void* data) {
    sd_log_cb = cb;
    sd_log_cb_data = data;
}
|
||||||
|
|
||||||
|
// Build a human-readable summary of the CPU/SIMD features ggml was compiled
// with. Returns a pointer to a static buffer (overwritten on each call; not
// thread-safe) so the C API can hand it out without ownership transfer.
const char* sd_get_system_info() {
    static char buffer[1024];
    std::stringstream ss;
    ss << "System Info: \n";
    ss << " BLAS = " << ggml_cpu_has_blas() << std::endl;
    ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
    ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
    ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
    ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl;
    ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
    ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
    ss << " FMA = " << ggml_cpu_has_fma() << std::endl;
    ss << " NEON = " << ggml_cpu_has_neon() << std::endl;
    ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
    ss << " F16C = " << ggml_cpu_has_f16c() << std::endl;
    ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
    ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
    ss << " VSX = " << ggml_cpu_has_vsx() << std::endl;
    snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str());
    return buffer;
}
|
||||||
|
|
||||||
|
// Name of a public sd_type_t value. Relies on sd_type_t's enumerators
// mirroring ggml_type's values — presumably kept in sync by definition in
// stable-diffusion.h; confirm when either enum changes.
const char* sd_type_name(enum sd_type_t type) {
    return ggml_type_name((ggml_type)type);
}
|
||||||
|
|
||||||
|
// Convert an 8-bit image to a float image of the same dimensions. Pixel
// values are widened without rescaling (still in [0, 255]). The returned
// buffer is malloc'd and owned by the caller.
// NOTE(review): the malloc result is not checked; a failed allocation would
// crash in the copy loop below.
sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image) {
    sd_image_f32_t converted_image;
    converted_image.width = image.width;
    converted_image.height = image.height;
    converted_image.channel = image.channel;

    // Allocate memory for float data
    converted_image.data = (float*)malloc(image.width * image.height * image.channel * sizeof(float));

    for (int i = 0; i < image.width * image.height * image.channel; i++) {
        // Convert uint8_t to float
        converted_image.data[i] = (float)image.data[i];
    }

    return converted_image;
}
|
||||||
|
|
||||||
|
// Function to perform double linear interpolation
|
||||||
|
// Bilinear interpolation of the four corner samples:
// v1 = top-left, v2 = top-right, v3 = bottom-left, v4 = bottom-right;
// x_ratio / y_ratio in [0, 1] select the position inside the cell.
float interpolate(float v1, float v2, float v3, float v4, float x_ratio, float y_ratio) {
    float w_tl = (1 - x_ratio) * (1 - y_ratio);
    float w_tr = x_ratio * (1 - y_ratio);
    float w_bl = (1 - x_ratio) * y_ratio;
    float w_br = x_ratio * y_ratio;
    return v1 * w_tl + v2 * w_tr + v3 * w_bl + v4 * w_br;
}
|
||||||
|
|
||||||
|
// Bilinear resize of an interleaved-channel float image to
// target_width x target_height. Returns a newly malloc'd image owned by
// the caller.
// Fix: the second sample coordinates were unconditionally x1+1 / y1+1, which
// for the rightmost column and bottom row indexed one element past the image
// (out-of-bounds read). They are now clamped to the last valid row/column,
// which degenerates to nearest-edge sampling there.
sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height) {
    sd_image_f32_t resized_image;
    resized_image.width = target_width;
    resized_image.height = target_height;
    resized_image.channel = image.channel;

    // Allocate memory for resized float data
    resized_image.data = (float*)malloc(target_width * target_height * image.channel * sizeof(float));

    for (int y = 0; y < target_height; y++) {
        for (int x = 0; x < target_width; x++) {
            // Source-space position of this destination pixel.
            float original_x = (float)x * image.width / target_width;
            float original_y = (float)y * image.height / target_height;

            int x1 = (int)original_x;
            int y1 = (int)original_y;
            int x2 = std::min(x1 + 1, (int)image.width - 1);   // clamp: no OOB at right edge
            int y2 = std::min(y1 + 1, (int)image.height - 1);  // clamp: no OOB at bottom edge

            for (int k = 0; k < image.channel; k++) {
                float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
                float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
                float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
                float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);

                float x_ratio = original_x - x1;
                float y_ratio = original_y - y1;

                float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);

                *(resized_image.data + y * target_width * image.channel + x * image.channel + k) = value;
            }
        }
    }

    return resized_image;
}
|
||||||
|
|
||||||
|
// Standardize each channel in place: data[..] = (data[..] - mean[k]) / std[k].
// `means` and `stds` hold one value per channel (3 channels expected).
void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]) {
    size_t num_pixels = (size_t)image.width * image.height;
    for (size_t px = 0; px < num_pixels; px++) {
        for (uint32_t k = 0; k < image.channel; k++) {
            size_t index = px * image.channel + k;
            image.data[index] = (image.data[index] - means[k]) / stds[k];
        }
    }
}
|
||||||
|
|
||||||
|
// Constants for means and std
|
||||||
|
float means[3] = {0.48145466, 0.4578275, 0.40821073};
|
||||||
|
float stds[3] = {0.26862954, 0.26130258, 0.27577711};
|
||||||
|
|
||||||
|
// Function to clip and preprocess sd_image_f32_t
|
||||||
|
// CLIP-style preprocessing: scale so the short side equals `size`, bilinear
// resize, center-crop to size x size with values clipped to [0,255] and
// rescaled to [0,1], then per-channel normalization with the global
// `means`/`stds`. The returned buffer is malloc'd and owned by the caller.
// Fix: like resize_sd_image_f32_t, the second bilinear sample used x1+1 /
// y1+1 unconditionally, reading one element past the image at the right and
// bottom edges (out-of-bounds); coordinates are now clamped.
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size) {
    float scale = (float)size / fmin(image.width, image.height);

    // Interpolation
    int new_width = (int)(scale * image.width);
    int new_height = (int)(scale * image.height);
    float* resized_data = (float*)malloc(new_width * new_height * image.channel * sizeof(float));

    for (int y = 0; y < new_height; y++) {
        for (int x = 0; x < new_width; x++) {
            float original_x = (float)x * image.width / new_width;
            float original_y = (float)y * image.height / new_height;

            int x1 = (int)original_x;
            int y1 = (int)original_y;
            int x2 = std::min(x1 + 1, (int)image.width - 1);   // clamp: no OOB at right edge
            int y2 = std::min(y1 + 1, (int)image.height - 1);  // clamp: no OOB at bottom edge

            for (int k = 0; k < image.channel; k++) {
                float v1 = *(image.data + y1 * image.width * image.channel + x1 * image.channel + k);
                float v2 = *(image.data + y1 * image.width * image.channel + x2 * image.channel + k);
                float v3 = *(image.data + y2 * image.width * image.channel + x1 * image.channel + k);
                float v4 = *(image.data + y2 * image.width * image.channel + x2 * image.channel + k);

                float x_ratio = original_x - x1;
                float y_ratio = original_y - y1;

                float value = interpolate(v1, v2, v3, v4, x_ratio, y_ratio);

                *(resized_data + y * new_width * image.channel + x * image.channel + k) = value;
            }
        }
    }

    // Clip and preprocess: offsets of the centered size x size crop.
    // NOTE(review): float rounding could in principle make new_width/height
    // equal size - 1, giving negative offsets — confirm against callers.
    int h = (new_height - size) / 2;
    int w = (new_width - size) / 2;

    sd_image_f32_t result;
    result.width = size;
    result.height = size;
    result.channel = image.channel;
    result.data = (float*)malloc(size * size * image.channel * sizeof(float));

    for (int k = 0; k < image.channel; k++) {
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                // Clamp to [0, 255] then rescale to [0, 1].
                *(result.data + i * size * image.channel + j * image.channel + k) =
                    fmin(fmax(*(resized_data + (i + h) * new_width * image.channel + (j + w) * image.channel + k), 0.0f), 255.0f) / 255.0f;
            }
        }
    }

    // Free allocated memory
    free(resized_data);

    // Normalize with the CLIP channel statistics (file-level means/stds).
    for (int k = 0; k < image.channel; k++) {
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                int offset = i * size * image.channel + j * image.channel + k;
                float value = *(result.data + offset);
                value = (value - means[k]) / stds[k];
                *(result.data + offset) = value;
            }
        }
    }

    return result;
}
|
51
otherarch/sdcpp/util.h
Normal file
51
otherarch/sdcpp/util.h
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
#ifndef __UTIL_H__
#define __UTIL_H__

#include <cstdint>
#include <string>

#include "stable-diffusion.h"

// ---- string predicates -----------------------------------------------------
bool ends_with(const std::string& str, const std::string& ending);
bool starts_with(const std::string& str, const std::string& start);

// printf-style formatting into a std::string.
std::string format(const char* fmt, ...);

void replace_all_chars(std::string& str, char target, char replacement);

// ---- filesystem helpers (Windows and POSIX implementations) ----------------
bool file_exists(const std::string& filename);
bool is_directory(const std::string& path);
// Case-insensitive lookup of `filename` inside `dir`; "" when not found.
std::string get_full_path(const std::string& dir, const std::string& filename);

// ---- Unicode conversions ---------------------------------------------------
std::u32string utf8_to_utf32(const std::string& utf8_str);
std::string utf32_to_utf8(const std::u32string& utf32_str);
std::u32string unicode_value_to_utf32(int unicode_value);

// Float image with interleaved channels (values typically [0,255] or [0,1]
// depending on the producing function); `data` is malloc'd, caller-owned.
typedef struct {
    uint32_t width;
    uint32_t height;
    uint32_t channel;
    float* data;
} sd_image_f32_t;

void normalize_sd_image_f32_t(sd_image_f32_t image, float means[3], float stds[3]);

sd_image_f32_t sd_image_t_to_sd_image_f32_t(sd_image_t image);

sd_image_f32_t resize_sd_image_f32_t(sd_image_f32_t image, int target_width, int target_height);

// Resize + center-crop + normalize for CLIP image encoders.
sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size);

std::string path_join(const std::string& p1, const std::string& p2);

// Console progress bar; `time` is seconds per iteration.
void pretty_progress(int step, int steps, float time);

void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);

std::string trim(const std::string& s);

// Logging macros routing through log_printf with call-site file/line.
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) log_printf(SD_LOG_ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
#endif  // __UTIL_H__
|
613
otherarch/sdcpp/vae.hpp
Normal file
613
otherarch/sdcpp/vae.hpp
Normal file
|
@ -0,0 +1,613 @@
|
||||||
|
#ifndef __VAE_HPP__
|
||||||
|
#define __VAE_HPP__
|
||||||
|
|
||||||
|
#include "common.hpp"
|
||||||
|
#include "ggml_extend.hpp"
|
||||||
|
|
||||||
|
/*================================================== AutoEncoderKL ===================================================*/
|
||||||
|
|
||||||
|
#define VAE_GRAPH_SIZE 10240
|
||||||
|
|
||||||
|
// VAE residual block: two (GroupNorm -> swish -> 3x3 conv) stages with a
// skip connection; a 1x1 "nin_shortcut" conv projects the input when the
// channel count changes. Timestep-embedding conditioning is unused here.
class ResnetBlock : public UnaryBlock {
protected:
    int64_t in_channels;
    int64_t out_channels;

public:
    ResnetBlock(int64_t in_channels,
                int64_t out_channels)
        : in_channels(in_channels),
          out_channels(out_channels) {
        // temb_channels is always 0
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));

        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));

        if (out_channels != in_channels) {
            // 1x1 conv to match channels on the skip path.
            blocks["nin_shortcut"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {1, 1}));
        }
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]
        // t_emb is always None
        auto norm1 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm1"]);
        auto conv1 = std::dynamic_pointer_cast<Conv2d>(blocks["conv1"]);
        auto norm2 = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm2"]);
        auto conv2 = std::dynamic_pointer_cast<Conv2d>(blocks["conv2"]);

        auto h = x;
        h = norm1->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);  // swish
        h = conv1->forward(ctx, h);
        // return h;

        h = norm2->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);  // swish
        // dropout, skip for inference
        h = conv2->forward(ctx, h);

        // skip connection
        if (out_channels != in_channels) {
            auto nin_shortcut = std::dynamic_pointer_cast<Conv2d>(blocks["nin_shortcut"]);

            x = nin_shortcut->forward(ctx, x);  // [N, out_channels, h, w]
        }

        h = ggml_add(ctx, h, x);
        return h;  // [N, out_channels, h, w]
    }
};
|
||||||
|
|
||||||
|
// VAE spatial self-attention block: 1x1 convs produce q/k/v, attention runs
// over the h*w spatial positions, and a 1x1 projection plus residual add
// produce the output. Shapes are noted per line in ggml's reversed-ne order.
class AttnBlock : public UnaryBlock {
protected:
    int64_t in_channels;

public:
    AttnBlock(int64_t in_channels)
        : in_channels(in_channels) {
        blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
        blocks["q"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
        blocks["k"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
        blocks["v"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));

        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]
        auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
        auto q_proj = std::dynamic_pointer_cast<Conv2d>(blocks["q"]);
        auto k_proj = std::dynamic_pointer_cast<Conv2d>(blocks["k"]);
        auto v_proj = std::dynamic_pointer_cast<Conv2d>(blocks["v"]);
        auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);

        auto h_ = norm->forward(ctx, x);

        // ggml stores dims reversed: ne = {w, h, c, n}.
        const int64_t n = h_->ne[3];
        const int64_t c = h_->ne[2];
        const int64_t h = h_->ne[1];
        const int64_t w = h_->ne[0];

        auto q = q_proj->forward(ctx, h_);                      // [N, in_channels, h, w]
        q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3));  // [N, h, w, in_channels]
        q = ggml_reshape_3d(ctx, q, c, h * w, n);              // [N, h * w, in_channels]

        auto k = k_proj->forward(ctx, h_);                      // [N, in_channels, h, w]
        k = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3));  // [N, h, w, in_channels]
        k = ggml_reshape_3d(ctx, k, c, h * w, n);              // [N, h * w, in_channels]

        auto v = v_proj->forward(ctx, h_);          // [N, in_channels, h, w]
        v = ggml_reshape_3d(ctx, v, h * w, c, n);  // [N, in_channels, h * w]

        h_ = ggml_nn_attention(ctx, q, k, v, false);  // [N, h * w, in_channels]

        h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3));  // [N, in_channels, h * w]
        h_ = ggml_reshape_4d(ctx, h_, w, h, c, n);               // [N, in_channels, h, w]

        h_ = proj_out->forward(ctx, h_);  // [N, in_channels, h, w]

        // residual connection
        h_ = ggml_add(ctx, h_, x);
        return h_;
    }
};
|
||||||
|
|
||||||
|
// Spatial 2D conv followed by a temporal nx1x1 conv over the frame axis,
// used by the video VAE decoder (SVD). Extends Conv2d with a
// "time_mix_conv" that mixes information across video frames.
class AE3DConv : public Conv2d {
public:
    AE3DConv(int64_t in_channels,
             int64_t out_channels,
             std::pair<int, int> kernel_size,
             int64_t video_kernel_size       = 3,
             std::pair<int, int> stride      = {1, 1},
             std::pair<int, int> padding     = {0, 0},
             std::pair<int, int> dilation    = {1, 1},
             bool bias                       = true)
        : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
        // "same" padding along the temporal axis for odd video_kernel_size.
        int64_t kernel_padding  = video_kernel_size / 2;
        blocks["time_mix_conv"] = std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(out_channels,
                                                                             out_channels,
                                                                             video_kernel_size,
                                                                             1,
                                                                             kernel_padding));
    }

    // x: [N, IC, IH, IW]  ->  result: [N, OC, OH, OW]
    // (N is interpreted as batch * time for the temporal mixing step.)
    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x) {
        // timesteps always None
        // skip_video always False
        // x: [N, IC, IH, IW]
        // result: [N, OC, OH, OW]
        auto time_mix_conv = std::dynamic_pointer_cast<Conv3dnx1x1>(blocks["time_mix_conv"]);

        // Plain spatial convolution first.
        x = Conv2d::forward(ctx, x);
        // Python reference:
        // timesteps = x.shape[0]
        // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
        // x = conv3d(x)
        // return rearrange(x, "b c t h w -> (b t) c h w")
        // NOTE(review): T is taken as the whole leading dim, so B is always 1
        // here — i.e. the entire batch is treated as one video clip. Confirm
        // this matches how callers pack frames before changing it.
        int64_t T = x->ne[3];
        int64_t B = x->ne[3] / T;
        int64_t C = x->ne[2];
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

        // Fold the spatial dims and move time next to them so the nx1x1 conv
        // slides along the frame axis.
        x = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        x = time_mix_conv->forward(ctx, x);                    // [B, OC, T, OH * OW]
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w
        return x;  // [B*T, OC, OH, OW]
    }
};
|
||||||
|
|
||||||
|
// ResnetBlock variant for the video VAE decoder: runs the plain spatial
// ResnetBlock, then a temporal ResBlock ("time_stack"), and blends the two
// with a learned sigmoid-gated mix factor (merge_strategy = "learned").
class VideoResnetBlock : public ResnetBlock {
protected:
    // Registers the learnable scalar mix factor.
    // NOTE(review): this overrides init_params without visibly invoking the
    // parent's version — confirm the base class params are still initialized
    // elsewhere.
    void init_params(struct ggml_context* ctx, ggml_type wtype) {
        // Stored as F32 regardless of wtype: it is a single scalar gate.
        params["mix_factor"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    }

    // Reads the scalar mix factor from the backend and squashes it to (0, 1).
    float get_alpha() {
        float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
        return sigmoid(alpha);
    }

public:
    VideoResnetBlock(int64_t in_channels,
                     int64_t out_channels,
                     int video_kernel_size = 3)
        : ResnetBlock(in_channels, out_channels) {
        // merge_strategy is always learned
        // Temporal ResBlock: kernel {video_kernel_size, 1}, dims=3, no
        // up/down sampling, operating along the frame axis.
        blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w] aka [b*t, in_channels, h, w]
        // return: [N, out_channels, h, w] aka [b*t, out_channels, h, w]
        // t_emb is always None
        // skip_video is always False
        // timesteps is always None
        auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);

        // Spatial path first.
        x = ResnetBlock::forward(ctx, x);  // [N, out_channels, h, w]

        // NOTE(review): as in AE3DConv, T spans the whole leading dim so B is
        // always 1 — the batch is treated as a single clip.
        int64_t T = x->ne[3];
        int64_t B = x->ne[3] / T;
        int64_t C = x->ne[2];
        int64_t H = x->ne[1];
        int64_t W = x->ne[0];

        // Rearrange so the temporal ResBlock sees time as a spatial-like axis.
        x = ggml_reshape_4d(ctx, x, W * H, C, T, B);           // (b t) c h w -> b t c (h w)
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
        auto x_mix = x;  // spatial-only result, kept for the blend below

        x = time_stack->forward(ctx, x);  // b t c (h w)

        // Learned convex blend: alpha * temporal + (1 - alpha) * spatial.
        float alpha = get_alpha();
        x = ggml_add(ctx,
                     ggml_scale(ctx, x, alpha),
                     ggml_scale(ctx, x_mix, 1.0f - alpha));

        // Restore the (b t) c h w layout.
        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
        x = ggml_reshape_4d(ctx, x, W, H, C, T * B);           // b t c (h w) -> (b t) c h w

        return x;
    }
};
|
||||||
|
|
||||||
|
// ldm.modules.diffusionmodules.model.Encoder
// VAE encoder: conv_in -> per-resolution ResnetBlocks with downsampling ->
// mid (resnet / attention / resnet) -> norm + swish + conv_out.
// Block names ("down.i.block.j", "mid.block_1", ...) mirror the ldm
// checkpoint layout so weights map 1:1; the lookup strings in forward()
// must stay in sync with the constructor.
class Encoder : public GGMLBlock {
protected:
    int ch                   = 128;
    std::vector<int> ch_mult = {1, 2, 4, 4};  // channel multiplier per resolution level
    int num_res_blocks       = 2;
    int in_channels          = 3;
    int z_channels           = 4;
    bool double_z            = true;  // emit 2*z_channels (mean + logvar) when true

public:
    Encoder(int ch,
            std::vector<int> ch_mult,
            int num_res_blocks,
            int in_channels,
            int z_channels,
            bool double_z = true)
        : ch(ch),
          ch_mult(ch_mult),
          num_res_blocks(num_res_blocks),
          in_channels(in_channels),
          z_channels(z_channels),
          double_z(double_z) {
        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}));

        size_t num_resolutions = ch_mult.size();

        // Downsampling trunk: num_res_blocks ResnetBlocks per level, then a
        // DownSampleBlock between levels (none after the last level).
        int block_in = 1;
        for (int i = 0; i < num_resolutions; i++) {
            if (i == 0) {
                block_in = ch;
            } else {
                block_in = ch * ch_mult[i - 1];
            }
            int block_out = ch * ch_mult[i];
            for (int j = 0; j < num_res_blocks; j++) {
                std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
                blocks[name]     = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_out));
                block_in         = block_out;
            }
            if (i != num_resolutions - 1) {
                std::string name = "down." + std::to_string(i) + ".downsample";
                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(block_in, block_in, true));
            }
        }

        // Middle stack at the lowest resolution.
        blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
        blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));

        blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
        blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
    }

    // x: [N, in_channels, h, w]
    // returns: [N, z_channels*2, h/2^(levels-1), w/2^(levels-1)] when
    // double_z, else z_channels.
    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]

        auto conv_in     = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
        auto mid_block_1 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_1"]);
        auto mid_attn_1  = std::dynamic_pointer_cast<AttnBlock>(blocks["mid.attn_1"]);
        auto mid_block_2 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_2"]);
        auto norm_out    = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm_out"]);
        auto conv_out    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);

        auto h = conv_in->forward(ctx, x);  // [N, ch, h, w]

        // downsampling
        size_t num_resolutions = ch_mult.size();
        for (int i = 0; i < num_resolutions; i++) {
            for (int j = 0; j < num_res_blocks; j++) {
                std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
                auto down_block  = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);

                h = down_block->forward(ctx, h);
            }
            if (i != num_resolutions - 1) {
                std::string name = "down." + std::to_string(i) + ".downsample";
                auto down_sample = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);

                h = down_sample->forward(ctx, h);
            }
        }

        // middle
        h = mid_block_1->forward(ctx, h);
        h = mid_attn_1->forward(ctx, h);
        h = mid_block_2->forward(ctx, h);  // [N, block_in, h, w]

        // end
        h = norm_out->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);  // nonlinearity/swish
        h = conv_out->forward(ctx, h);  // [N, z_channels*2, h, w]
        return h;
    }
};
|
||||||
|
|
||||||
|
// ldm.modules.diffusionmodules.model.Decoder
// VAE decoder: conv_in -> mid (resnet / attention / resnet) -> per-resolution
// ResnetBlocks with upsampling -> norm + swish + conv_out.
// When video_decoder is true, the resnet blocks become VideoResnetBlocks and
// conv_out becomes an AE3DConv (SVD "conv-only" time mode).
class Decoder : public GGMLBlock {
protected:
    int ch                   = 128;
    int out_ch               = 3;
    std::vector<int> ch_mult = {1, 2, 4, 4};
    int num_res_blocks       = 2;
    int z_channels           = 4;
    bool video_decoder       = false;
    int video_kernel_size    = 3;

    // Factory for the final conv: AE3DConv for video, plain Conv2d otherwise.
    virtual std::shared_ptr<GGMLBlock> get_conv_out(int64_t in_channels,
                                                    int64_t out_channels,
                                                    std::pair<int, int> kernel_size,
                                                    std::pair<int, int> stride  = {1, 1},
                                                    std::pair<int, int> padding = {0, 0}) {
        if (video_decoder) {
            return std::shared_ptr<GGMLBlock>(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding));
        } else {
            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, stride, padding));
        }
    }

    // Factory for resnet blocks: VideoResnetBlock for video, else ResnetBlock.
    virtual std::shared_ptr<GGMLBlock> get_resnet_block(int64_t in_channels,
                                                        int64_t out_channels) {
        if (video_decoder) {
            return std::shared_ptr<GGMLBlock>(new VideoResnetBlock(in_channels, out_channels, video_kernel_size));
        } else {
            return std::shared_ptr<GGMLBlock>(new ResnetBlock(in_channels, out_channels));
        }
    }

public:
    Decoder(int ch,
            int out_ch,
            std::vector<int> ch_mult,
            int num_res_blocks,
            int z_channels,
            bool video_decoder    = false,
            int video_kernel_size = 3)
        : ch(ch),
          out_ch(out_ch),
          ch_mult(ch_mult),
          num_res_blocks(num_res_blocks),
          z_channels(z_channels),
          video_decoder(video_decoder),
          video_kernel_size(video_kernel_size) {
        size_t num_resolutions = ch_mult.size();
        // Decoding starts at the widest channel count (mirror of the encoder).
        int block_in           = ch * ch_mult[num_resolutions - 1];

        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));

        blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
        blocks["mid.block_2"] = get_resnet_block(block_in, block_in);

        // Upsampling trunk, built from the deepest level up. Note the decoder
        // uses num_res_blocks + 1 blocks per level (one more than the encoder).
        for (int i = num_resolutions - 1; i >= 0; i--) {
            int mult      = ch_mult[i];
            int block_out = ch * mult;
            for (int j = 0; j < num_res_blocks + 1; j++) {
                std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
                blocks[name]     = get_resnet_block(block_in, block_out);

                block_in = block_out;
            }
            if (i != 0) {
                std::string name = "up." + std::to_string(i) + ".upsample";
                blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(block_in, block_in));
            }
        }

        blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
        blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
    }

    // z: [N, z_channels, h, w] -> [N, out_ch, h*8, w*8] (for the default
    // 4-level ch_mult).
    virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
        // z: [N, z_channels, h, w]
        // alpha is always 0
        // merge_strategy is always learned
        // time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock
        // AttnVideoBlock will not be used
        // Casts to the base types are fine here: VideoResnetBlock derives
        // from ResnetBlock and AE3DConv from Conv2d, and forward() is the
        // only member used.
        auto conv_in     = std::dynamic_pointer_cast<Conv2d>(blocks["conv_in"]);
        auto mid_block_1 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_1"]);
        auto mid_attn_1  = std::dynamic_pointer_cast<AttnBlock>(blocks["mid.attn_1"]);
        auto mid_block_2 = std::dynamic_pointer_cast<ResnetBlock>(blocks["mid.block_2"]);
        auto norm_out    = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm_out"]);
        auto conv_out    = std::dynamic_pointer_cast<Conv2d>(blocks["conv_out"]);

        // conv_in
        auto h = conv_in->forward(ctx, z);  // [N, block_in, h, w]

        // middle
        h = mid_block_1->forward(ctx, h);

        h = mid_attn_1->forward(ctx, h);
        h = mid_block_2->forward(ctx, h);  // [N, block_in, h, w]

        // upsampling
        size_t num_resolutions = ch_mult.size();
        for (int i = num_resolutions - 1; i >= 0; i--) {
            for (int j = 0; j < num_res_blocks + 1; j++) {
                std::string name = "up." + std::to_string(i) + ".block." + std::to_string(j);
                auto up_block    = std::dynamic_pointer_cast<ResnetBlock>(blocks[name]);

                h = up_block->forward(ctx, h);
            }
            if (i != 0) {
                std::string name = "up." + std::to_string(i) + ".upsample";
                auto up_sample   = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);

                h = up_sample->forward(ctx, h);
            }
        }

        h = norm_out->forward(ctx, h);
        h = ggml_silu_inplace(ctx, h);  // nonlinearity/swish
        h = conv_out->forward(ctx, h);  // [N, out_ch, h*8, w*8]
        return h;
    }
};
|
||||||
|
|
||||||
|
// ldm.models.autoencoder.AutoencoderKL
|
||||||
|
// ldm.models.autoencoder.AutoencoderKL
// Bundles the Encoder/Decoder with the 1x1 quant/post-quant convolutions.
// decode_only skips building the encoder; use_video_decoder selects the SVD
// video decoder and drops the quant convs entirely.
class AutoencodingEngine : public GGMLBlock {
protected:
    bool decode_only       = true;
    bool use_video_decoder = false;
    int embed_dim          = 4;
    // Mirrors the ldm "ddconfig" dict; resolution/in_channels are kept for
    // completeness even though forward paths here only consume a subset.
    struct {
        int z_channels           = 4;
        int resolution           = 256;
        int in_channels          = 3;
        int out_ch               = 3;
        int ch                   = 128;
        std::vector<int> ch_mult = {1, 2, 4, 4};
        int num_res_blocks       = 2;
        bool double_z            = true;
    } dd_config;

public:
    AutoencodingEngine(bool decode_only       = true,
                       bool use_video_decoder = false)
        : decode_only(decode_only), use_video_decoder(use_video_decoder) {
        blocks["decoder"] = std::shared_ptr<GGMLBlock>(new Decoder(dd_config.ch,
                                                                   dd_config.out_ch,
                                                                   dd_config.ch_mult,
                                                                   dd_config.num_res_blocks,
                                                                   dd_config.z_channels,
                                                                   use_video_decoder));
        if (!use_video_decoder) {
            // NOTE(review): ldm defines post_quant_conv as Conv2d(embed_dim,
            // z_channels); the arguments here look swapped. It is harmless
            // only because embed_dim == z_channels == 4 — confirm before
            // changing either constant.
            blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
                                                                              embed_dim,
                                                                              {1, 1}));
        }
        if (!decode_only) {
            blocks["encoder"] = std::shared_ptr<GGMLBlock>(new Encoder(dd_config.ch,
                                                                       dd_config.ch_mult,
                                                                       dd_config.num_res_blocks,
                                                                       dd_config.in_channels,
                                                                       dd_config.z_channels,
                                                                       dd_config.double_z));
            if (!use_video_decoder) {
                // double_z doubles the encoder output (mean + logvar).
                int factor = dd_config.double_z ? 2 : 1;

                // NOTE(review): same apparent in/out swap as post_quant_conv
                // (ldm uses Conv2d(2*z_channels, 2*embed_dim)); harmless while
                // embed_dim == z_channels.
                blocks["quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(embed_dim * factor,
                                                                             dd_config.z_channels * factor,
                                                                             {1, 1}));
            }
        }
    }

    // z: [N, z_channels, h, w] -> decoded image tensor.
    // "bench-start"/"bench-end" tensor names bracket the decoder for timing.
    struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) {
        // z: [N, z_channels, h, w]
        if (!use_video_decoder) {
            auto post_quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["post_quant_conv"]);
            z = post_quant_conv->forward(ctx, z);  // [N, z_channels, h, w]
        }
        auto decoder = std::dynamic_pointer_cast<Decoder>(blocks["decoder"]);

        ggml_set_name(z, "bench-start");
        auto h = decoder->forward(ctx, z);
        ggml_set_name(h, "bench-end");
        return h;
    }

    // x: [N, in_channels, h, w] -> latent moments [N, 2*embed_dim, h/8, w/8].
    struct ggml_tensor* encode(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, in_channels, h, w]
        auto encoder = std::dynamic_pointer_cast<Encoder>(blocks["encoder"]);

        auto h = encoder->forward(ctx, x);  // [N, 2*z_channels, h/8, w/8]
        if (!use_video_decoder) {
            auto quant_conv = std::dynamic_pointer_cast<Conv2d>(blocks["quant_conv"]);
            h = quant_conv->forward(ctx, h);  // [N, 2*embed_dim, h/8, w/8]
        }
        return h;
    }
};
|
||||||
|
|
||||||
|
struct AutoEncoderKL : public GGMLModule {
|
||||||
|
bool decode_only = true;
|
||||||
|
AutoencodingEngine ae;
|
||||||
|
|
||||||
|
AutoEncoderKL(ggml_backend_t backend,
|
||||||
|
ggml_type wtype,
|
||||||
|
bool decode_only = false,
|
||||||
|
bool use_video_decoder = false)
|
||||||
|
: decode_only(decode_only), ae(decode_only, use_video_decoder), GGMLModule(backend, wtype) {
|
||||||
|
ae.init(params_ctx, wtype);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string get_desc() {
|
||||||
|
return "vae";
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_mem_size() {
|
||||||
|
return ae.get_params_mem_size();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t get_params_num() {
|
||||||
|
return ae.get_params_num();
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
|
||||||
|
ae.get_param_tensors(tensors, prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
|
||||||
|
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
|
||||||
|
|
||||||
|
z = to_backend(z);
|
||||||
|
|
||||||
|
struct ggml_tensor* out = decode_graph ? ae.decode(compute_ctx, z) : ae.encode(compute_ctx, z);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, out);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute(const int n_threads,
|
||||||
|
struct ggml_tensor* z,
|
||||||
|
bool decode_graph,
|
||||||
|
struct ggml_tensor** output,
|
||||||
|
struct ggml_context* output_ctx = NULL) {
|
||||||
|
auto get_graph = [&]() -> struct ggml_cgraph* {
|
||||||
|
return build_graph(z, decode_graph);
|
||||||
|
};
|
||||||
|
// ggml_set_f32(z, 0.5f);
|
||||||
|
// print_ggml_tensor(z);
|
||||||
|
GGMLModule::compute(get_graph, n_threads, true, output, output_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
void test() {
|
||||||
|
struct ggml_init_params params;
|
||||||
|
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
|
||||||
|
params.mem_buffer = NULL;
|
||||||
|
params.no_alloc = false;
|
||||||
|
|
||||||
|
struct ggml_context* work_ctx = ggml_init(params);
|
||||||
|
GGML_ASSERT(work_ctx != NULL);
|
||||||
|
|
||||||
|
{
|
||||||
|
// CPU, x{1, 3, 64, 64}: Pass
|
||||||
|
// CUDA, x{1, 3, 64, 64}: Pass, but sill get wrong result for some image, may be due to interlnal nan
|
||||||
|
// CPU, x{2, 3, 64, 64}: Wrong result
|
||||||
|
// CUDA, x{2, 3, 64, 64}: Wrong result, and different from CPU result
|
||||||
|
auto x = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 64, 64, 3, 2);
|
||||||
|
ggml_set_f32(x, 0.5f);
|
||||||
|
print_ggml_tensor(x);
|
||||||
|
struct ggml_tensor* out = NULL;
|
||||||
|
|
||||||
|
int t0 = ggml_time_ms();
|
||||||
|
compute(8, x, false, &out, work_ctx);
|
||||||
|
int t1 = ggml_time_ms();
|
||||||
|
|
||||||
|
print_ggml_tensor(out);
|
||||||
|
LOG_DEBUG("encode test done in %dms", t1 - t0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (false) {
|
||||||
|
// CPU, z{1, 4, 8, 8}: Pass
|
||||||
|
// CUDA, z{1, 4, 8, 8}: Pass
|
||||||
|
// CPU, z{3, 4, 8, 8}: Wrong result
|
||||||
|
// CUDA, z{3, 4, 8, 8}: Wrong result, and different from CPU result
|
||||||
|
auto z = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
|
||||||
|
ggml_set_f32(z, 0.5f);
|
||||||
|
print_ggml_tensor(z);
|
||||||
|
struct ggml_tensor* out = NULL;
|
||||||
|
|
||||||
|
int t0 = ggml_time_ms();
|
||||||
|
compute(8, z, true, &out, work_ctx);
|
||||||
|
int t1 = ggml_time_ms();
|
||||||
|
|
||||||
|
print_ggml_tensor(out);
|
||||||
|
LOG_DEBUG("decode test done in %dms", t1 - t0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
524621
otherarch/sdcpp/vocab.hpp
Normal file
524621
otherarch/sdcpp/vocab.hpp
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue