mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Added vulkan support for SD (+1 squashed commits)
Squashed commits: [13f42f83] Added vulkan support for SD
This commit is contained in:
parent
101efb66af
commit
3a72410804
35 changed files with 102702 additions and 98774 deletions
13
Makefile
13
Makefile
|
@ -42,8 +42,8 @@ endif
|
||||||
#
|
#
|
||||||
|
|
||||||
# keep standard at C11 and C++11
|
# keep standard at C11 and C++11
|
||||||
CFLAGS = -I. -Iggml/include -Iggml/src -Iinclude -Isrc -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE
|
CFLAGS = -I. -Iggml/include -Iggml/src -Iinclude -Isrc -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -fmath-errno -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE
|
||||||
CXXFLAGS = -I. -Iggml/include -Iggml/src -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE
|
CXXFLAGS = -I. -Iggml/include -Iggml/src -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -fmath-errno -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_LLAMAFILE
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
|
FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS))
|
||||||
FASTCXXFLAGS = $(subst -O3,-Ofast,$(CXXFLAGS))
|
FASTCXXFLAGS = $(subst -O3,-Ofast,$(CXXFLAGS))
|
||||||
|
@ -56,7 +56,7 @@ NONECFLAGS =
|
||||||
OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -DGGML_USE_BLAS -I/usr/local/include/openblas
|
OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -DGGML_USE_BLAS -I/usr/local/include/openblas
|
||||||
CLBLAST_FLAGS = -DGGML_USE_CLBLAST
|
CLBLAST_FLAGS = -DGGML_USE_CLBLAST
|
||||||
FAILSAFE_FLAGS = -DUSE_FAILSAFE
|
FAILSAFE_FLAGS = -DUSE_FAILSAFE
|
||||||
VULKAN_FLAGS = -DGGML_USE_VULKAN
|
VULKAN_FLAGS = -DGGML_USE_VULKAN -DSD_USE_VULKAN
|
||||||
ifdef LLAMA_CUBLAS
|
ifdef LLAMA_CUBLAS
|
||||||
CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS
|
CUBLAS_FLAGS = -DGGML_USE_CUDA -DSD_USE_CUBLAS
|
||||||
else
|
else
|
||||||
|
@ -535,6 +535,9 @@ sdcpp_default.o: otherarch/sdcpp/sdtype_adapter.cpp otherarch/sdcpp/stable-diffu
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
sdcpp_cublas.o: otherarch/sdcpp/sdtype_adapter.cpp otherarch/sdcpp/stable-diffusion.h otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/util.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c
|
sdcpp_cublas.o: otherarch/sdcpp/sdtype_adapter.cpp otherarch/sdcpp/stable-diffusion.h otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/util.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/thirdparty/zip.c
|
||||||
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
|
||||||
|
# Stable-diffusion adapter object compiled with the Vulkan defines from
# VULKAN_FLAGS. Only the first prerequisite ($<) is passed to the compiler;
# the remaining files are listed so that changing them triggers a rebuild.
sdcpp_vulkan.o: otherarch/sdcpp/sdtype_adapter.cpp \
		otherarch/sdcpp/stable-diffusion.h \
		otherarch/sdcpp/stable-diffusion.cpp \
		otherarch/sdcpp/util.cpp \
		otherarch/sdcpp/upscaler.cpp \
		otherarch/sdcpp/model.cpp \
		otherarch/sdcpp/thirdparty/zip.c
	$(CXX) $(CXXFLAGS) $(VULKAN_FLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
|
||||||
#whisper objects
|
#whisper objects
|
||||||
whispercpp_default.o: otherarch/whispercpp/whisper_adapter.cpp
|
whispercpp_default.o: otherarch/whispercpp/whisper_adapter.cpp
|
||||||
|
@ -649,10 +652,10 @@ koboldcpp_hipblas:
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef VULKAN_BUILD
|
ifdef VULKAN_BUILD
|
||||||
koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_vulkan.o $(OBJS_FULL) $(OBJS)
|
koboldcpp_vulkan: ggml_v4_vulkan.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter_vulkan.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_vulkan.o $(OBJS_FULL) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
ifdef NOAVX2_BUILD
|
ifdef NOAVX2_BUILD
|
||||||
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_vulkan.o $(OBJS_SIMPLE) $(OBJS)
|
koboldcpp_vulkan_noavx2: ggml_v4_vulkan_noavx2.o ggml_v3_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o gpttype_adapter_vulkan_noavx2.o ggml-vulkan.o sdcpp_vulkan.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_vulkan.o $(OBJS_SIMPLE) $(OBJS)
|
||||||
$(VULKAN_BUILD)
|
$(VULKAN_BUILD)
|
||||||
else
|
else
|
||||||
koboldcpp_vulkan_noavx2:
|
koboldcpp_vulkan_noavx2:
|
||||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
5
ggml/src/vulkan-shaders/CMakeLists.txt
Normal file
5
ggml/src/vulkan-shaders/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
|
||||||
|
# Build rules for the vulkan-shaders-gen executable (single translation unit).
set(TARGET vulkan-shaders-gen)

add_executable(${TARGET}
    vulkan-shaders-gen.cpp
)

# Require at least C++11 when compiling the generator itself.
target_compile_features(${TARGET} PRIVATE cxx_std_11)

# Install the binary into the default runtime destination.
install(TARGETS ${TARGET} RUNTIME)
|
|
@ -4,9 +4,11 @@
|
||||||
#include "generic_binary_head.comp"
|
#include "generic_binary_head.comp"
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
if (gl_GlobalInvocationID.x >= p.ne) {
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
if (idx >= p.ne) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
|
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,10 +4,12 @@
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
if (gl_GlobalInvocationID.x >= p.ne) {
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
if (idx >= p.ne) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
|
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
||||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
|
||||||
}
|
}
|
||||||
|
|
31
ggml/src/vulkan-shaders/concat.comp
Normal file
31
ggml/src/vulkan-shaders/concat.comp
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
#include "types.comp"
|
||||||
|
#include "generic_binary_head.comp"
|
||||||
|
|
||||||
|
// Concatenates data_a and data_b along dimension p.param3.
// Each invocation writes exactly one destination element.
void main() {
    // Linear element index reconstructed from the 3D invocation id
    // (x counts single elements, y steps by 512, z steps by 512 * 512).
    const uint elem = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    const int dim = p.param3;

    if (elem >= p.ne) {
        return;
    }

    // Split the linear index into 4D destination coordinates (i0 fastest).
    const uint dst_plane = p.ne21*p.ne20;
    const uint dst_cube = p.ne22*p.ne21*p.ne20;

    const uint i3 = elem / dst_cube;
    const uint rest3 = elem - i3 * dst_cube;
    const uint i2 = rest3 / dst_plane;
    const uint rest2 = rest3 - i2 * dst_plane;
    const uint i1 = rest2 / p.ne20;
    const uint i0 = rest2 - i1 * p.ne20;

    // Coordinates into the second source are shifted back by the extent of
    // the first source along the concatenation dimension only.
    uint extent;
    if (dim == 0) {
        extent = p.ne00;
    } else if (dim == 1) {
        extent = p.ne01;
    } else if (dim == 2) {
        extent = p.ne02;
    } else {
        extent = p.ne03;
    }
    uint shift[4] = {0, 0, 0, 0};
    shift[dim] = extent;

    const uint off_a = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
    const uint off_b = (i3 - shift[3])*p.nb13 + (i2 - shift[2])*p.nb12 + (i1 - shift[1])*p.nb11 + (i0 - shift[0])*p.nb10;
    const uint off_d = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;

    // An element comes from the first source when it lies inside its extents
    // on every axis; otherwise it is taken from the second source.
    const bool from_a = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;

    if (from_a) {
        data_d[p.d_offset + off_d] = D_TYPE(data_a[off_a]);
    } else {
        data_d[p.d_offset + off_d] = D_TYPE(data_b[off_b]);
    }
}
|
|
@ -4,13 +4,15 @@
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
if (gl_GlobalInvocationID.x >= p.ne) {
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
if (idx >= p.ne) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
#ifndef OPTIMIZATION_ERROR_WORKAROUND
|
||||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
|
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
|
||||||
#else
|
#else
|
||||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
|
data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,9 +4,11 @@
|
||||||
#include "generic_binary_head.comp"
|
#include "generic_binary_head.comp"
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
if (gl_GlobalInvocationID.x >= p.ne) {
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
if (idx >= p.ne) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
|
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
void main() {
|
void main() {
|
||||||
const float GELU_COEF_A = 0.044715f;
|
const float GELU_COEF_A = 0.044715f;
|
||||||
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||||
const uint i = gl_GlobalInvocationID.x;
|
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||||
|
|
||||||
if (i >= p.KX) {
|
if (i >= p.KX) {
|
||||||
return;
|
return;
|
||||||
|
|
23
ggml/src/vulkan-shaders/gelu_quick.comp
Normal file
23
ggml/src/vulkan-shaders/gelu_quick.comp
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
#include "generic_head.comp"
|
||||||
|
#include "types.comp"
|
||||||
|
|
||||||
|
#extension GL_EXT_control_flow_attributes : enable
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
// Element-wise "quick" GELU: y = x / (1 + exp(-1.702 * x)).
void main() {
    const float GELU_QUICK_COEF = -1.702f;

    // Linear element index reconstructed from the 3D invocation id.
    const uint pos = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    // Invocations past the last element (p.KX) have nothing to do.
    if (pos < p.KX) {
        const float v = float(data_a[pos]);
        const float gate = 1.0f / (1.0f + exp(GELU_QUICK_COEF * v));
        data_d[pos] = D_TYPE(v * gate);
    }
}
|
|
@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
|
||||||
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
|
||||||
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
|
uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
|
||||||
uint d_offset;
|
uint d_offset;
|
||||||
float param1; float param2;
|
float param1; float param2; int param3;
|
||||||
} p;
|
} p;
|
||||||
|
|
||||||
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||||
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
||||||
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
uint get_idx() {
|
||||||
|
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||||
|
}
|
||||||
|
|
||||||
uint src0_idx(uint idx) {
|
uint src0_idx(uint idx) {
|
||||||
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
|
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
|
||||||
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
||||||
|
|
|
@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
uint get_idx() {
|
||||||
|
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||||
|
}
|
||||||
|
|
||||||
uint src0_idx(uint idx) {
|
uint src0_idx(uint idx) {
|
||||||
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
|
const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
|
||||||
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
|
||||||
|
|
66
ggml/src/vulkan-shaders/group_norm.comp
Normal file
66
ggml/src/vulkan-shaders/group_norm.comp
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
#include "generic_head.comp"
|
||||||
|
#include "types.comp"
|
||||||
|
|
||||||
|
#extension GL_EXT_control_flow_attributes : enable
|
||||||
|
#define BLOCK_SIZE 512
|
||||||
|
|
||||||
|
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
shared float tmp[BLOCK_SIZE];
|
||||||
|
|
||||||
|
// Group normalisation.
//
// Each workgroup handles one contiguous run of p.KX input elements: it
// computes the mean and variance of that run with a shared-memory tree
// reduction and writes (x - mean) * inversesqrt(variance + p.param1).
void main() {
    const uint group_size = p.KX;
    const float eps = p.param1;

    const uint tid = gl_LocalInvocationID.x;
    const uint group_begin = gl_WorkGroupID.x * group_size;
    const uint start = group_begin + tid;
    // The loop bound must be the end of this group. Using start + group_size
    // (which includes tid) lets threads with tid > 0 run into the next group
    // whenever group_size is not a multiple of BLOCK_SIZE.
    const uint end = group_begin + group_size;

    tmp[tid] = 0.0f;

    // Calculate mean: per-thread partial sums, strided by BLOCK_SIZE
    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
        tmp[tid] += float(data_a[col]);
    }

    // sum up partial sums; the total ends up in tmp[0]
    barrier();
    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier();
    }

    const float mean = tmp[0] / group_size;
    // Every thread must have read tmp[0] before the array is reused below.
    barrier();
    tmp[tid] = 0.0f;

    // Calculate variance: partial sums of squared deviations
    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
        const float xi = float(data_a[col]) - mean;
        tmp[tid] += xi * xi;
    }

    // sum up partial sums; the total ends up in tmp[0]
    barrier();
    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier();
    }

    const float variance = tmp[0] / group_size;
    const float scale = inversesqrt(variance + eps);

    // Write the normalised values. The centred value is recomputed from the
    // input instead of being read back from data_d, because the output buffer
    // is declared writeonly and must not be read.
    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
        data_d[col] = D_TYPE((float(data_a[col]) - mean) * scale);
    }
}
|
57
ggml/src/vulkan-shaders/im2col.comp
Normal file
57
ggml/src/vulkan-shaders/im2col.comp
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
|
|
||||||
|
layout (push_constant) uniform parameter
|
||||||
|
{
|
||||||
|
uint batch_offset; uint offset_delta;
|
||||||
|
uint IC;
|
||||||
|
uint IW; uint IH;
|
||||||
|
uint OW; uint OH;
|
||||||
|
uint KW; uint KH;
|
||||||
|
uint pelements;
|
||||||
|
uint CHW;
|
||||||
|
int s0; int s1;
|
||||||
|
int p0; int p1;
|
||||||
|
int d0; int d1;
|
||||||
|
} p;
|
||||||
|
|
||||||
|
#include "types.comp"
|
||||||
|
|
||||||
|
#define BLOCK_SIZE 256
|
||||||
|
|
||||||
|
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
// im2col: each invocation writes one element of the unfolded output, either
// a copy of an input pixel or zero when the sampled position is out of bounds.
void main() {
    const uint i = gl_GlobalInvocationID.x;
    if (i >= p.pelements) {
        return;
    }

    // Decompose the x index into kernel column (kx), kernel row (ky) and
    // output column (ix).
    uint kw_span = 1;
    if (p.KH > 1) {
        kw_span = p.KW;
    }
    const uint ksize = p.OW * kw_span;
    const uint kx = i / ksize;
    const uint ky = (i - kx * ksize) / p.OW;
    const uint ix = i % p.OW;

    // y selects the output row; z packs batch index and input channel.
    const uint oh = gl_GlobalInvocationID.y;
    const uint batch = gl_GlobalInvocationID.z / p.IC;
    const uint ic = gl_GlobalInvocationID.z % p.IC;

    // Input coordinates sampled by this kernel tap (stride, dilation, padding).
    // These are unsigned: a position left of / above the image wraps around to
    // a very large value and is rejected by the upper-bound test below.
    const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
    const uint iih = oh * p.s1 + ky * p.d1 - p.p1;

    const uint offset_dst =
        ((batch * p.OH + oh) * p.OW + ix) * p.CHW +
        (ic * (p.KW * p.KH) + ky * p.KW + kx);

    const bool inside = iih < p.IH && iiw < p.IW;

    if (inside) {
        const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
        data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
    } else {
        data_d[offset_dst] = D_TYPE(0.0f);
    }
}
|
|
@ -4,9 +4,11 @@
|
||||||
#include "generic_binary_head.comp"
|
#include "generic_binary_head.comp"
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
if (gl_GlobalInvocationID.x >= p.ne) {
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
if (idx >= p.ne) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
|
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
shared vec2 sum[BLOCK_SIZE];
|
shared vec2 sum[BLOCK_SIZE];
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint row = gl_WorkGroupID.x;
|
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||||
const uint tid = gl_LocalInvocationID.x;
|
const uint tid = gl_LocalInvocationID.x;
|
||||||
|
|
||||||
sum[tid] = vec2(0.0f, 0.0f);
|
sum[tid] = vec2(0.0f, 0.0f);
|
||||||
|
|
26
ggml/src/vulkan-shaders/pad.comp
Normal file
26
ggml/src/vulkan-shaders/pad.comp
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
#include "types.comp"
|
||||||
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
|
// Zero padding: copies the source into the destination and writes 0 for
// every destination element that lies outside the source extents.
void main() {
    // Linear element index reconstructed from the 3D invocation id.
    const uint elem = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    if (elem >= p.ne) {
        return;
    }

    // Split the linear index into 4D destination coordinates (i0 fastest).
    const uint dst_plane = p.ne11*p.ne10;
    const uint dst_cube = p.ne12*p.ne11*p.ne10;

    const uint i3 = elem / dst_cube;
    const uint rest3 = elem - i3 * dst_cube;
    const uint i2 = rest3 / dst_plane;
    const uint rest2 = rest3 - i2 * dst_plane;
    const uint i1 = rest2 / p.ne10;
    const uint i0 = rest2 - i1 * p.ne10;

    const uint off_src = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
    const uint off_dst = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;

    // Inside the source on every axis => copy, otherwise pad with zero.
    const bool in_source = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;

    if (in_source) {
        data_d[p.d_offset + off_dst] = D_TYPE(data_a[off_src]);
    } else {
        data_d[p.d_offset + off_dst] = D_TYPE(0.0f);
    }
}
|
|
@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint i = gl_GlobalInvocationID.x;
|
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||||
|
|
||||||
if (i >= p.KX) {
|
if (i >= p.KX) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
shared FLOAT_TYPE sum[BLOCK_SIZE];
|
shared FLOAT_TYPE sum[BLOCK_SIZE];
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint row = gl_WorkGroupID.x;
|
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||||
const uint tid = gl_LocalInvocationID.x;
|
const uint tid = gl_LocalInvocationID.x;
|
||||||
|
|
||||||
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
|
sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
|
||||||
|
|
|
@ -4,9 +4,11 @@
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
if (gl_GlobalInvocationID.x >= p.ne) {
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
if (idx >= p.ne) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1));
|
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint i = gl_GlobalInvocationID.x;
|
const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
|
||||||
|
|
||||||
if (i >= p.KX) {
|
if (i >= p.KX) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint tid = gl_LocalInvocationID.x;
|
const uint tid = gl_LocalInvocationID.x;
|
||||||
const uint rowx = gl_WorkGroupID.x;
|
const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||||
const uint rowy = rowx % p.KY;
|
const uint rowy = rowx % p.KY;
|
||||||
|
|
||||||
float slope = 1.0f;
|
float slope = 1.0f;
|
||||||
|
|
|
@ -4,10 +4,12 @@
|
||||||
#include "generic_unary_head.comp"
|
#include "generic_unary_head.comp"
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
if (gl_GlobalInvocationID.x >= p.ne) {
|
const uint idx = get_idx();
|
||||||
|
|
||||||
|
if (idx >= p.ne) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
|
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
|
||||||
data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val);
|
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
|
||||||
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
shared FLOAT_TYPE tmp[BLOCK_SIZE];
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
const uint row = gl_WorkGroupID.x;
|
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
|
||||||
const uint col = gl_LocalInvocationID.x;
|
const uint col = gl_LocalInvocationID.x;
|
||||||
|
|
||||||
tmp[col] = FLOAT_TYPE(0.0f);
|
tmp[col] = FLOAT_TYPE(0.0f);
|
||||||
|
|
21
ggml/src/vulkan-shaders/tanh.comp
Normal file
21
ggml/src/vulkan-shaders/tanh.comp
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
#include "generic_head.comp"
|
||||||
|
#include "types.comp"
|
||||||
|
|
||||||
|
#extension GL_EXT_control_flow_attributes : enable
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
// Element-wise hyperbolic tangent.
void main() {
    // Linear element index reconstructed from the 3D invocation id.
    const uint pos = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    // Invocations past the last element (p.KX) have nothing to do.
    if (pos < p.KX) {
        data_d[pos] = D_TYPE(tanh(data_a[pos]));
    }
}
|
41
ggml/src/vulkan-shaders/timestep_embedding.comp
Normal file
41
ggml/src/vulkan-shaders/timestep_embedding.comp
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
#extension GL_EXT_shader_16bit_storage : require
|
||||||
|
|
||||||
|
layout (push_constant) uniform parameter
|
||||||
|
{
|
||||||
|
uint nb1;
|
||||||
|
uint dim;
|
||||||
|
uint max_period;
|
||||||
|
} p;
|
||||||
|
|
||||||
|
#include "types.comp"
|
||||||
|
|
||||||
|
#extension GL_EXT_control_flow_attributes : enable
|
||||||
|
#define BLOCK_SIZE 256
|
||||||
|
|
||||||
|
layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
|
||||||
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
// Sinusoidal timestep embedding: for timestep data_a[i], writes cos(t * f_j)
// into the first half of output row i and sin(t * f_j) into the second half.
void main() {
    const uint row = gl_WorkGroupID.y;          // which timestep / output row
    const uint j = gl_GlobalInvocationID.x;     // frequency index within the row
    const uint row_base = row * p.nb1;

    // For an odd dim, a single invocation additionally zeroes the element at
    // offset p.dim of the row.
    const bool odd_dim = (p.dim % 2) != 0;
    if (odd_dim && j == ((p.dim + 1) / 2)) {
        data_d[row_base + p.dim] = 0.f;
    }

    const uint half_dim = p.dim / 2;
    if (j >= half_dim) {
        return;
    }

    const float timestep = float(data_a[row]);
    // f_j = exp(-log(max_period) * j / half_dim)
    const float freq = float(exp(-log(p.max_period) * j / half_dim));
    const float arg = timestep * freq;

    data_d[row_base + j] = D_TYPE(cos(arg));
    data_d[row_base + j + half_dim] = D_TYPE(sin(arg));
}
|
|
@ -6,7 +6,7 @@
|
||||||
#define QUANT_K 1
|
#define QUANT_K 1
|
||||||
#define QUANT_R 1
|
#define QUANT_R 1
|
||||||
|
|
||||||
#ifndef LOAD_VEC_A
|
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||||
#define A_TYPE float
|
#define A_TYPE float
|
||||||
#elif LOAD_VEC_A == 4
|
#elif LOAD_VEC_A == 4
|
||||||
#define A_TYPE vec4
|
#define A_TYPE vec4
|
||||||
|
@ -19,7 +19,7 @@
|
||||||
#define QUANT_K 1
|
#define QUANT_K 1
|
||||||
#define QUANT_R 1
|
#define QUANT_R 1
|
||||||
|
|
||||||
#ifndef LOAD_VEC_A
|
#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
|
||||||
#define A_TYPE float16_t
|
#define A_TYPE float16_t
|
||||||
#elif LOAD_VEC_A == 4
|
#elif LOAD_VEC_A == 4
|
||||||
#define A_TYPE f16vec4
|
#define A_TYPE f16vec4
|
||||||
|
|
36
ggml/src/vulkan-shaders/upscale.comp
Normal file
36
ggml/src/vulkan-shaders/upscale.comp
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
#version 450
|
||||||
|
|
||||||
|
layout (push_constant) uniform parameter
|
||||||
|
{
|
||||||
|
uint ne; uint d_offset;
|
||||||
|
uint nb00; uint nb01; uint nb02; uint nb03;
|
||||||
|
uint ne10; uint ne11; uint ne12; uint ne13;
|
||||||
|
float sf0; float sf1; float sf2; float sf3;
|
||||||
|
} p;
|
||||||
|
|
||||||
|
#include "types.comp"
|
||||||
|
|
||||||
|
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
|
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
||||||
|
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||||
|
|
||||||
|
// Upscale by per-axis scale factors: each destination element copies the
// source element found by dividing its coordinates by p.sf0..p.sf3 and
// truncating to an unsigned integer.
void main() {
    // Linear element index reconstructed from the 3D invocation id.
    const uint elem = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    if (elem >= p.ne) {
        return;
    }

    // Destination coordinates (i10 fastest).
    const uint dst_plane = p.ne10 * p.ne11;
    const uint dst_cube = p.ne10 * p.ne11 * p.ne12;

    const uint i10 = elem % p.ne10;
    const uint i11 = (elem / p.ne10) % p.ne11;
    const uint i12 = (elem / dst_plane) % p.ne12;
    const uint i13 = (elem / dst_cube) % p.ne13;

    // Matching source coordinates.
    const uint i00 = uint(i10 / p.sf0);
    const uint i01 = uint(i11 / p.sf1);
    const uint i02 = uint(i12 / p.sf2);
    const uint i03 = uint(i13 / p.sf3);

    const uint off_src = i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00;

    data_d[p.d_offset + elem] = D_TYPE(data_a[off_src]);
}
|
|
@ -30,20 +30,6 @@
|
||||||
|
|
||||||
#define ASYNCIO_CONCURRENCY 64
|
#define ASYNCIO_CONCURRENCY 64
|
||||||
|
|
||||||
// define prototypes
|
|
||||||
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str);
|
|
||||||
bool directory_exists(const std::string& path);
|
|
||||||
bool create_directory(const std::string& path);
|
|
||||||
std::string to_uppercase(const std::string& input);
|
|
||||||
bool string_ends_with(const std::string& str, const std::string& suffix);
|
|
||||||
std::string join_paths(const std::string& path1, const std::string& path2);
|
|
||||||
std::string basename(const std::string &path);
|
|
||||||
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16);
|
|
||||||
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b);
|
|
||||||
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id);
|
|
||||||
void process_shaders(std::vector<std::future<void>>& tasks);
|
|
||||||
void write_output_files();
|
|
||||||
|
|
||||||
std::mutex lock;
|
std::mutex lock;
|
||||||
std::vector<std::pair<std::string, std::string>> shader_fnames;
|
std::vector<std::pair<std::string, std::string>> shader_fnames;
|
||||||
|
|
||||||
|
@ -52,7 +38,7 @@ std::string input_dir = "vulkan-shaders";
|
||||||
std::string output_dir = "/tmp";
|
std::string output_dir = "/tmp";
|
||||||
std::string target_hpp = "ggml-vulkan-shaders.hpp";
|
std::string target_hpp = "ggml-vulkan-shaders.hpp";
|
||||||
std::string target_cpp = "ggml-vulkan-shaders.cpp";
|
std::string target_cpp = "ggml-vulkan-shaders.cpp";
|
||||||
bool clean = true;
|
bool no_clean = false;
|
||||||
|
|
||||||
const std::vector<std::string> type_names = {
|
const std::vector<std::string> type_names = {
|
||||||
"f32",
|
"f32",
|
||||||
|
@ -283,9 +269,12 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
|
||||||
|
|
||||||
for (const auto& tname : type_names) {
|
for (const auto& tname : type_names) {
|
||||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||||
|
// For unaligned, load one at a time for f32/f16, or two at a time for quants
|
||||||
|
std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
|
||||||
|
// For aligned matmul loads
|
||||||
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
|
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
|
||||||
tasks.push_back(std::async(std::launch::async, [=] {
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
|
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
|
||||||
}));
|
}));
|
||||||
tasks.push_back(std::async(std::launch::async, [=] {
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
|
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
|
||||||
|
@ -354,6 +343,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||||
tasks.push_back(std::async(std::launch::async, [=] {
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||||
}));
|
}));
|
||||||
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
|
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||||
|
}));
|
||||||
tasks.push_back(std::async(std::launch::async, [=] {
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||||
}));
|
}));
|
||||||
|
@ -371,6 +363,9 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||||
tasks.push_back(std::async(std::launch::async, [] {
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||||
}));
|
}));
|
||||||
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
|
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
||||||
|
}));
|
||||||
|
|
||||||
tasks.push_back(std::async(std::launch::async, [] {
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
||||||
|
@ -396,15 +391,36 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||||
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
|
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
|
}));
|
||||||
|
|
||||||
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
|
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
|
}));
|
||||||
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
|
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
|
||||||
|
}));
|
||||||
|
|
||||||
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
|
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
|
}));
|
||||||
|
|
||||||
tasks.push_back(std::async(std::launch::async, [] {
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
}));
|
}));
|
||||||
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
|
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
|
}));
|
||||||
tasks.push_back(std::async(std::launch::async, [] {
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
}));
|
}));
|
||||||
tasks.push_back(std::async(std::launch::async, [] {
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
}));
|
}));
|
||||||
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
|
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
|
}));
|
||||||
|
|
||||||
tasks.push_back(std::async(std::launch::async, [] {
|
tasks.push_back(std::async(std::launch::async, [] {
|
||||||
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
||||||
|
@ -438,6 +454,17 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||||
tasks.push_back(std::async(std::launch::async, [=] {
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
|
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||||
|
}));
|
||||||
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
|
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
||||||
|
}));
|
||||||
|
|
||||||
|
tasks.push_back(std::async(std::launch::async, [=] {
|
||||||
|
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||||
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
void write_output_files() {
|
void write_output_files() {
|
||||||
|
@ -478,9 +505,8 @@ void write_output_files() {
|
||||||
}
|
}
|
||||||
fprintf(src, "\n};\n\n");
|
fprintf(src, "\n};\n\n");
|
||||||
|
|
||||||
if (clean) {
|
if (!no_clean) {
|
||||||
std::remove(path.c_str());
|
std::remove(path.c_str());
|
||||||
// fprintf(stderr, "Removed: %s\n", path.c_str());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -496,18 +522,6 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argc <= 1 || args.find("--help") != args.end()) {
|
|
||||||
std::cout << "Usage:\n"
|
|
||||||
"\tvulkan-shaders-gen [options]\n\n"
|
|
||||||
"Options:\n"
|
|
||||||
"\t--glslc <path> Path to glslc executable (default: /usr/bin/glslc)\n"
|
|
||||||
"\t--input-dir Directory containing shader sources (required)\n"
|
|
||||||
"\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n"
|
|
||||||
"\t--target-hpp <path> Path to generate a header file with shader declarations in C++ format\n"
|
|
||||||
"\t--target-cpp <path> Path to generate a source code file implementing the declared shaders (optional)\n"
|
|
||||||
"\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n";
|
|
||||||
return EXIT_SUCCESS;
|
|
||||||
}
|
|
||||||
if (args.find("--glslc") != args.end()) {
|
if (args.find("--glslc") != args.end()) {
|
||||||
GLSLC = args["--glslc"]; // Path to glslc
|
GLSLC = args["--glslc"]; // Path to glslc
|
||||||
}
|
}
|
||||||
|
@ -524,7 +538,7 @@ int main(int argc, char** argv) {
|
||||||
target_cpp = args["--target-cpp"]; // Path to generated cpp file
|
target_cpp = args["--target-cpp"]; // Path to generated cpp file
|
||||||
}
|
}
|
||||||
if (args.find("--no-clean") != args.end()) {
|
if (args.find("--no-clean") != args.end()) {
|
||||||
clean = false; // Keep temporary SPIR-V files in output-dir after build
|
no_clean = true; // Keep temporary SPIR-V files in output-dir after build
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!directory_exists(input_dir)) {
|
if (!directory_exists(input_dir)) {
|
||||||
|
|
|
@ -41,7 +41,7 @@ maxhordelen = 350
|
||||||
modelbusy = threading.Lock()
|
modelbusy = threading.Lock()
|
||||||
requestsinqueue = 0
|
requestsinqueue = 0
|
||||||
defaultport = 5001
|
defaultport = 5001
|
||||||
KcppVersion = "1.71.1"
|
KcppVersion = "1.72"
|
||||||
showdebug = True
|
showdebug = True
|
||||||
guimode = False
|
guimode = False
|
||||||
showsamplerwarning = True
|
showsamplerwarning = True
|
||||||
|
@ -3771,7 +3771,7 @@ def main(launch_args,start_server=True):
|
||||||
if not filename.endswith(".json"):
|
if not filename.endswith(".json"):
|
||||||
filename += ".json"
|
filename += ".json"
|
||||||
premade_adapt_path = os.path.join(adapt_dir,filename)
|
premade_adapt_path = os.path.join(adapt_dir,filename)
|
||||||
if os.path.exists(premade_adapt_path):
|
if premade_adapt_path and os.path.exists(premade_adapt_path):
|
||||||
ccadapter_path = os.path.abspath(premade_adapt_path)
|
ccadapter_path = os.path.abspath(premade_adapt_path)
|
||||||
if ccadapter_path:
|
if ccadapter_path:
|
||||||
print(f"Loading Chat Completions Adapter: {ccadapter_path}")
|
print(f"Loading Chat Completions Adapter: {ccadapter_path}")
|
||||||
|
@ -3890,7 +3890,7 @@ def main(launch_args,start_server=True):
|
||||||
if shouldavoidgpu:
|
if shouldavoidgpu:
|
||||||
print("WARNING: GPU layers is set, but a GPU backend was not selected!")
|
print("WARNING: GPU layers is set, but a GPU backend was not selected!")
|
||||||
pass
|
pass
|
||||||
elif args.gpulayers==-1 and not shouldavoidgpu and os.path.exists(args.model_param):
|
elif args.gpulayers==-1 and not shouldavoidgpu and args.model_param and os.path.exists(args.model_param):
|
||||||
if not args.usecublas and not args.usevulkan and not args.useclblast:
|
if not args.usecublas and not args.usevulkan and not args.useclblast:
|
||||||
print("NOTE: Auto GPU layers was set without picking a GPU backend! Trying to assign one for you automatically...")
|
print("NOTE: Auto GPU layers was set without picking a GPU backend! Trying to assign one for you automatically...")
|
||||||
auto_set_backend_cli()
|
auto_set_backend_cli()
|
||||||
|
|
|
@ -32,6 +32,10 @@
|
||||||
#include "ggml-metal.h"
|
#include "ggml-metal.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SD_USE_VULKAN
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "rng.hpp"
|
#include "rng.hpp"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
|
||||||
|
@ -529,7 +533,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx
|
||||||
struct ggml_tensor* k,
|
struct ggml_tensor* k,
|
||||||
struct ggml_tensor* v,
|
struct ggml_tensor* v,
|
||||||
bool mask = false) {
|
bool mask = false) {
|
||||||
#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL)
|
#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) && !defined(SD_USE_VULKAN)
|
||||||
struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
|
struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
|
||||||
#else
|
#else
|
||||||
float d_head = (float)q->ne[0];
|
float d_head = (float)q->ne[0];
|
||||||
|
|
|
@ -135,13 +135,17 @@ public:
|
||||||
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
|
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
|
||||||
backend = ggml_backend_metal_init();
|
backend = ggml_backend_metal_init();
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef SD_USE_VULKAN
|
||||||
|
LOG_DEBUG("Using Vulkan backend");
|
||||||
|
backend = ggml_backend_vk_init(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (!backend) {
|
if (!backend) {
|
||||||
LOG_DEBUG("Using CPU backend");
|
LOG_DEBUG("Using CPU backend");
|
||||||
backend = ggml_backend_cpu_init();
|
backend = ggml_backend_cpu_init();
|
||||||
}
|
}
|
||||||
#ifdef SD_USE_FLASH_ATTENTION
|
#ifdef SD_USE_FLASH_ATTENTION
|
||||||
#if defined(SD_USE_CUBLAS) || defined(SD_USE_METAL)
|
#if defined(SD_USE_CUBLAS) || defined(SD_USE_METAL) || defined(SD_USE_VULKAN)
|
||||||
LOG_WARN("Flash Attention not supported with GPU Backend");
|
LOG_WARN("Flash Attention not supported with GPU Backend");
|
||||||
#else
|
#else
|
||||||
LOG_INFO("Flash Attention enabled");
|
LOG_INFO("Flash Attention enabled");
|
||||||
|
|
|
@ -24,6 +24,10 @@ struct UpscalerGGML {
|
||||||
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
|
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
|
||||||
backend = ggml_backend_metal_init();
|
backend = ggml_backend_metal_init();
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef SD_USE_VULKAN
|
||||||
|
LOG_DEBUG("Using Vulkan backend");
|
||||||
|
backend = ggml_backend_vk_init(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (!backend) {
|
if (!backend) {
|
||||||
LOG_DEBUG("Using CPU backend");
|
LOG_DEBUG("Using CPU backend");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue