Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-13 18:39:48 +00:00)
ggml: adds CONV_2D op and direct GEMM Vulkan implementation (#14316)
* ggml/ggml-vulkan/test-backend-ops: adds CONV_2D for Vulkan
* ggml-vulkan: adds f32 scalar shader to compute 2D convolution directly with GEMM (no need for im2col)
* test-backend-ops: adds test_case_ref to check the validity/performance of ops against reference implementations having different graphs; adds tests
* Performance fixes: minimized branch divergence, uses collectives to eliminate redundant calculation, macros removed
* Kernel shared memory size check
* Updates test-backend-ops to support graphs for performance measurement
* Apple/Win32 compile errors fixed
* Subgroup size used to determine tile size -> fixes llvmpipe errors
* Collectives disabled by default
* Intel support is disabled as the performance is poor
* Conv2d enabled for Intel with disabled collectives, disabled for Apple
* test-backend-ops modifications are reverted
* Trailing spaces and missing override fixed
* Triggering pipeline relaunch
* Code formatted with .clang-format
Parent: 90083283ec
Commit: a979ca22db
4 changed files with 711 additions and 11 deletions
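The core of the change is a Vulkan shader that evaluates the convolution as an implicit GEMM: the K x CRS by CRS x NPQ matrix product is computed directly, with the CRS x NPQ operand gathered from the input tensor on the fly rather than materialized by IM2COL. A minimal scalar C++ sketch of that mapping (illustrative only, not the shader itself; the helper name conv2d_implicit_gemm and the plain row-major layouts are assumptions):

    // Minimal CPU sketch of the implicit GEMM the shader performs:
    // C[K x NPQ] = A[K x CRS] * B[CRS x NPQ], where B is never stored;
    // its elements are gathered from the input while the dot product runs.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static int64_t conv_out_size(int64_t ins, int64_t ks, int s, int p, int d) {
        return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
    }

    // input: W x H x Cin x N, kernel: KW x KH x Cin x Cout, out: OW x OH x Cout x N
    static void conv2d_implicit_gemm(const float * input, const float * kernel, float * out,
                                     int64_t W, int64_t H, int64_t Cin, int64_t N,
                                     int64_t KW, int64_t KH, int64_t Cout,
                                     int s0, int s1, int p0, int p1, int d0, int d1) {
        const int64_t OW  = conv_out_size(W, KW, s0, p0, d0);
        const int64_t OH  = conv_out_size(H, KH, s1, p1, d1);
        const int64_t CRS = Cin * KH * KW; // reduction dimension
        const int64_t NPQ = N * OH * OW;   // one GEMM column per output pixel

        for (int64_t k = 0; k < Cout; ++k) {          // rows of A
            for (int64_t col = 0; col < NPQ; ++col) { // columns of B
                const int64_t n  = col / (OH * OW);
                const int64_t oh = (col / OW) % OH;
                const int64_t ow = col % OW;
                float acc = 0.0f;
                for (int64_t r = 0; r < CRS; ++r) {   // dot product over CRS
                    const int64_t c  = r / (KH * KW);
                    const int64_t kh = (r / KW) % KH;
                    const int64_t kw = r % KW;
                    const int64_t iw = ow * s0 + kw * d0 - p0;
                    const int64_t ih = oh * s1 + kh * d1 - p1;
                    if (iw < 0 || iw >= W || ih < 0 || ih >= H) {
                        continue; // implicit zero padding
                    }
                    acc += input[((n * Cin + c) * H + ih) * W + iw] *
                           kernel[((k * Cin + c) * KH + kh) * KW + kw];
                }
                out[((n * Cout + k) * OH + oh) * OW + ow] = acc;
            }
        }
    }

    int main() {
        // 4x4 input of ones, one 2x2 kernel of ones, stride 1, no padding -> 3x3 output of fours
        std::vector<float> x(16, 1.0f), w(4, 1.0f), y(9, 0.0f);
        conv2d_implicit_gemm(x.data(), w.data(), y.data(), 4, 4, 1, 1, 2, 2, 1, 1, 1, 0, 0, 1, 1);
        printf("y[0] = %.1f\n", y[0]); // 4.0
        return 0;
    }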
@@ -3699,6 +3699,93 @@ struct test_im2col : public test_case {
    }
};

// CONV_2D
struct test_conv_2d : public test_case {
    const std::array<int64_t, 4> ne_input;
    const std::array<int64_t, 4> ne_kernel;
    const int stride0;
    const int stride1;
    const int padding0;
    const int padding1;
    const int dilation0;
    const int dilation1;
    // Whether the inputs are contiguous in the channel dim or the width dim
    const bool cwhn;

    // If true, the direct CONV_2D will be used in the graph, otherwise it
    // uses ggml_conv_2d:
    // * if the program is called with -o CONV_2D_DIRECT_IMPL, the
    //   CONV_2D graph will be built, while
    // * if the program is called with -o CONV_2D_INDIRECT_IMPL, the
    //   IM2COL -> MUL_MM graph will be built.

    std::string vars() override {
        return VARS_TO_STR9(ne_input, ne_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn);
    }

    uint64_t op_flops(ggml_tensor * t) override {
        GGML_UNUSED(t);
        // Just counting matmul costs:
        // KxCRS @ CRSxNPQ = KxNPQ --> KxNPQx(CRS+CRS-1) flops

        // Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
        auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
            return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
        };

        int64_t W    = ne_input[0];
        int64_t H    = ne_input[1];
        int64_t KW   = ne_kernel[0];
        int64_t KH   = ne_kernel[1];
        int64_t Cin  = ne_kernel[2];
        int64_t Cout = ne_kernel[3];
        int64_t N    = ne_input[3];
        int64_t OH   = calc_conv_output_size(H, KH, stride0, padding0, dilation0);
        int64_t OW   = calc_conv_output_size(W, KW, stride0, padding0, dilation0);

        int64_t K   = Cout;
        int64_t CRS = Cin * KH * KW;
        int64_t NPQ = N * OH * OW;

        return K * NPQ * (2 * CRS - 1);
    }

    test_conv_2d(std::array<int64_t, 4> ne_input = { 64, 64, 16, 1 },
                 std::array<int64_t, 4> ne_kernel = { 3, 3, 1, 16 }, int stride0 = 1, int stride1 = 1, int padding0 = 0,
                 int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) :
        ne_input(ne_input),
        ne_kernel(ne_kernel),
        stride0(stride0),
        stride1(stride1),
        padding0(padding0),
        padding1(padding1),
        dilation0(dilation0),
        dilation1(dilation1),
        cwhn(cwhn) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
        ggml_set_name(input, "input");

        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
        ggml_set_name(kernel, "kernel");

        if (cwhn) {
            // change memory layout to channel-most-contiguous (CWHN),
            // then permute it back so NE matches the original input
            input  = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3));
            input  = ggml_permute(ctx, input, 2, 0, 1, 3);
            kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0));
            kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1);
        }

        ggml_tensor * out =
            ggml_conv_2d_direct(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1);
        ggml_set_name(out, "out");
        return out;
    }
};
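One subtlety in build_graph above: the cwhn branch uses a double permute. ggml_cont after the first ggml_permute physically re-lays the data channel-most-contiguous, then the second ggml_permute restores the logical W, H, C, N shape, so ne[] matches the non-cwhn path and only the strides differ. A hedged sketch of the same pattern as a standalone helper (the helper name is an assumption, not commit code):

    #include "ggml.h"

    // Produce a tensor that is logically W x H x C x N but whose memory is
    // channel-most-contiguous (CWHN); only nb[] differs from the default layout.
    static ggml_tensor * make_cwhn_view(ggml_context * ctx, ggml_tensor * whcn) {
        ggml_tensor * t = ggml_cont(ctx, ggml_permute(ctx, whcn, 1, 2, 0, 3)); // re-lay as CWHN
        return ggml_permute(ctx, t, 2, 0, 1, 3);                               // view back to WHCN
    }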
// GGML_OP_CONV_2D_DW
struct test_conv_2d_dw : public test_case {
    const std::array<int64_t, 4> ne_input;
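A quick sanity check on op_flops() above (not part of the commit): plugging in the {IWH=19, KWH=4, Cout=4096, Cin=256, B=16} case used in the test lists below reproduces the K=CRS=NPQ=4096 shape the comments advertise, at roughly 137 GFLOP per convolution.

    // Standalone check of the op_flops() formula for the first conv_2d case below.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t W = 19, H = 19, KW = 4, KH = 4, Cin = 256, Cout = 4096, N = 16;
        const int64_t OW = (W - (KW - 1) - 1) / 1 + 1; // 16 (stride 1, pad 0, dilation 1)
        const int64_t OH = (H - (KH - 1) - 1) / 1 + 1; // 16
        const int64_t K   = Cout;          // 4096
        const int64_t CRS = Cin * KH * KW; // 256 * 16 = 4096
        const int64_t NPQ = N * OH * OW;   // 16 * 256 = 4096
        // 4096 * 4096 * (2 * 4096 - 1) = 137,422,176,256 flops, ~137 GFLOP
        printf("flops = %lld\n", (long long) (K * NPQ * (2 * CRS - 1)));
        return 0;
    }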
@@ -5007,6 +5094,80 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true));
    test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));

    // Conv_2D test cases
#ifdef DETAILED_TESTS
    // Probably we do not have enough time to execute these in the pipeline.
    uint32_t iwh_idx  = 0;
    uint32_t kwh_idx  = 1;
    uint32_t Cout_idx = 2;
    uint32_t Cin_idx  = 3;
    uint32_t B_idx    = 4;

    std::vector<std::array<int, 5>> cases = {
        //{IWH, KWH, Cout, Cin, B}
        // K=CRS=NPQ=4096 conv_2d matmul performance
        { 19,  4, 4096, 256, 16 },
        // K=128, CRS=128, NPQ=4096
        { 19,  4,  128,   8, 16 },
        // K=130, CRS=128, NPQ=4096
        { 19,  4,  130,   8, 16 },
        // Edge case: K x CRS is small
        { 19,  2,    4,   4, 16 },
        // A ConvNet's first layer
        { 224, 3,    8,   3,  1 },
        // A ConvNet's first layer with 2x2 convolution, and 1 channel
        { 224, 2,    8,   1,  1 },
        // A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch
        { 224, 2,    8,   1,  8 },
        // A middle layer of a ConvNet
        { 58,  3,   64,  32,  1 },
        // A middle layer of a ConvNet, several images in the batch
        { 58,  3,   64,  32,  8 },
        // A deep layer of a ConvNet, several images in the batch
        { 16,  3,  256, 128,  8 }
    };

    for (auto act_case : cases) {
        test_cases.emplace_back(new test_conv_2d(
            { act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
            { act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false));
    }
#endif

    // CONV_2D:
    auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
        return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
    };

    //uint32_t s0 = 3;
    uint32_t s1 = 5;
    uint32_t p0 = 5;
    //uint32_t p1 = 2;
    uint32_t d0 = 2;
    uint32_t d1 = 4;

    for (uint32_t s0 : { 1, 3 }) {
        for (uint32_t p1 : { 2, 5 }) {
            for (uint32_t Cin : { 1, 25 }) {
                for (uint32_t Cout : { 1, 12 }) {
                    for (uint32_t KH : { 1, 2, 3, 11 }) {
                        for (uint32_t KW : { 1, 2, 3, 11 }) {
                            for (uint32_t H : { 1, 133 }) {
                                for (uint32_t W : { 1, 141 }) {
                                    if (calc_conv_output_size(W, KW, s0, p0, d0) > 0 &&
                                        calc_conv_output_size(H, KH, s1, p1, d1) > 0) {
                                        test_cases.emplace_back(new test_conv_2d(
                                            { W, H, Cin, 2 }, { KW, KH, Cin, Cout }, s0, s1, p0, p1, d0, d1, false));
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    // sycl backend will limit task global_range < MAX_INT
    // test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
    // however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
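The calc_conv_output_size guard in the sweep above is what keeps degenerate combinations out: with p0 = 5 and d0 = 2 fixed, a 1-pixel-wide input against an 11-wide kernel yields a non-positive output width and the case is skipped. A standalone check of the same formula (not part of the commit):

    #include <cstdint>
    #include <cstdio>

    static int64_t calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
        return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
    }

    int main() {
        // W=1, KW=11, s0=1, p0=5, d0=2: (1 + 10 - 20 - 1)/1 + 1 = -9 -> case skipped
        printf("%lld\n", (long long) calc_conv_output_size(1, 11, 1, 5, 2));
        // W=141, KW=11, s0=3, p0=5, d0=2: (141 + 10 - 20 - 1)/3 + 1 = 44 -> case kept
        printf("%lld\n", (long long) calc_conv_output_size(141, 11, 3, 5, 2));
        return 0;
    }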
@@ -5610,6 +5771,43 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    std::vector<std::unique_ptr<test_case>> test_cases;

    // Conv2d: K=CRS=NPQ=4096 matmul performance
    uint32_t iwh_idx  = 0;
    uint32_t kwh_idx  = 1;
    uint32_t Cout_idx = 2;
    uint32_t Cin_idx  = 3;
    uint32_t B_idx    = 4;
    std::vector<std::array<int, 5>> cases = {
        //{IWH, KWH, Cout, Cin, B}
        // K=CRS=NPQ=4096 conv2d matmul performance
        { 19,  4, 4096, 256, 16 },
        // K=128, CRS=128, NPQ=4096
        { 19,  4,  128,   8, 16 },
        // K=130, CRS=128, NPQ=4096
        { 19,  4,  130,   8, 16 },
        // Edge case: K x CRS is small
        { 19,  2,    4,   4, 16 },
        // A ConvNet's first layer
        { 224, 3,    8,   3,  1 },
        // A ConvNet's first layer with 2x2 convolution, and 1 channel
        { 224, 2,    8,   1,  1 },
        // A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch
        { 224, 2,    8,   1,  8 },
        // A middle layer of a ConvNet
        { 58,  3,   64,  32,  1 },
        // A middle layer of a ConvNet, several images in the batch
        { 58,  3,   64,  32,  8 },
        // A deep layer of a ConvNet, several images in the batch
        { 16,  3,  512, 128,  8 },
    };

    for (auto act_case : cases) {
        // Direct CONV_2D
        test_cases.emplace_back(new test_conv_2d(
            { act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
            { act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false));
    }

    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
    test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));