ggml: adds CONV_2D op and direct GEMM Vulkan implementation (#14316)

* ggml/ggml-vulkan/test-backend-ops: adds CONV_2D for Vulkan

* ggml-vulkan: adds an f32 scalar shader that computes the 2D convolution
directly with GEMM (no need for im2col); see the sketch after this list

* test-backend-ops: adds test_case_ref to check the validity/performance of ops
against reference implementations that use different graphs; adds tests

* Performance fixes: minimized branch divergence, used collectives to
  eliminate redundant calculations, removed macros.

* Kernel shared memory size check

* Updates test-backend-ops to support graphs for performance
  measurement.

* Apple/Win32 compile errors fixed

* Subgroup size used to determine tile size -> fixes llvmpipe errors.

* Collectives disabled by default.

* Intel support is disabled as the performance is poor.

* Conv2d enabled for Intel (with collectives disabled); disabled for Apple

* test-backend-ops modifications are reverted

* Trailing spaces removed and a missing override added.

* Triggering pipeline relaunch.

* Code formatted with .clang-format.
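
Below is a minimal CPU-side sketch of the implicit-GEMM formulation the new shader is built around (illustrative only; this is not the Vulkan shader, and the NCHW indexing, function name, and parameter list are assumptions made for the sketch). The convolution is viewed as a single KxCRS @ CRSxNPQ matmul with K = Cout, CRS = Cin*KH*KW and NPQ = N*OH*OW; the CRSxNPQ operand, which im2col would normally materialize, is instead gathered from the input on the fly. This is also the matmul that op_flops counts in the new test below.

// Sketch (not part of the commit): direct conv-as-GEMM on the CPU, NCHW, f32.
#include <cstdint>
#include <vector>

static void conv2d_implicit_gemm(const std::vector<float> & src,  // N x Cin x H x W
                                 const std::vector<float> & ker,  // Cout x Cin x KH x KW
                                 std::vector<float> & dst,        // N x Cout x OH x OW
                                 int64_t N, int64_t Cin, int64_t H, int64_t W,
                                 int64_t Cout, int64_t KH, int64_t KW,
                                 int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t OW  = (W + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
    const int64_t OH  = (H + 2 * p1 - d1 * (KH - 1) - 1) / s1 + 1;
    const int64_t CRS = Cin * KH * KW;  // GEMM inner dimension
    const int64_t NPQ = N * OH * OW;    // GEMM columns
    // One virtual GEMM: [Cout x CRS] @ [CRS x NPQ] -> [Cout x NPQ]. The CRS x NPQ
    // operand is what im2col would build; here its elements are read from src directly.
    for (int64_t k = 0; k < Cout; ++k) {
        for (int64_t npq = 0; npq < NPQ; ++npq) {
            const int64_t n  = npq / (OH * OW);
            const int64_t oh = (npq / OW) % OH;
            const int64_t ow = npq % OW;
            float acc = 0.0f;
            for (int64_t crs = 0; crs < CRS; ++crs) {
                const int64_t c  = crs / (KH * KW);
                const int64_t kh = (crs / KW) % KH;
                const int64_t kw = crs % KW;
                const int64_t ih = oh * s1 - p1 + kh * d1;
                const int64_t iw = ow * s0 - p0 + kw * d0;
                if (ih < 0 || ih >= H || iw < 0 || iw >= W) {
                    continue; // zero padding
                }
                acc += ker[((k * Cin + c) * KH + kh) * KW + kw] *
                       src[((n * Cin + c) * H + ih) * W + iw];
            }
            dst[((n * Cout + k) * OH + oh) * OW + ow] = acc;
        }
    }
}

The real shader tiles this GEMM across workgroups and, per the notes above, can use subgroup collectives to share redundant index computations; none of that is shown here.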
Ervin Áron Tasnádi 2025-07-19 21:59:08 +02:00 committed by GitHub
parent 90083283ec
commit a979ca22db
4 changed files with 711 additions and 11 deletions


@@ -3699,6 +3699,93 @@ struct test_im2col : public test_case {
}
};
// CONV_2D
struct test_conv_2d : public test_case {
const std::array<int64_t, 4> ne_input;
const std::array<int64_t, 4> ne_kernel;
const int stride0;
const int stride1;
const int padding0;
const int padding1;
const int dilation0;
const int dilation1;
// Whether the inputs are contiguous in the channel dim or the width dim
const bool cwhn;
// If true, the direct CONV_2D will be used in the graph, otherwise it
// uses ggml_conv_2d:
// * if the program is called with -o CONV_2D_DIRECT_IMPL, the
// CONV_2D graph will be built, while
// * if the program is called with -o CONV_2D_INDIRECT_IMPL, the
// IM2COL -> MUL_MM graph will be built.
std::string vars() override {
return VARS_TO_STR9(ne_input, ne_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn);
}
uint64_t op_flops(ggml_tensor * t) override {
GGML_UNUSED(t);
// Just counting matmul costs:
// KxCRS @ CRSxNPQ = KxNPQ --> KxNPQx(CRS+CRS-1) flops
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
};
int64_t W = ne_input[0];
int64_t H = ne_input[1];
int64_t KW = ne_kernel[0];
int64_t KH = ne_kernel[1];
int64_t Cin = ne_kernel[2];
int64_t Cout = ne_kernel[3];
int64_t N = ne_input[3];
int64_t OH = calc_conv_output_size(H, KH, stride0, padding0, dilation0);
int64_t OW = calc_conv_output_size(W, KW, stride0, padding0, dilation0);
int64_t K = Cout;
int64_t CRS = Cin * KH * KW;
int64_t NPQ = N * OH * OW;
return K * NPQ * (2 * CRS - 1);
}
test_conv_2d(std::array<int64_t, 4> ne_input = { 64, 64, 16, 1 },
std::array<int64_t, 4> ne_kernel = { 3, 3, 1, 16 }, int stride0 = 1, int stride1 = 1, int padding0 = 0,
int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) :
ne_input(ne_input),
ne_kernel(ne_kernel),
stride0(stride0),
stride1(stride1),
padding0(padding0),
padding1(padding1),
dilation0(dilation0),
dilation1(dilation1),
cwhn(cwhn) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
ggml_set_name(input, "input");
ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
ggml_set_name(kernel, "kernel");
if (cwhn) {
// change memory layout to channel-most-contiguous (CWHN),
// then permute it back so NE matches the original input
input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3));
input = ggml_permute(ctx, input, 2, 0, 1, 3);
kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0));
kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1);
}
ggml_tensor * out =
ggml_conv_2d_direct(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1);
ggml_set_name(out, "out");
return out;
}
};
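
A quick sanity check of the flop accounting above (my own arithmetic, not part of the commit): the {19, 4, 4096, 256, 16} case used by the perf tests further down, run with stride 1, padding 0 and dilation 1, really does produce the K=CRS=NPQ=4096 GEMM that its comment advertises. The helper name conv_out below is just shorthand for the calc_conv_output_size formula.

// Hypothetical sanity check: confirm K=CRS=NPQ=4096 for the
// {IWH=19, KWH=4, Cout=4096, Cin=256, B=16} perf case (s=1, p=0, d=1).
#include <cstdint>

constexpr int64_t conv_out(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}

static_assert(conv_out(19, 4, 1, 0, 1) == 16,   "OW = OH = 16");
static_assert(int64_t(256) * 4 * 4     == 4096, "CRS = Cin*KH*KW");
static_assert(int64_t(16) * 16 * 16    == 4096, "NPQ = N*OH*OW");
// K = Cout = 4096, so op_flops reports 4096*4096*(2*4096 - 1), about 1.37e11 flops.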
// GGML_OP_CONV_2D_DW
struct test_conv_2d_dw : public test_case {
const std::array<int64_t, 4> ne_input;
@@ -5007,6 +5094,80 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 1, 2560}, {3, 3, 1, 2560}, 1, 1, 1, 1, 1, 1, true));
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {12, 12, 2, 2560}, {3, 3, 2, 2560}, 1, 1, 1, 1, 1, 1, true));
// Conv_2D test cases
#ifdef DETAILED_TESTS
// Probably we do not have enough time to execute these in the pipeline.
uint32_t iwh_idx = 0;
uint32_t kwh_idx = 1;
uint32_t Cout_idx = 2;
uint32_t Cin_idx = 3;
uint32_t B_idx = 4;
std::vector<std::array<int, 5>> cases = {
//{IWH, KWH, Cout, Cin, B}
// K=CRS=NPQ=4096 conv_2d matmul performance
{19, 4, 4096, 256, 16},
// K=128, CRS=128, NPQ=4096
{ 19, 4, 128, 8, 16},
// K=130, CRS=128, NPQ=4096
{ 19, 4, 130, 8, 16},
// Edge case: K x CRS is small
{ 19, 2, 4, 4, 16},
// A ConvNet's first layer
{ 224, 3, 8, 3, 1 },
// A ConvNet's first layer with 2x2 convolution, and 1 channel
{ 224, 2, 8, 1, 1 },
// A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch
{ 224, 2, 8, 1, 8 },
// A middle layer of a ConvNet
{ 58, 3, 64, 32, 1 },
// A middle layer of a ConvNet, several images in the batch
{ 58, 3, 64, 32, 8 },
// A deep layer of a ConvNet, several images in the batch
{ 16, 3, 256, 128, 8 }
};
for (auto act_case : cases) {
test_cases.emplace_back(new test_conv_2d(
{ act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
{ act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false));
}
#endif
// CONV_2D:
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
};
//uint32_t s0 = 3;
uint32_t s1 = 5;
uint32_t p0 = 5;
//uint32_t p1 = 2;
uint32_t d0 = 2;
uint32_t d1 = 4;
for (uint32_t s0 : { 1, 3 }) {
for (uint32_t p1 : { 2, 5 }) {
for (uint32_t Cin : { 1, 25 }) {
for (uint32_t Cout : { 1, 12 }) {
for (uint32_t KH : { 1, 2, 3, 11 }) {
for (uint32_t KW : { 1, 2, 3, 11 }) {
for (uint32_t H : { 1, 133 }) {
for (uint32_t W : { 1, 141 }) {
if (calc_conv_output_size(W, KW, s0, p0, d0) > 0 &&
calc_conv_output_size(H, KH, s1, p1, d1) > 0) {
test_cases.emplace_back(new test_conv_2d(
{ W, H, Cin, 2 }, { KW, KH, Cin, Cout }, s0, s1, p0, p1, d0, d1, false));
}
}
}
}
}
}
}
}
}
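
The calc_conv_output_size guard above is what prunes combinations where the dilated 11-wide or 11-tall kernel no longer fits in the padded input. A couple of concrete evaluations (my own arithmetic, with s0 = 1 and p1 = 2 picked from the loop domains; conv_out is again shorthand for the same formula) show which corners are dropped:

// Worked check of the filter above (illustrative, not part of the commit).
#include <cstdint>

constexpr int64_t conv_out(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}

// Width dim with s0 = 1, p0 = 5, d0 = 2:
static_assert(conv_out(  1, 11, 1, 5, 2) <= 0,   "W=1,   KW=11 is rejected");
static_assert(conv_out(141, 11, 1, 5, 2) == 131, "W=141, KW=11 is kept");
// Height dim with s1 = 5, p1 = 2, d1 = 4:
static_assert(conv_out(  1, 11, 5, 2, 4) <= 0,   "H=1,   KH=11 is rejected");
static_assert(conv_out(133, 11, 5, 2, 4) == 20,  "H=133, KH=11 is kept");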
// sycl backend will limit task global_range < MAX_INT
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
// however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
@@ -5610,6 +5771,43 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
std::vector<std::unique_ptr<test_case>> test_cases;
// Conv2d: K=CRS=NPQ=4096 matmul performance
uint32_t iwh_idx = 0;
uint32_t kwh_idx = 1;
uint32_t Cout_idx = 2;
uint32_t Cin_idx = 3;
uint32_t B_idx = 4;
std::vector<std::array<int, 5>> cases = {
//{IWH, KWH, Cout, Cin, B}
// K=CRS=NPQ=4096 conv2d matmul performance
{19, 4, 4096, 256, 16},
// K=128, CRS=128, NPQ=4096
{ 19, 4, 128, 8, 16},
// K=130, CRS=128, NPQ=4096
{ 19, 4, 130, 8, 16},
// Edge case: K x CRS is small
{ 19, 2, 4, 4, 16},
// A ConvNet's first layer
{ 224, 3, 8, 3, 1 },
// A ConvNet's first layer with 2x2 convolution, and 1 channel
{ 224, 2, 8, 1, 1 },
// A ConvNet's first layer with 2x2 convolution, and 1 channel, several images in the batch
{ 224, 2, 8, 1, 8 },
// A middle layer of a ConvNet
{ 58, 3, 64, 32, 1 },
// A middle layer of a ConvNet, several images in the batch
{ 58, 3, 64, 32, 8 },
// A deep layer of a ConvNet, several images in the batch
{ 16, 3, 512, 128, 8 },
};
for (auto act_case : cases) {
// Direct CONV_2D
test_cases.emplace_back(new test_conv_2d(
{ act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
{ act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] }, 1, 1, 0, 0, 1, 1, false));
}
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));