From 19e1c518f14273cd8885d7de9296f9676108ae80 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:38:00 +0800 Subject: [PATCH] merge missing functions from sdcpp --- ggml-cuda.cu | 121 ++++++++++++++++++++ ggml.c | 192 +++++++++++++++++++++++++++++++- ggml.h | 17 +++ otherarch/sdcpp/ggml_extend.hpp | 2 +- 4 files changed, 325 insertions(+), 7 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d70283b11..879644349 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -606,6 +606,8 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q #define CUDA_UPSCALE_BLOCK_SIZE 256 #define CUDA_CONCAT_BLOCK_SIZE 256 #define CUDA_PAD_BLOCK_SIZE 256 +#define CUDA_ARANGE_BLOCK_SIZE 256 +#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 #define CUDA_ACC_BLOCK_SIZE 256 #define CUDA_IM2COL_BLOCK_SIZE 256 #define CUDA_POOL2D_BLOCK_SIZE 256 @@ -1033,6 +1035,38 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons } } +static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { + // blockIDx.x: idx of ne0 / BLOCK_SIZE + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + dst[nidx] = start + step * nidx; +} + +static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) { + // blockIDx.y: idx of timesteps->ne[0] + // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE + int i = blockIdx.y; + int j = threadIdx.x + blockIdx.x * blockDim.x; + float * embed_data = (float *)((char *)dst + i*nb1); + + if (dim % 2 != 0 && j == ((dim + 1) / 2)) { + embed_data[dim] = 0.f; + } + + int half = dim / 2; + if (j >= half) { + return; + } + + float timestep = timesteps[i]; + float freq = (float)exp(-logf(max_period) * j / half); + float arg = timestep * freq; + embed_data[j] = cos(arg); + embed_data[j + half] = sin(arg); +} + template static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) { int start = blockIdx.x * group_size; @@ -9159,6 +9193,44 @@ static void ggml_cuda_op_pad( (void) src1_dd; } +static void ggml_cuda_op_arange( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const float start = ((float*)dst->op_params)[0]; + const float stop = ((float*)dst->op_params)[1]; + const float step = ((float*)dst->op_params)[2]; + + int64_t steps = (int64_t)ceil((stop - start) / step); + GGML_ASSERT(ggml_nelements(dst) == steps); + + arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream); + + (void) src0; + (void) src1; + (void) src0_dd; + (void) src1_dd; +} + + +static void ggml_cuda_op_timestep_embedding( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + static void ggml_cuda_op_rms_norm( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { @@ -10437,6 +10509,47 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad); } +static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; + + // dd = data device + float * src0_ddf = nullptr; + float * src1_ddf = nullptr; + float * dst_ddf = nullptr; + + cuda_pool_alloc dst_f; + + ggml_cuda_set_device(g_main_device); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + if (dst_on_device) { + dst_ddf = (float *) dst_extra->data_device[g_main_device]; + } else { + dst_ddf = dst_f.alloc(ggml_nelements(dst)); + } + + // do the computation + ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + CUDA_CHECK(cudaGetLastError()); + + // copy dst to host if necessary + if (!dst_on_device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); + } + + if (dst->backend == GGML_BACKEND_TYPE_CPU) { + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding); +} + static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } @@ -11343,6 +11456,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st case GGML_OP_PAD: func = ggml_cuda_pad; break; + case GGML_OP_ARANGE: + func = ggml_cuda_arange; + break; + case GGML_OP_TIMESTEP_EMBEDDING: + func = ggml_cuda_timestep_embedding; + break; case GGML_OP_LEAKY_RELU: func = ggml_cuda_leaky_relu; break; @@ -12238,6 +12357,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_GROUP_NORM: case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: return true; default: diff --git a/ggml.c b/ggml.c index da3456d40..e96bff787 100644 --- a/ggml.c +++ b/ggml.c @@ -1804,6 +1804,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "POOL_2D", "UPSCALE", "PAD", + "ARANGE", + "TIMESTEP_EMBEDDING", "ARGSORT", "LEAKY_RELU", @@ -1832,7 +1834,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1890,6 +1892,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "pool_2d(x)", "upscale(x)", "pad(x)", + "arange(start, stop, step)", + "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", "leaky_relu(x)", @@ -1918,7 +1922,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -5879,6 +5883,55 @@ struct ggml_tensor * ggml_upscale( return ggml_upscale_impl(ctx, a, scale_factor); } +struct ggml_tensor * ggml_arange( + struct ggml_context * ctx, + float start, + float stop, + float step) { + + GGML_ASSERT(stop > start); + + int64_t steps = (int64_t)ceil((stop - start) / step); + + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps); + + result->op = GGML_OP_ARANGE; + ((float *)(result->op_params))[0] = start; + ((float *)(result->op_params))[1] = stop; + ((float *)(result->op_params))[2] = step; + + return result; +} + + +struct ggml_tensor * ggml_timestep_embedding( + struct ggml_context * ctx, + struct ggml_tensor * timesteps, + int dim, + int max_period) { + bool is_node = false; + + if (timesteps->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + int acutual_dim = dim; + if (dim % 2 != 0) { + acutual_dim = dim + 1; + } + + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, acutual_dim, timesteps->ne[0]); + + result->op = GGML_OP_TIMESTEP_EMBEDDING; + result->op_params[0] = dim; + result->op_params[1] = max_period; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = timesteps; + + return result; +} + // ggml_argsort struct ggml_tensor * ggml_argsort( @@ -10227,11 +10280,11 @@ static void ggml_compute_forward_group_norm_f32( const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03); for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)x[i00]; + sum += (ggml_float)x[i00] / (ne00 * ne01 * step); } } } - float mean = sum / (ne00 * ne01 * step); + float mean = sum; ggml_float sum2 = 0.0; for (int64_t i02 = start; i02 < end; i02++) { @@ -10243,11 +10296,11 @@ static void ggml_compute_forward_group_norm_f32( for (int64_t i00 = 0; i00 < ne00; i00++) { float v = x[i00] - mean; y[i00] = v; - sum2 += (ggml_float)(v * v); + sum2 += (ggml_float)(v * v) / (ne00 * ne01 * step); } } } - float variance = sum2 / (ne00 * ne01 * step); + float variance = sum2; const float scale = 1.0f / sqrtf(variance + eps); for (int64_t i02 = start; i02 < end; i02++) { @@ -13528,6 +13581,109 @@ static void ggml_compute_forward_pad( } } + +// ggml_compute_forward_arange + +static void ggml_compute_forward_arange_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + GGML_ASSERT(dst->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + const float start = ((float*)dst->op_params)[0]; + const float stop = ((float*)dst->op_params)[1]; + const float step = ((float*)dst->op_params)[2]; + + int64_t steps = (int64_t)ceil((stop - start) / step); + GGML_ASSERT(ggml_nelements(dst) == steps); + + for (int64_t i = ith; i < steps; i+= nth) { + float value = start + step * i; + ((float *)dst->data)[i] = value; + } +} + +static void ggml_compute_forward_arange( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (dst->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_arange_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_timestep_embedding_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + int acutual_dim = dim; + if (dim % 2 != 0) { + acutual_dim = dim + 1; + } + int half = dim / 2; + + for (int64_t i = 0; i < ne00; i++) { + float * embed_data = (float *)((char *) dst->data + i*nb1); + for (int64_t j = ith; j < half; j += nth) { + float timestep = ((float *)src0->data)[i]; + float freq = (float)exp(-log(max_period) * j / half); + float arg = timestep * freq; + embed_data[j] = cos(arg); + embed_data[j + half] = sin(arg); + } + if (dim % 2 != 0 && ith == 0) { + embed_data[dim] = 0.f; + } + } +} + +static void ggml_compute_forward_timestep_embedding( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_timestep_embedding_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_argsort static void ggml_compute_forward_argsort_f32( @@ -15593,6 +15749,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad(params, tensor); } break; + case GGML_OP_ARANGE: + { + ggml_compute_forward_arange(params, tensor->src[0], tensor); + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + ggml_compute_forward_timestep_embedding(params, tensor->src[0], tensor); + } break; case GGML_OP_ARGSORT: { ggml_compute_forward_argsort(params, tensor); @@ -16595,6 +16759,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_ARANGE: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_ARGSORT: { GGML_ASSERT(false); // TODO: not implemented @@ -17346,6 +17518,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; + case GGML_OP_ARANGE: + { + n_tasks = n_threads; + } break; + case GGML_OP_TIMESTEP_EMBEDDING: + { + n_tasks = n_threads; + } break; case GGML_OP_ARGSORT: { n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index b7f5a89ee..6c4c034d1 100644 --- a/ggml.h +++ b/ggml.h @@ -461,6 +461,8 @@ extern "C" { GGML_OP_POOL_2D, GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, + GGML_OP_ARANGE, + GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, GGML_OP_LEAKY_RELU, @@ -1658,6 +1660,15 @@ extern "C" { int p2, int p3); + // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 + // timesteps: [N,] + // return: [N, dim] + GGML_API struct ggml_tensor * ggml_timestep_embedding( + struct ggml_context * ctx, + struct ggml_tensor * timesteps, + int dim, + int max_period); + // sort rows enum ggml_sort_order { GGML_SORT_ORDER_ASC, @@ -1669,6 +1680,12 @@ extern "C" { struct ggml_tensor * a, enum ggml_sort_order order); + GGML_API struct ggml_tensor * ggml_arange( + struct ggml_context * ctx, + float start, + float stop, + float step); + // top k elements per row GGML_API struct ggml_tensor * ggml_top_k( struct ggml_context * ctx, diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp index 580ec3e92..71f91fe58 100644 --- a/otherarch/sdcpp/ggml_extend.hpp +++ b/otherarch/sdcpp/ggml_extend.hpp @@ -846,7 +846,7 @@ public: return NULL; } // it's performing a compute, check if backend isn't cpu - if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_CPU) { + if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) { // pass input tensors to gpu memory auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);