From c952b4f192abcdfc2e82812b6e5c1c5fc225ab2c Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Tue, 5 Mar 2024 15:38:51 +0800
Subject: [PATCH] Revert "merge missing functions from sdcpp"

This reverts commit 19e1c518f14273cd8885d7de9296f9676108ae80.
---
 ggml-cuda.cu                    | 121 --------------------
 ggml.c                          | 192 +-------------------------------
 ggml.h                          |  17 ---
 otherarch/sdcpp/ggml_extend.hpp |   2 +-
 4 files changed, 7 insertions(+), 325 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f9ff661a3..66a597056 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -616,8 +616,6 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
 #define CUDA_UPSCALE_BLOCK_SIZE 256
 #define CUDA_CONCAT_BLOCK_SIZE 256
 #define CUDA_PAD_BLOCK_SIZE 256
-#define CUDA_ARANGE_BLOCK_SIZE 256
-#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 #define CUDA_POOL2D_BLOCK_SIZE 256
@@ -1045,38 +1043,6 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
     }
 }
 
-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    dst[nidx] = start + step * nidx;
-}
-
-static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
-    // blockIDx.y: idx of timesteps->ne[0]
-    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
-    int i = blockIdx.y;
-    int j = threadIdx.x + blockIdx.x * blockDim.x;
-    float * embed_data = (float *)((char *)dst + i*nb1);
-
-    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
-        embed_data[dim] = 0.f;
-    }
-
-    int half = dim / 2;
-    if (j >= half) {
-        return;
-    }
-
-    float timestep = timesteps[i];
-    float freq = (float)exp(-logf(max_period) * j / half);
-    float arg = timestep * freq;
-    embed_data[j] = cos(arg);
-    embed_data[j + half] = sin(arg);
-}
-
 template <int block_size>
 static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
     int start = blockIdx.x * group_size;
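For reference, the removed timestep_embedding_f32 kernel implements the sinusoidal
embedding used by stable-diffusion (the CompVis reference link appears in the ggml.h
hunk later in this patch): the first half of each output row holds cos(t * freq_j),
the second half sin(t * freq_j), with freq_j = exp(-ln(max_period) * j / half), and
one trailing slot is zero-padded when dim is odd. A minimal CPU sketch of that
per-row math follows; the helper name is a hypothetical stand-in, not the ggml API:

    #include <math.h>

    /* One row of the sinusoidal timestep embedding, mirroring the kernel above.
       For odd dim the row must hold dim + 1 floats, matching the kernel's
       zero-padded slot. */
    static void timestep_embedding_row(float timestep, float * row, int dim, int max_period) {
        const int half = dim / 2;
        for (int j = 0; j < half; j++) {
            const float freq = expf(-logf((float) max_period) * (float) j / (float) half);
            const float arg  = timestep * freq;
            row[j]        = cosf(arg);  /* cosine half */
            row[j + half] = sinf(arg);  /* sine half   */
        }
        if (dim % 2 != 0) {
            row[dim] = 0.0f;            /* zero pad for odd dim */
        }
    }
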
@@ -9219,44 +9185,6 @@ static void ggml_cuda_op_pad(
     (void) src1_dd;
 }
 
-static void ggml_cuda_op_arange(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const float start = ((float*)dst->op_params)[0];
-    const float stop = ((float*)dst->op_params)[1];
-    const float step = ((float*)dst->op_params)[2];
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
-
-    (void) src0;
-    (void) src1;
-    (void) src0_dd;
-    (void) src1_dd;
-}
-
-
-static void ggml_cuda_op_timestep_embedding(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const int dim = dst->op_params[0];
-    const int max_period = dst->op_params[1];
-
-    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 static void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
@@ -10535,47 +10463,6 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 }
 
-static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-
-    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
-
-    // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float * dst_ddf = nullptr;
-
-    cuda_pool_alloc<float> dst_f;
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // do the computation
-    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-}
-
-static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
-}
-
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
@@ -11482,12 +11369,6 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_OP_PAD:
             func = ggml_cuda_pad;
             break;
-        case GGML_OP_ARANGE:
-            func = ggml_cuda_arange;
-            break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            func = ggml_cuda_timestep_embedding;
-            break;
         case GGML_OP_LEAKY_RELU:
             func = ggml_cuda_leaky_relu;
             break;
@@ -12383,8 +12264,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
-        case GGML_OP_ARANGE:
-        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
             return true;
         default:
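The removed host-side code above follows one fixed contract: ggml_cuda_op_arange
reads start/stop/step from the op parameters, asserts that the destination holds
exactly ceil((stop - start) / step) elements, and the kernel writes start + step * i
at index i (the wrapper ggml_cuda_arange only adds the device-buffer allocation and
the copy back to host when dst does not live on the GPU). A minimal CPU sketch of
those arange semantics; arange_fill and the values in main are illustrative
assumptions, not the ggml API:

    #include <math.h>
    #include <stdio.h>

    /* Fill dst[0..n-1] with start + step * i, as the removed kernel does. */
    static void arange_fill(float * dst, float start, float step, long n) {
        for (long i = 0; i < n; i++) {
            dst[i] = start + step * (float) i;
        }
    }

    int main(void) {
        const float start = 0.0f, stop = 5.0f, step = 1.5f;
        /* element count rule shared by the builder, the CPU forward pass,
           and the CUDA op */
        const long n = (long) ceilf((stop - start) / step);  /* 4 */
        float dst[8];
        arange_fill(dst, start, step, n);
        for (long i = 0; i < n; i++) {
            printf("%g ", dst[i]);  /* prints: 0 1.5 3 4.5 */
        }
        printf("\n");
        return 0;
    }
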
diff --git a/ggml.c b/ggml.c
index 35c54ee03..b7ecb1de9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1822,8 +1822,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_2D",
     "UPSCALE",
     "PAD",
-    "ARANGE",
-    "TIMESTEP_EMBEDDING",
     "ARGSORT",
     "LEAKY_RELU",
 
@@ -1852,7 +1850,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1910,8 +1908,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_2d(x)",
     "upscale(x)",
     "pad(x)",
-    "arange(start, stop, step)",
-    "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
     "leaky_relu(x)",
 
@@ -1940,7 +1936,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -5903,55 +5899,6 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }
 
-struct ggml_tensor * ggml_arange(
-    struct ggml_context * ctx,
-    float start,
-    float stop,
-    float step) {
-
-    GGML_ASSERT(stop > start);
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
-
-    result->op = GGML_OP_ARANGE;
-    ((float *)(result->op_params))[0] = start;
-    ((float *)(result->op_params))[1] = stop;
-    ((float *)(result->op_params))[2] = step;
-
-    return result;
-}
-
-
-struct ggml_tensor * ggml_timestep_embedding(
-    struct ggml_context * ctx,
-    struct ggml_tensor * timesteps,
-    int dim,
-    int max_period) {
-    bool is_node = false;
-
-    if (timesteps->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    int acutual_dim = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, acutual_dim, timesteps->ne[0]);
-
-    result->op = GGML_OP_TIMESTEP_EMBEDDING;
-    result->op_params[0] = dim;
-    result->op_params[1] = max_period;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = timesteps;
-
-    return result;
-}
-
 // ggml_argsort
 
 struct ggml_tensor * ggml_argsort(
@@ -10300,11 +10247,11 @@ static void ggml_compute_forward_group_norm_f32(
                 const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
 
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)x[i00] / (ne00 * ne01 * step);
+                    sum += (ggml_float)x[i00];
                 }
             }
         }
-        float mean = sum;
+        float mean = sum / (ne00 * ne01 * step);
         ggml_float sum2 = 0.0;
 
         for (int64_t i02 = start; i02 < end; i02++) {
@@ -10316,11 +10263,11 @@ static void ggml_compute_forward_group_norm_f32(
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
                     float v = x[i00] - mean;
                     y[i00] = v;
-                    sum2 += (ggml_float)(v * v) / (ne00 * ne01 * step);
+                    sum2 += (ggml_float)(v * v);
                 }
             }
         }
-        float variance = sum2;
+        float variance = sum2 / (ne00 * ne01 * step);
         const float scale = 1.0f / sqrtf(variance + eps);
 
         for (int64_t i02 = start; i02 < end; i02++) {
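The two hunks above revert the group-norm statistics to the sum-then-divide form:
raw sums are accumulated in the wider ggml_float type and divided once by the group
element count ne00 * ne01 * step, after which the code scales by
1.0f / sqrtf(variance + eps). The pre-revert variant divided every term inside the
loop instead; the two forms are algebraically equivalent and differ only in
floating-point rounding and overflow behavior. A minimal sketch of the reverted-to
two-pass computation over a flat group of n elements; the helper is hypothetical,
not the ggml API:

    /* Two-pass mean/variance: sum first, divide once (double stands in for
       ggml_float). The second ggml pass also stores the centered values;
       that is omitted here. */
    static void group_mean_variance(const float * x, long n, float * mean, float * variance) {
        double sum = 0.0;
        for (long i = 0; i < n; i++) {
            sum += (double) x[i];
        }
        *mean = (float) (sum / (double) n);      /* divide once, as after the revert */

        double sum2 = 0.0;
        for (long i = 0; i < n; i++) {
            const double v = (double) x[i] - (double) *mean;
            sum2 += v * v;                       /* raw sum of squares */
        }
        *variance = (float) (sum2 / (double) n); /* divide once */
    }
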
@@ -13601,109 +13548,6 @@ static void ggml_compute_forward_pad(
     }
 }
 
-
-// ggml_compute_forward_arange
-
-static void ggml_compute_forward_arange_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(dst->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const float start = ((float*)dst->op_params)[0];
-    const float stop = ((float*)dst->op_params)[1];
-    const float step = ((float*)dst->op_params)[2];
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    for (int64_t i = ith; i < steps; i+= nth) {
-        float value = start + step * i;
-        ((float *)dst->data)[i] = value;
-    }
-}
-
-static void ggml_compute_forward_arange(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    switch (dst->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_arange_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_timestep_embedding_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int dim = dst->op_params[0];
-    const int max_period = dst->op_params[1];
-    int acutual_dim = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-    int half = dim / 2;
-
-    for (int64_t i = 0; i < ne00; i++) {
-        float * embed_data = (float *)((char *) dst->data + i*nb1);
-        for (int64_t j = ith; j < half; j += nth) {
-            float timestep = ((float *)src0->data)[i];
-            float freq = (float)exp(-log(max_period) * j / half);
-            float arg = timestep * freq;
-            embed_data[j] = cos(arg);
-            embed_data[j + half] = sin(arg);
-        }
-        if (dim % 2 != 0 && ith == 0) {
-            embed_data[dim] = 0.f;
-        }
-    }
-}
-
-static void ggml_compute_forward_timestep_embedding(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_timestep_embedding_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_argsort
 
 static void ggml_compute_forward_argsort_f32(
@@ -15772,14 +15616,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pad(params, tensor);
             } break;
-        case GGML_OP_ARANGE:
-            {
-                ggml_compute_forward_arange(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                ggml_compute_forward_timestep_embedding(params, tensor->src[0], tensor);
-            } break;
         case GGML_OP_ARGSORT:
             {
                 ggml_compute_forward_argsort(params, tensor);
@@ -16782,14 +16618,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case GGML_OP_ARANGE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
         case GGML_OP_ARGSORT:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -17541,14 +17369,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
-        case GGML_OP_ARANGE:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                n_tasks = n_threads;
-            } break;
         case GGML_OP_ARGSORT:
             {
                 n_tasks = n_threads;
diff --git a/ggml.h b/ggml.h
index 46f696ca5..8009986b7 100644
--- a/ggml.h
+++ b/ggml.h
@@ -461,8 +461,6 @@ extern "C" {
         GGML_OP_POOL_2D,
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
-        GGML_OP_ARANGE,
-        GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
 
@@ -1670,15 +1668,6 @@ extern "C" {
             int                  p2,
             int                  p3);
 
-    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
-    // timesteps: [N,]
-    // return: [N, dim]
-    GGML_API struct ggml_tensor * ggml_timestep_embedding(
-            struct ggml_context * ctx,
-            struct ggml_tensor * timesteps,
-            int                  dim,
-            int                  max_period);
-
     // sort rows
     enum ggml_sort_order {
         GGML_SORT_ORDER_ASC,
@@ -1690,12 +1679,6 @@ extern "C" {
             struct ggml_tensor * a,
             enum ggml_sort_order order);
 
-    GGML_API struct ggml_tensor * ggml_arange(
-            struct ggml_context * ctx,
-            float                start,
-            float                stop,
-            float                step);
-
     // top k elements per row
     GGML_API struct ggml_tensor * ggml_top_k(
             struct ggml_context * ctx,
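Before this revert, the two declarations removed above composed directly:
ggml_arange builds a 1-D F32 tensor with ceil((stop - start) / step) elements, and
ggml_timestep_embedding maps an [N] tensor of timesteps to an [N, dim] embedding
(dim + 1 columns internally when dim is odd). A sketch of that pre-revert usage;
the caller-provided context and the sizes (1000 timesteps, dim 320,
max_period 10000) are illustrative assumptions:

    #include "ggml.h"  /* the pre-revert header that still declares both ops */

    static void build_timestep_embedding(struct ggml_context * ctx) {
        /* [1000] tensor holding 0, 1, ..., 999 */
        struct ggml_tensor * t = ggml_arange(ctx, 0.0f, 1000.0f, 1.0f);

        /* [1000, 320] sinusoidal embedding of each timestep */
        struct ggml_tensor * emb = ggml_timestep_embedding(ctx, t, 320, 10000);
        (void) emb;  /* would be wired into the rest of the graph */
    }
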
diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp
index 71f91fe58..580ec3e92 100644
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@@ -846,7 +846,7 @@ public:
             return NULL;
         }
         // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
+        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_CPU) {
             // pass input tensors to gpu memory
             auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
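The final hunk only swaps the enum spelling in the backend check back to
GGML_BACKEND_CPU, the name this file used before the reverted commit; the logic is
unchanged: when the compute backend is not the CPU but an input tensor's data still
lives in host memory, the tensor is duplicated into the compute context and
uploaded first. A sketch of that guard with hypothetical stand-in types; only the
condition mirrors the hunk, nothing here is the sd.cpp API:

    enum backend_kind { BACKEND_CPU, BACKEND_GPU };

    struct tensor_stub {
        enum backend_kind backend;  /* where this tensor's data currently lives */
    };

    /* Mirrors: !ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_CPU */
    static int needs_upload(enum backend_kind compute_backend, const struct tensor_stub * t) {
        return compute_backend != BACKEND_CPU && t->backend == BACKEND_CPU;
    }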