From c952b4f192abcdfc2e82812b6e5c1c5fc225ab2c Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Tue, 5 Mar 2024 15:38:51 +0800
Subject: [PATCH] Revert "merge missing functions from sdcpp"

This reverts commit 19e1c518f14273cd8885d7de9296f9676108ae80.
---
 ggml-cuda.cu                    | 121 --------------------
 ggml.c                          | 192 +-------------------------------
 ggml.h                          |  17 ---
 otherarch/sdcpp/ggml_extend.hpp |   2 +-
 4 files changed, 7 insertions(+), 325 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f9ff661a3..66a597056 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -616,8 +616,6 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
 #define CUDA_UPSCALE_BLOCK_SIZE 256
 #define CUDA_CONCAT_BLOCK_SIZE 256
 #define CUDA_PAD_BLOCK_SIZE 256
-#define CUDA_ARANGE_BLOCK_SIZE 256
-#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 #define CUDA_POOL2D_BLOCK_SIZE 256
@@ -1045,38 +1043,6 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
     }
 }
 
-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    dst[nidx] = start + step * nidx;
-}
-
-static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
-    // blockIDx.y: idx of timesteps->ne[0]
-    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
-    int i = blockIdx.y;
-    int j = threadIdx.x + blockIdx.x * blockDim.x;
-    float * embed_data = (float *)((char *)dst + i*nb1);
-
-    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
-        embed_data[dim] = 0.f;
-    }
-
-    int half = dim / 2;
-    if (j >= half) {
-        return;
-    }
-
-    float timestep = timesteps[i];
-    float freq = (float)exp(-logf(max_period) * j / half);
-    float arg = timestep * freq;
-    embed_data[j] = cos(arg);
-    embed_data[j + half] = sin(arg);
-}
-
 template <int block_size>
 static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
     int start = blockIdx.x * group_size;
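For reference, the removed timestep_embedding_f32 kernel implements the sinusoidal
embedding used by stable-diffusion (the CompVis reference link appears in the ggml.h
hunk later in this patch): the first half of each output row holds cos(t * freq_j),
the second half sin(t * freq_j), with freq_j = exp(-ln(max_period) * j / half), and
one trailing slot is zero-padded when dim is odd. A minimal CPU sketch of that
per-row math follows; the helper name is a hypothetical stand-in, not the ggml API:

    #include <math.h>

    /* One row of the sinusoidal timestep embedding, mirroring the kernel above.
       For odd dim the row must hold dim + 1 floats, matching the kernel's
       zero-padded slot. */
    static void timestep_embedding_row(float timestep, float * row, int dim, int max_period) {
        const int half = dim / 2;
        for (int j = 0; j < half; j++) {
            const float freq = expf(-logf((float) max_period) * (float) j / (float) half);
            const float arg  = timestep * freq;
            row[j]        = cosf(arg);  /* cosine half */
            row[j + half] = sinf(arg);  /* sine half   */
        }
        if (dim % 2 != 0) {
            row[dim] = 0.0f;            /* zero pad for odd dim */
        }
    }
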
@@ -9219,44 +9185,6 @@ static void ggml_cuda_op_pad(
     (void) src1_dd;
 }
 
-static void ggml_cuda_op_arange(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const float start = ((float*)dst->op_params)[0];
-    const float stop = ((float*)dst->op_params)[1];
-    const float step = ((float*)dst->op_params)[2];
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
-
-    (void) src0;
-    (void) src1;
-    (void) src0_dd;
-    (void) src1_dd;
-}
-
-
-static void ggml_cuda_op_timestep_embedding(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const int dim = dst->op_params[0];
-    const int max_period = dst->op_params[1];
-
-    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 static void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
@@ -10535,47 +10463,6 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 }
 
-static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-
-    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
-
-    // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float * dst_ddf = nullptr;
-
-    cuda_pool_alloc<float> dst_f;
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // do the computation
-    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-}
-
-static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
-}
-
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
@@ -11482,12 +11369,6 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
         case GGML_OP_PAD:
             func = ggml_cuda_pad;
             break;
-        case GGML_OP_ARANGE:
-            func = ggml_cuda_arange;
-            break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            func = ggml_cuda_timestep_embedding;
-            break;
         case GGML_OP_LEAKY_RELU:
             func = ggml_cuda_leaky_relu;
             break;
@@ -12383,8 +12264,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
-        case GGML_OP_ARANGE:
-        case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_LEAKY_RELU:
             return true;
         default:
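The removed host-side code above follows one fixed contract: ggml_cuda_op_arange
reads start/stop/step from the op parameters, asserts that the destination holds
exactly ceil((stop - start) / step) elements, and the kernel writes start + step * i
at index i (the wrapper ggml_cuda_arange only adds the device-buffer allocation and
the copy back to host when dst does not live on the GPU). A minimal CPU sketch of
those arange semantics; arange_fill and the values in main are illustrative
assumptions, not the ggml API:

    #include <math.h>
    #include <stdio.h>

    /* Fill dst[0..n-1] with start + step * i, as the removed kernel does. */
    static void arange_fill(float * dst, float start, float step, long n) {
        for (long i = 0; i < n; i++) {
            dst[i] = start + step * (float) i;
        }
    }

    int main(void) {
        const float start = 0.0f, stop = 5.0f, step = 1.5f;
        /* element count rule shared by the builder, the CPU forward pass,
           and the CUDA op */
        const long n = (long) ceilf((stop - start) / step);  /* 4 */
        float dst[8];
        arange_fill(dst, start, step, n);
        for (long i = 0; i < n; i++) {
            printf("%g ", dst[i]);  /* prints: 0 1.5 3 4.5 */
        }
        printf("\n");
        return 0;
    }
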
diff --git a/ggml.c b/ggml.c
index 35c54ee03..b7ecb1de9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1822,8 +1822,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_2D",
     "UPSCALE",
     "PAD",
-    "ARANGE",
-    "TIMESTEP_EMBEDDING",
     "ARGSORT",
     "LEAKY_RELU",
 
@@ -1852,7 +1850,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1910,8 +1908,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_2d(x)",
     "upscale(x)",
     "pad(x)",
-    "arange(start, stop, step)",
-    "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
     "leaky_relu(x)",
 
@@ -1940,7 +1936,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -5903,55 +5899,6 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }
 
-struct ggml_tensor * ggml_arange(
-    struct ggml_context * ctx,
-    float start,
-    float stop,
-    float step) {
-
-    GGML_ASSERT(stop > start);
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
-
-    result->op = GGML_OP_ARANGE;
-    ((float *)(result->op_params))[0] = start;
-    ((float *)(result->op_params))[1] = stop;
-    ((float *)(result->op_params))[2] = step;
-
-    return result;
-}
-
-
-struct ggml_tensor * ggml_timestep_embedding(
-    struct ggml_context * ctx,
-    struct ggml_tensor * timesteps,
-    int dim,
-    int max_period) {
-    bool is_node = false;
-
-    if (timesteps->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    int acutual_dim = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, acutual_dim, timesteps->ne[0]);
-
-    result->op = GGML_OP_TIMESTEP_EMBEDDING;
-    result->op_params[0] = dim;
-    result->op_params[1] = max_period;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = timesteps;
-
-    return result;
-}
-
 // ggml_argsort
 
 struct ggml_tensor * ggml_argsort(
@@ -10300,11 +10247,11 @@ static void ggml_compute_forward_group_norm_f32(
                 const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
 
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)x[i00] / (ne00 * ne01 * step);
+                    sum += (ggml_float)x[i00];
                 }
             }
         }
-        float mean = sum;
+        float mean = sum / (ne00 * ne01 * step);
         ggml_float sum2 = 0.0;
 
         for (int64_t i02 = start; i02 < end; i02++) {
@@ -10316,11 +10263,11 @@ static void ggml_compute_forward_group_norm_f32(
                 for (int64_t i00 = 0; i00 < ne00; i00++) {
                     float v = x[i00] - mean;
                     y[i00] = v;
-                    sum2 += (ggml_float)(v * v) / (ne00 * ne01 * step);
+                    sum2 += (ggml_float)(v * v);
                 }
             }
         }
-        float variance = sum2;
+        float variance = sum2 / (ne00 * ne01 * step);
         const float scale = 1.0f / sqrtf(variance + eps);
 
         for (int64_t i02 = start; i02 < end; i02++) {
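The two hunks above revert the group-norm statistics to the sum-then-divide form:
raw sums are accumulated in the wider ggml_float type and divided once by the group
element count ne00 * ne01 * step, after which the code scales by
1.0f / sqrtf(variance + eps). The pre-revert variant divided every term inside the
loop instead; the two forms are algebraically equivalent and differ only in
floating-point rounding and overflow behavior. A minimal sketch of the reverted-to
two-pass computation over a flat group of n elements; the helper is hypothetical,
not the ggml API:

    /* Two-pass mean/variance: sum first, divide once (double stands in for
       ggml_float). The second ggml pass also stores the centered values;
       that is omitted here. */
    static void group_mean_variance(const float * x, long n, float * mean, float * variance) {
        double sum = 0.0;
        for (long i = 0; i < n; i++) {
            sum += (double) x[i];
        }
        *mean = (float) (sum / (double) n);      /* divide once, as after the revert */

        double sum2 = 0.0;
        for (long i = 0; i < n; i++) {
            const double v = (double) x[i] - (double) *mean;
            sum2 += v * v;                       /* raw sum of squares */
        }
        *variance = (float) (sum2 / (double) n); /* divide once */
    }
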
@@ -13601,109 +13548,6 @@ static void ggml_compute_forward_pad(
     }
 }
 
-
-// ggml_compute_forward_arange
-
-static void ggml_compute_forward_arange_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(dst->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const float start = ((float*)dst->op_params)[0];
-    const float stop = ((float*)dst->op_params)[1];
-    const float step = ((float*)dst->op_params)[2];
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    for (int64_t i = ith; i < steps; i+= nth) {
-        float value = start + step * i;
-        ((float *)dst->data)[i] = value;
-    }
-}
-
-static void ggml_compute_forward_arange(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    switch (dst->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_arange_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_timestep_embedding_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int dim = dst->op_params[0];
-    const int max_period = dst->op_params[1];
-    int acutual_dim = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-    int half = dim / 2;
-
-    for (int64_t i = 0; i < ne00; i++) {
-        float * embed_data = (float *)((char *) dst->data + i*nb1);
-        for (int64_t j = ith; j < half; j += nth) {
-            float timestep = ((float *)src0->data)[i];
-            float freq = (float)exp(-log(max_period) * j / half);
-            float arg = timestep * freq;
-            embed_data[j] = cos(arg);
-            embed_data[j + half] = sin(arg);
-        }
-        if (dim % 2 != 0 && ith == 0) {
-            embed_data[dim] = 0.f;
-        }
-    }
-}
-
-static void ggml_compute_forward_timestep_embedding(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_timestep_embedding_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_argsort
 
 static void ggml_compute_forward_argsort_f32(
@@ -15772,14 +15616,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pad(params, tensor);
             } break;
-        case GGML_OP_ARANGE:
-            {
-                ggml_compute_forward_arange(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                ggml_compute_forward_timestep_embedding(params, tensor->src[0], tensor);
-            } break;
         case GGML_OP_ARGSORT:
             {
                 ggml_compute_forward_argsort(params, tensor);
@@ -16782,14 +16618,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case GGML_OP_ARANGE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
         case GGML_OP_ARGSORT:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -17541,14 +17369,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
-        case GGML_OP_ARANGE:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                n_tasks = n_threads;
-            } break;
         case GGML_OP_ARGSORT:
             {
                 n_tasks = n_threads;
diff --git a/ggml.h b/ggml.h
index 46f696ca5..8009986b7 100644
--- a/ggml.h
+++ b/ggml.h
@@ -461,8 +461,6 @@ extern "C" {
         GGML_OP_POOL_2D,
         GGML_OP_UPSCALE, // nearest interpolate
         GGML_OP_PAD,
-        GGML_OP_ARANGE,
-        GGML_OP_TIMESTEP_EMBEDDING,
         GGML_OP_ARGSORT,
         GGML_OP_LEAKY_RELU,
 
@@ -1670,15 +1668,6 @@ extern "C" {
             int                  p2,
             int                  p3);
 
-    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
-    // timesteps: [N,]
-    // return: [N, dim]
-    GGML_API struct ggml_tensor * ggml_timestep_embedding(
-            struct ggml_context * ctx,
-            struct ggml_tensor * timesteps,
-            int                  dim,
-            int                  max_period);
-
     // sort rows
     enum ggml_sort_order {
         GGML_SORT_ORDER_ASC,
@@ -1690,12 +1679,6 @@ extern "C" {
             struct ggml_tensor * a,
             enum ggml_sort_order order);
 
-    GGML_API struct ggml_tensor * ggml_arange(
-            struct ggml_context * ctx,
-            float                start,
-            float                stop,
-            float                step);
-
     // top k elements per row
     GGML_API struct ggml_tensor * ggml_top_k(
             struct ggml_context * ctx,
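Before this revert, the two declarations removed above composed directly:
ggml_arange builds a 1-D F32 tensor with ceil((stop - start) / step) elements, and
ggml_timestep_embedding maps an [N] tensor of timesteps to an [N, dim] embedding
(dim + 1 columns internally when dim is odd). A sketch of that pre-revert usage;
the caller-provided context and the sizes (1000 timesteps, dim 320,
max_period 10000) are illustrative assumptions:

    #include "ggml.h"  /* the pre-revert header that still declares both ops */

    static void build_timestep_embedding(struct ggml_context * ctx) {
        /* [1000] tensor holding 0, 1, ..., 999 */
        struct ggml_tensor * t = ggml_arange(ctx, 0.0f, 1000.0f, 1.0f);

        /* [1000, 320] sinusoidal embedding of each timestep */
        struct ggml_tensor * emb = ggml_timestep_embedding(ctx, t, 320, 10000);
        (void) emb;  /* would be wired into the rest of the graph */
    }
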
diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp
index 71f91fe58..580ec3e92 100644
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@@ -846,7 +846,7 @@ public:
             return NULL;
         }
         // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
+        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_CPU) {
             // pass input tensors to gpu memory
             auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
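The final hunk only swaps the enum spelling in the backend check back to
GGML_BACKEND_CPU, the name this file used before the reverted commit; the logic is
unchanged: when the compute backend is not the CPU but an input tensor's data still
lives in host memory, the tensor is duplicated into the compute context and
uploaded first. A sketch of that guard with hypothetical stand-in types; only the
condition mirrors the hunk, nothing here is the sd.cpp API:

    enum backend_kind { BACKEND_CPU, BACKEND_GPU };

    struct tensor_stub {
        enum backend_kind backend;  /* where this tensor's data currently lives */
    };

    /* Mirrors: !ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_CPU */
    static int needs_upload(enum backend_kind compute_backend, const struct tensor_stub * t) {
        return compute_backend != BACKEND_CPU && t->backend == BACKEND_CPU;
    }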