Revert "merge missing functions from sdcpp"

This reverts commit 19e1c518f1.
2025-09-10 17:14:36 +00:00 · 2024-03-05 15:38:51 +08:00 · 2024-03-05 15:38:51 +08:00 · c952b4f192
commit c952b4f192
parent b67a906244
4 changed files with 7 additions and 325 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -616,8 +616,6 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
 #define CUDA_UPSCALE_BLOCK_SIZE 256
 #define CUDA_CONCAT_BLOCK_SIZE 256
 #define CUDA_PAD_BLOCK_SIZE 256
-#define CUDA_ARANGE_BLOCK_SIZE 256
-#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 #define CUDA_POOL2D_BLOCK_SIZE 256
@ -1045,38 +1043,6 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
    }
 }

-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    dst[nidx] = start + step * nidx;
-}
-
-static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
-    // blockIDx.y: idx of timesteps->ne[0]
-    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
-    int i = blockIdx.y;
-    int j = threadIdx.x + blockIdx.x * blockDim.x;
-    float * embed_data = (float *)((char *)dst +  i*nb1);
-
-    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
-        embed_data[dim] = 0.f;
-    }
-
-    int half = dim / 2;
-    if (j >= half) {
-        return;
-    }
-
-    float timestep = timesteps[i];
-    float freq = (float)exp(-logf(max_period) * j / half);
-    float arg = timestep * freq;
-    embed_data[j] = cos(arg);
-    embed_data[j + half] = sin(arg);
-}
-
 template <int block_size>
 static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
    int start = blockIdx.x * group_size;
@ -9219,44 +9185,6 @@ static void ggml_cuda_op_pad(
    (void) src1_dd;
 }

-static void ggml_cuda_op_arange(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const float start = ((float*)dst->op_params)[0];
-    const float stop = ((float*)dst->op_params)[1];
-    const float step = ((float*)dst->op_params)[2];
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
-
-    (void) src0;
-    (void) src1;
-    (void) src0_dd;
-    (void) src1_dd;
-}
-
-
-static void ggml_cuda_op_timestep_embedding(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const int dim = dst->op_params[0];
-    const int max_period = dst->op_params[1];
-
-    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 static void ggml_cuda_op_rms_norm(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
@ -10535,47 +10463,6 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 }

-static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool  dst_on_device =              dst->backend == GGML_BACKEND_TYPE_GPU;
-
-    // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float *  dst_ddf = nullptr;
-
-    cuda_pool_alloc<float>  dst_f;
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // do the computation
-    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-}
-
-static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
-}
-
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
@ -11482,12 +11369,6 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
        case GGML_OP_PAD:
            func = ggml_cuda_pad;
            break;
-        case GGML_OP_ARANGE:
-            func = ggml_cuda_arange;
-            break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            func = ggml_cuda_timestep_embedding;
-            break;
        case GGML_OP_LEAKY_RELU:
            func = ggml_cuda_leaky_relu;
            break;
@ -12383,8 +12264,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_GROUP_NORM:
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
-        case GGML_OP_ARANGE:
-        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
            return true;
        default:
--- a/ggml.c
+++ b/ggml.c
@ -1822,8 +1822,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "POOL_2D",
    "UPSCALE",
    "PAD",
-    "ARANGE",
-    "TIMESTEP_EMBEDDING",
    "ARGSORT",
    "LEAKY_RELU",

@ -1852,7 +1850,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@ -1910,8 +1908,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "pool_2d(x)",
    "upscale(x)",
    "pad(x)",
-    "arange(start, stop, step)",
-    "timestep_embedding(timesteps, dim, max_period)",
    "argsort(x)",
    "leaky_relu(x)",

@ -1940,7 +1936,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@ -5903,55 +5899,6 @@ struct ggml_tensor * ggml_upscale(
    return ggml_upscale_impl(ctx, a, scale_factor);
 }

-struct ggml_tensor * ggml_arange(
-    struct ggml_context * ctx,
-    float start,
-    float stop,
-    float step) {
-
-    GGML_ASSERT(stop > start);
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
-
-    result->op = GGML_OP_ARANGE;
-    ((float *)(result->op_params))[0] = start;
-    ((float *)(result->op_params))[1] = stop;
-    ((float *)(result->op_params))[2] = step;
-
-    return result;
-}
-
-
-struct ggml_tensor * ggml_timestep_embedding(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * timesteps,
-            int                   dim,
-            int                   max_period) {
-    bool is_node = false;
-
-    if (timesteps->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    int acutual_dim = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, acutual_dim, timesteps->ne[0]);
-
-    result->op = GGML_OP_TIMESTEP_EMBEDDING;
-    result->op_params[0] = dim;
-    result->op_params[1] = max_period;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = timesteps;
-
-    return result;
-}
-
 // ggml_argsort

 struct ggml_tensor * ggml_argsort(
@ -10300,11 +10247,11 @@ static void ggml_compute_forward_group_norm_f32(
                    const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);

                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        sum += (ggml_float)x[i00] / (ne00 * ne01 * step);
+                        sum += (ggml_float)x[i00];
                    }
                }
            }
-            float mean = sum;
+            float mean = sum / (ne00 * ne01 * step);
            ggml_float sum2 = 0.0;

            for (int64_t i02 = start; i02 < end; i02++) {
@ -10316,11 +10263,11 @@ static void ggml_compute_forward_group_norm_f32(
                    for (int64_t i00 = 0; i00 < ne00; i00++) {
                        float v = x[i00] - mean;
                        y[i00] = v;
-                        sum2 += (ggml_float)(v * v) / (ne00 * ne01 * step);
+                        sum2 += (ggml_float)(v * v);
                    }
                }
            }
-            float variance = sum2;
+            float variance = sum2 / (ne00 * ne01 * step);
            const float scale = 1.0f / sqrtf(variance + eps);

            for (int64_t i02 = start; i02 < end; i02++) {
@ -13601,109 +13548,6 @@ static void ggml_compute_forward_pad(
    }
 }

-
-// ggml_compute_forward_arange
-
-static void ggml_compute_forward_arange_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(dst->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const float start = ((float*)dst->op_params)[0];
-    const float stop = ((float*)dst->op_params)[1];
-    const float step = ((float*)dst->op_params)[2];
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    for (int64_t i = ith; i < steps; i+= nth) {
-        float value = start + step * i;
-        ((float *)dst->data)[i] = value;
-    }
-}
-
-static void ggml_compute_forward_arange(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    switch (dst->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_arange_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_timestep_embedding_f32(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    const int dim = dst->op_params[0];
-    const int max_period = dst->op_params[1];
-    int acutual_dim = dim;
-    if (dim % 2 != 0) {
-        acutual_dim = dim + 1;
-    }
-    int half = dim / 2;
-
-    for (int64_t i = 0; i < ne00; i++) {
-        float * embed_data = (float *)((char *)  dst->data +  i*nb1);
-        for (int64_t j = ith; j < half; j += nth) {
-            float timestep = ((float *)src0->data)[i];
-            float freq = (float)exp(-log(max_period) * j / half);
-            float arg = timestep * freq;
-            embed_data[j] = cos(arg);
-            embed_data[j + half] = sin(arg);
-        }
-        if (dim % 2 != 0 && ith == 0) {
-            embed_data[dim] = 0.f;
-        }
-    }
-}
-
-static void ggml_compute_forward_timestep_embedding(
-    const struct ggml_compute_params * params,
-    const struct ggml_tensor * src0,
-    struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_timestep_embedding_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_argsort

 static void ggml_compute_forward_argsort_f32(
@ -15772,14 +15616,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_pad(params, tensor);
            } break;
-        case GGML_OP_ARANGE:
-            {
-                ggml_compute_forward_arange(params, tensor->src[0], tensor);
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                ggml_compute_forward_timestep_embedding(params, tensor->src[0], tensor);
-            } break;
        case GGML_OP_ARGSORT:
            {
                ggml_compute_forward_argsort(params, tensor);
@ -16782,14 +16618,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
-        case GGML_OP_ARANGE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
        case GGML_OP_ARGSORT:
            {
                GGML_ASSERT(false); // TODO: not implemented
@ -17541,14 +17369,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            {
                n_tasks = n_threads;
            } break;
-        case GGML_OP_ARANGE:
-            {
-                n_tasks = n_threads;
-            } break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            {
-                n_tasks = n_threads;
-            } break;
        case GGML_OP_ARGSORT:
            {
                n_tasks = n_threads;
--- a/ggml.h
+++ b/ggml.h
@ -461,8 +461,6 @@ extern "C" {
        GGML_OP_POOL_2D,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
-        GGML_OP_ARANGE,
-        GGML_OP_TIMESTEP_EMBEDDING,
        GGML_OP_ARGSORT,
        GGML_OP_LEAKY_RELU,

@ -1670,15 +1668,6 @@ extern "C" {
            int                  p2,
            int                  p3);

-    // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
-    // timesteps: [N,]
-    // return: [N, dim]
-    GGML_API struct ggml_tensor * ggml_timestep_embedding(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * timesteps,
-            int                   dim,
-            int                   max_period);
-
    // sort rows
    enum ggml_sort_order {
        GGML_SORT_ORDER_ASC,
@ -1690,12 +1679,6 @@ extern "C" {
            struct ggml_tensor  * a,
            enum ggml_sort_order  order);

-    GGML_API struct ggml_tensor * ggml_arange(
-            struct ggml_context * ctx,
-            float                  start,
-            float                  stop,
-            float                  step);
-
    // top k elements per row
    GGML_API struct ggml_tensor * ggml_top_k(
            struct ggml_context * ctx,
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@ -846,7 +846,7 @@ public:
            return NULL;
        }
        // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
+        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_CPU) {
            // pass input tensors to gpu memory
            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);