Revert "merge missing functions from sdcpp"

This reverts commit 19e1c518f1.
2025-09-12 09:59:41 +00:00 · 2024-03-05 15:38:51 +08:00 · 2024-03-05 15:38:51 +08:00 · c952b4f192
commit c952b4f192
parent b67a906244
4 changed files with 7 additions and 325 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -616,8 +616,6 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
 #define CUDA_UPSCALE_BLOCK_SIZE 256
 #define CUDA_CONCAT_BLOCK_SIZE 256
 #define CUDA_PAD_BLOCK_SIZE 256
-#define CUDA_ARANGE_BLOCK_SIZE 256
-#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 #define CUDA_POOL2D_BLOCK_SIZE 256
@ -1045,38 +1043,6 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
    }
 }

-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
-    if (nidx >= ne0) {
-        return;
-    }
-    dst[nidx] = start + step * nidx;
-}
-
-static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
-    // blockIDx.y: idx of timesteps->ne[0]
-    // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
-    int i = blockIdx.y;
-    int j = threadIdx.x + blockIdx.x * blockDim.x;
-    float * embed_data = (float *)((char *)dst +  i*nb1);
-
-    if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
-        embed_data[dim] = 0.f;
-    }
-
-    int half = dim / 2;
-    if (j >= half) {
-        return;
-    }
-
-    float timestep = timesteps[i];
-    float freq = (float)exp(-logf(max_period) * j / half);
-    float arg = timestep * freq;
-    embed_data[j] = cos(arg);
-    embed_data[j + half] = sin(arg);
-}
-
 template <int block_size>
 static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
    int start = blockIdx.x * group_size;
@ -9219,44 +9185,6 @@ static void ggml_cuda_op_pad(
    (void) src1_dd;
 }

-static void ggml_cuda_op_arange(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const float start = ((float*)dst->op_params)[0];
-    const float stop = ((float*)dst->op_params)[1];
-    const float step = ((float*)dst->op_params)[2];
-
-    int64_t steps = (int64_t)ceil((stop - start) / step);
-    GGML_ASSERT(ggml_nelements(dst) == steps);
-
-    arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
-
-    (void) src0;
-    (void) src1;
-    (void) src0_dd;
-    (void) src1_dd;
-}
-
-
-static void ggml_cuda_op_timestep_embedding(
-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const int dim = dst->op_params[0];
-    const int max_period = dst->op_params[1];
-
-    timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
 static void ggml_cuda_op_rms_norm(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
    const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
@ -10535,47 +10463,6 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 }

-static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool  dst_on_device =              dst->backend == GGML_BACKEND_TYPE_GPU;
-
-    // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float *  dst_ddf = nullptr;
-
-    cuda_pool_alloc<float>  dst_f;
-
-    ggml_cuda_set_device(g_main_device);
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // do the computation
-    ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    CUDA_CHECK(cudaGetLastError());
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-}
-
-static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
-}
-
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
@ -11482,12 +11369,6 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
        case GGML_OP_PAD:
            func = ggml_cuda_pad;
            break;
-        case GGML_OP_ARANGE:
-            func = ggml_cuda_arange;
-            break;
-        case GGML_OP_TIMESTEP_EMBEDDING:
-            func = ggml_cuda_timestep_embedding;
-            break;
        case GGML_OP_LEAKY_RELU:
            func = ggml_cuda_leaky_relu;
            break;
@ -12383,8 +12264,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_GROUP_NORM:
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
-        case GGML_OP_ARANGE:
-        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
            return true;
        default: