merge missing functions from sdcpp

Concedo 2024-02-28 16:38:00 +08:00
parent 26696970ce
commit 19e1c518f1
4 changed files with 325 additions and 7 deletions


@@ -606,6 +606,8 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
#define CUDA_UPSCALE_BLOCK_SIZE 256
#define CUDA_CONCAT_BLOCK_SIZE 256
#define CUDA_PAD_BLOCK_SIZE 256
#define CUDA_ARANGE_BLOCK_SIZE 256
#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
#define CUDA_ACC_BLOCK_SIZE 256
#define CUDA_IM2COL_BLOCK_SIZE 256
#define CUDA_POOL2D_BLOCK_SIZE 256
@@ -1033,6 +1035,38 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
}
}
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
// blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
dst[nidx] = start + step * nidx;
}
static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
// blockIDx.y: idx of timesteps->ne[0]
// blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
int i = blockIdx.y;
int j = threadIdx.x + blockIdx.x * blockDim.x;
float * embed_data = (float *)((char *)dst + i*nb1);
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
embed_data[dim] = 0.f;
}
int half = dim / 2;
if (j >= half) {
return;
}
float timestep = timesteps[i];
float freq = (float)exp(-logf(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cos(arg);
embed_data[j + half] = sin(arg);
}
template <int block_size>
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
int start = blockIdx.x * group_size;
@@ -9159,6 +9193,44 @@ static void ggml_cuda_op_pad(
(void) src1_dd;
}
static void ggml_cuda_op_arange(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const float start = ((float*)dst->op_params)[0];
const float stop = ((float*)dst->op_params)[1];
const float step = ((float*)dst->op_params)[2];
int64_t steps = (int64_t)ceil((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
(void) src0;
(void) src1;
(void) src0_dd;
(void) src1_dd;
}
static void ggml_cuda_op_timestep_embedding(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
(void) src1;
(void) dst;
(void) src1_dd;
}
static void ggml_cuda_op_rms_norm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
@@ -10437,6 +10509,47 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
}
static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
// dd = data device
float * src0_ddf = nullptr;
float * src1_ddf = nullptr;
float * dst_ddf = nullptr;
cuda_pool_alloc<float> dst_f;
ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
if (dst_on_device) {
dst_ddf = (float *) dst_extra->data_device[g_main_device];
} else {
dst_ddf = dst_f.alloc(ggml_nelements(dst));
}
// do the computation
ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
CUDA_CHECK(cudaGetLastError());
// copy dst to host if necessary
if (!dst_on_device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
}
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
}
}
static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
}
static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
}
@@ -11343,6 +11456,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
case GGML_OP_PAD:
func = ggml_cuda_pad;
break;
case GGML_OP_ARANGE:
func = ggml_cuda_arange;
break;
case GGML_OP_TIMESTEP_EMBEDDING:
func = ggml_cuda_timestep_embedding;
break;
case GGML_OP_LEAKY_RELU:
func = ggml_cuda_leaky_relu;
break;
@@ -12238,6 +12357,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
return true;
default:

ggml.c

@@ -1804,6 +1804,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D",
"UPSCALE",
"PAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
"LEAKY_RELU", "LEAKY_RELU",
@ -1832,7 +1834,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK", "CROSS_ENTROPY_LOSS_BACK",
}; };
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1890,6 +1892,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d(x)",
"upscale(x)",
"pad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
"leaky_relu(x)", "leaky_relu(x)",
@ -1918,7 +1922,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)", "cross_entropy_loss_back(x,y)",
}; };
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -5879,6 +5883,55 @@ struct ggml_tensor * ggml_upscale(
return ggml_upscale_impl(ctx, a, scale_factor);
}
struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
float start,
float stop,
float step) {
GGML_ASSERT(stop > start);
int64_t steps = (int64_t)ceil((stop - start) / step);
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
result->op = GGML_OP_ARANGE;
((float *)(result->op_params))[0] = start;
((float *)(result->op_params))[1] = stop;
((float *)(result->op_params))[2] = step;
return result;
}
struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
int dim,
int max_period) {
bool is_node = false;
if (timesteps->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
int acutual_dim = dim;
if (dim % 2 != 0) {
acutual_dim = dim + 1;
}
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, acutual_dim, timesteps->ne[0]);
result->op = GGML_OP_TIMESTEP_EMBEDDING;
result->op_params[0] = dim;
result->op_params[1] = max_period;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = timesteps;
return result;
}
// ggml_argsort
struct ggml_tensor * ggml_argsort(
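With ggml_arange and ggml_timestep_embedding now exposed as graph constructors, the ops can be exercised end to end from the public API. A minimal CPU-side sketch with hypothetical sizes, relying only on the declarations added to ggml.h by this commit plus the existing graph helpers:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    // 4 timesteps: 0, 250, 500, 750 (stop is exclusive)
    struct ggml_tensor * t   = ggml_arange(ctx, 0.0f, 1000.0f, 250.0f);
    // one 320-wide sinusoidal embedding per timestep; max_period = 10000 as in the SD UNet
    struct ggml_tensor * emb = ggml_timestep_embedding(ctx, t, 320, 10000);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, emb);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);

    // emb->ne = [320, 4]: cos terms in columns 0..159, sin terms in columns 160..319 of each row
    const float * e = (const float *) emb->data;
    printf("emb[0][0] = %f\n", e[0]);

    ggml_free(ctx);
    return 0;
}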
@@ -10227,11 +10280,11 @@ static void ggml_compute_forward_group_norm_f32(
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
for (int64_t i00 = 0; i00 < ne00; i00++) {
- sum += (ggml_float)x[i00];
+ sum += (ggml_float)x[i00] / (ne00 * ne01 * step);
}
}
}
- float mean = sum / (ne00 * ne01 * step);
+ float mean = sum;
ggml_float sum2 = 0.0;
for (int64_t i02 = start; i02 < end; i02++) {
@@ -10243,11 +10296,11 @@ static void ggml_compute_forward_group_norm_f32(
for (int64_t i00 = 0; i00 < ne00; i00++) {
float v = x[i00] - mean;
y[i00] = v;
- sum2 += (ggml_float)(v * v);
+ sum2 += (ggml_float)(v * v) / (ne00 * ne01 * step);
}
}
}
- float variance = sum2 / (ne00 * ne01 * step);
+ float variance = sum2;
const float scale = 1.0f / sqrtf(variance + eps);
for (int64_t i02 = start; i02 < end; i02++) {
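The group_norm hunks above are unrelated to the new ops: they move the division by the element count (ne00 * ne01 * step) from a single post-loop divide into each accumulation step, presumably to keep the running sums on the order of the input values rather than letting them grow with the group size. The two forms are mathematically identical, as the standalone comparison below illustrates (illustration only, not from the commit):

// mean the old way: accumulate, then divide once
float mean_divide_after(const float * x, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) sum += x[i];                // sum grows with n
    return (float)(sum / n);
}

// mean the new way: divide each term as it is accumulated (what group_norm does now)
float mean_divide_inside(const float * x, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) sum += x[i] / (double) n;   // partial terms stay on the order of x[i]
    return (float)sum;
}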
@@ -13528,6 +13581,109 @@ static void ggml_compute_forward_pad(
}
}
// ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(dst->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const float start = ((float*)dst->op_params)[0];
const float stop = ((float*)dst->op_params)[1];
const float step = ((float*)dst->op_params)[2];
int64_t steps = (int64_t)ceil((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
for (int64_t i = ith; i < steps; i+= nth) {
float value = start + step * i;
((float *)dst->data)[i] = value;
}
}
static void ggml_compute_forward_arange(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (dst->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_arange_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
static void ggml_compute_forward_timestep_embedding_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
int acutual_dim = dim;
if (dim % 2 != 0) {
acutual_dim = dim + 1;
}
int half = dim / 2;
for (int64_t i = 0; i < ne00; i++) {
float * embed_data = (float *)((char *) dst->data + i*nb1);
for (int64_t j = ith; j < half; j += nth) {
float timestep = ((float *)src0->data)[i];
float freq = (float)exp(-log(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cos(arg);
embed_data[j + half] = sin(arg);
}
if (dim % 2 != 0 && ith == 0) {
embed_data[dim] = 0.f;
}
}
}
static void ggml_compute_forward_timestep_embedding(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_timestep_embedding_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_argsort
static void ggml_compute_forward_argsort_f32(
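Both the CPU loop above and the CUDA kernel earlier in the commit produce the same row layout: cosines in the first half of each row, sines in the second half, and a single zero of padding when dim is odd. A compact reference version of one row, shown only to make the layout explicit (not part of the commit):

#include <math.h>

// Reference computation of one embedding row; same math as ggml_compute_forward_timestep_embedding_f32.
static void timestep_embedding_row(float t, float * row, int dim, int max_period) {
    const int half = dim / 2;
    for (int j = 0; j < half; j++) {
        const float freq = expf(-logf((float) max_period) * (float) j / (float) half);
        row[j]        = cosf(t * freq);   // cosines occupy columns [0, half)
        row[j + half] = sinf(t * freq);   // sines occupy columns [half, 2*half)
    }
    if (dim % 2 != 0) {
        row[dim] = 0.0f;                  // odd dim: the row is padded to dim + 1 floats
    }
}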
@@ -15593,6 +15749,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor->src[0], tensor);
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
ggml_compute_forward_timestep_embedding(params, tensor->src[0], tensor);
} break;
case GGML_OP_ARGSORT:
{
ggml_compute_forward_argsort(params, tensor);
@@ -16595,6 +16759,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARANGE:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARGSORT:
{
GGML_ASSERT(false); // TODO: not implemented
@@ -17346,6 +17518,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = n_threads;
} break;
case GGML_OP_ARANGE:
{
n_tasks = n_threads;
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
n_tasks = n_threads;
} break;
case GGML_OP_ARGSORT:
{
n_tasks = n_threads;

ggml.h

@@ -461,6 +461,8 @@ extern "C" {
GGML_OP_POOL_2D,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
GGML_OP_LEAKY_RELU,
@@ -1658,6 +1660,15 @@ extern "C" {
int p2,
int p3);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
GGML_API struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
int dim,
int max_period);
// sort rows
enum ggml_sort_order {
GGML_SORT_ORDER_ASC,
@@ -1669,6 +1680,12 @@ extern "C" {
struct ggml_tensor * a,
enum ggml_sort_order order);
GGML_API struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
float start,
float stop,
float step);
// top k elements per row
GGML_API struct ggml_tensor * ggml_top_k(
struct ggml_context * ctx,
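The ggml_arange declaration above follows a half-open, numpy-style convention: stop is excluded, and both backends size the result as ceil((stop - start) / step). Two hypothetical worked values:

// ggml_arange(ctx, 0.0f, 10.0f, 3.0f)  -> ceil((10 - 0) / 3)   = 4 elements: 0, 3, 6, 9
// ggml_arange(ctx, 0.0f,  1.0f, 0.25f) -> ceil((1 - 0) / 0.25) = 4 elements: 0.0, 0.25, 0.5, 0.75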


@@ -846,7 +846,7 @@ public:
return NULL;
}
// it's performing a compute, check if backend isn't cpu
- if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_CPU) {
+ if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
// pass input tensors to gpu memory
auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);