merge missing functions from sdcpp

Concedo 2024-02-28 16:38:00 +08:00
parent 26696970ce
commit 19e1c518f1
4 changed files with 325 additions and 7 deletions


@@ -606,6 +606,8 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + Q
#define CUDA_UPSCALE_BLOCK_SIZE 256
#define CUDA_CONCAT_BLOCK_SIZE 256
#define CUDA_PAD_BLOCK_SIZE 256
#define CUDA_ARANGE_BLOCK_SIZE 256
#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256
#define CUDA_ACC_BLOCK_SIZE 256
#define CUDA_IM2COL_BLOCK_SIZE 256
#define CUDA_POOL2D_BLOCK_SIZE 256
@@ -1033,6 +1035,38 @@ static __global__ void pad_f32(const float * x, float * dst, const int ne0, cons
}
}
static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
// blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
return;
}
dst[nidx] = start + step * nidx;
}
static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) {
// blockIDx.y: idx of timesteps->ne[0]
// blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE
int i = blockIdx.y;
int j = threadIdx.x + blockIdx.x * blockDim.x;
float * embed_data = (float *)((char *)dst + i*nb1);
if (dim % 2 != 0 && j == ((dim + 1) / 2)) {
embed_data[dim] = 0.f;
}
int half = dim / 2;
if (j >= half) {
return;
}
float timestep = timesteps[i];
float freq = (float)exp(-logf(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cos(arg);
embed_data[j + half] = sin(arg);
}
template <int block_size>
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
int start = blockIdx.x * group_size;
@@ -9159,6 +9193,44 @@ static void ggml_cuda_op_pad(
(void) src1_dd;
}
static void ggml_cuda_op_arange(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const float start = ((float*)dst->op_params)[0];
const float stop = ((float*)dst->op_params)[1];
const float step = ((float*)dst->op_params)[2];
int64_t steps = (int64_t)ceil((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream);
(void) src0;
(void) src1;
(void) src0_dd;
(void) src1_dd;
}
static void ggml_cuda_op_timestep_embedding(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream);
(void) src1;
(void) dst;
(void) src1_dd;
}
static void ggml_cuda_op_rms_norm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
@@ -10437,6 +10509,47 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
}
static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
// dd = data device
float * src0_ddf = nullptr;
float * src1_ddf = nullptr;
float * dst_ddf = nullptr;
cuda_pool_alloc<float> dst_f;
ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
if (dst_on_device) {
dst_ddf = (float *) dst_extra->data_device[g_main_device];
} else {
dst_ddf = dst_f.alloc(ggml_nelements(dst));
}
// do the computation
ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
CUDA_CHECK(cudaGetLastError());
// copy dst to host if necessary
if (!dst_on_device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
}
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
}
}
static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding);
}
static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
}
@@ -11343,6 +11456,12 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
case GGML_OP_PAD:
func = ggml_cuda_pad;
break;
case GGML_OP_ARANGE:
func = ggml_cuda_arange;
break;
case GGML_OP_TIMESTEP_EMBEDDING:
func = ggml_cuda_timestep_embedding;
break;
case GGML_OP_LEAKY_RELU:
func = ggml_cuda_leaky_relu;
break;
@@ -12238,6 +12357,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
return true;
default:

ggml.c

@@ -1804,6 +1804,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D",
"UPSCALE",
"PAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT", "ARGSORT",
"LEAKY_RELU", "LEAKY_RELU",
@ -1832,7 +1834,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK", "CROSS_ENTROPY_LOSS_BACK",
}; };
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1890,6 +1892,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d(x)",
"upscale(x)",
"pad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)", "argsort(x)",
"leaky_relu(x)", "leaky_relu(x)",
@ -1918,7 +1922,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)", "cross_entropy_loss_back(x,y)",
}; };
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -5879,6 +5883,55 @@ struct ggml_tensor * ggml_upscale(
return ggml_upscale_impl(ctx, a, scale_factor);
}
struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
float start,
float stop,
float step) {
GGML_ASSERT(stop > start);
int64_t steps = (int64_t)ceil((stop - start) / step);
struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
result->op = GGML_OP_ARANGE;
((float *)(result->op_params))[0] = start;
((float *)(result->op_params))[1] = stop;
((float *)(result->op_params))[2] = step;
return result;
}
struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
int dim,
int max_period) {
bool is_node = false;
if (timesteps->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
int acutual_dim = dim;
if (dim % 2 != 0) {
acutual_dim = dim + 1;
}
struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, acutual_dim, timesteps->ne[0]);
result->op = GGML_OP_TIMESTEP_EMBEDDING;
result->op_params[0] = dim;
result->op_params[1] = max_period;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = timesteps;
return result;
}
// ggml_argsort
struct ggml_tensor * ggml_argsort(
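With ggml_arange and ggml_timestep_embedding now exposed as graph constructors, the ops can be exercised end to end from the public API. A minimal CPU-side sketch with hypothetical sizes, relying only on the declarations added to ggml.h by this commit plus the existing graph helpers:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    // 4 timesteps: 0, 250, 500, 750 (stop is exclusive)
    struct ggml_tensor * t   = ggml_arange(ctx, 0.0f, 1000.0f, 250.0f);
    // one 320-wide sinusoidal embedding per timestep; max_period = 10000 as in the SD UNet
    struct ggml_tensor * emb = ggml_timestep_embedding(ctx, t, 320, 10000);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, emb);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);

    // emb->ne = [320, 4]: cos terms in columns 0..159, sin terms in columns 160..319 of each row
    const float * e = (const float *) emb->data;
    printf("emb[0][0] = %f\n", e[0]);

    ggml_free(ctx);
    return 0;
}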
@@ -10227,11 +10280,11 @@ static void ggml_compute_forward_group_norm_f32(
const float * x = (float *)((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03);
for (int64_t i00 = 0; i00 < ne00; i00++) {
- sum += (ggml_float)x[i00];
+ sum += (ggml_float)x[i00] / (ne00 * ne01 * step);
}
}
}
- float mean = sum / (ne00 * ne01 * step);
+ float mean = sum;
ggml_float sum2 = 0.0;
for (int64_t i02 = start; i02 < end; i02++) {
@@ -10243,11 +10296,11 @@ static void ggml_compute_forward_group_norm_f32(
for (int64_t i00 = 0; i00 < ne00; i00++) {
float v = x[i00] - mean;
y[i00] = v;
- sum2 += (ggml_float)(v * v);
+ sum2 += (ggml_float)(v * v) / (ne00 * ne01 * step);
}
}
}
- float variance = sum2 / (ne00 * ne01 * step);
+ float variance = sum2;
const float scale = 1.0f / sqrtf(variance + eps);
for (int64_t i02 = start; i02 < end; i02++) {
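The group_norm hunks above are unrelated to the new ops: they move the division by the element count (ne00 * ne01 * step) from a single post-loop divide into each accumulation step, presumably to keep the running sums on the order of the input values rather than letting them grow with the group size. The two forms are mathematically identical, as the standalone comparison below illustrates (illustration only, not from the commit):

// mean the old way: accumulate, then divide once
float mean_divide_after(const float * x, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) sum += x[i];                // sum grows with n
    return (float)(sum / n);
}

// mean the new way: divide each term as it is accumulated (what group_norm does now)
float mean_divide_inside(const float * x, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) sum += x[i] / (double) n;   // partial terms stay on the order of x[i]
    return (float)sum;
}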
@@ -13528,6 +13581,109 @@ static void ggml_compute_forward_pad(
}
}
// ggml_compute_forward_arange
static void ggml_compute_forward_arange_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(dst->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const float start = ((float*)dst->op_params)[0];
const float stop = ((float*)dst->op_params)[1];
const float step = ((float*)dst->op_params)[2];
int64_t steps = (int64_t)ceil((stop - start) / step);
GGML_ASSERT(ggml_nelements(dst) == steps);
for (int64_t i = ith; i < steps; i+= nth) {
float value = start + step * i;
((float *)dst->data)[i] = value;
}
}
static void ggml_compute_forward_arange(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (dst->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_arange_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
static void ggml_compute_forward_timestep_embedding_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
GGML_ASSERT(src0->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
GGML_TENSOR_UNARY_OP_LOCALS
const int dim = dst->op_params[0];
const int max_period = dst->op_params[1];
int acutual_dim = dim;
if (dim % 2 != 0) {
acutual_dim = dim + 1;
}
int half = dim / 2;
for (int64_t i = 0; i < ne00; i++) {
float * embed_data = (float *)((char *) dst->data + i*nb1);
for (int64_t j = ith; j < half; j += nth) {
float timestep = ((float *)src0->data)[i];
float freq = (float)exp(-log(max_period) * j / half);
float arg = timestep * freq;
embed_data[j] = cos(arg);
embed_data[j + half] = sin(arg);
}
if (dim % 2 != 0 && ith == 0) {
embed_data[dim] = 0.f;
}
}
}
static void ggml_compute_forward_timestep_embedding(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_timestep_embedding_f32(params, src0, dst);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_argsort
static void ggml_compute_forward_argsort_f32(
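Both the CPU loop above and the CUDA kernel earlier in the commit produce the same row layout: cosines in the first half of each row, sines in the second half, and a single zero of padding when dim is odd. A compact reference version of one row, shown only to make the layout explicit (not part of the commit):

#include <math.h>

// Reference computation of one embedding row; same math as ggml_compute_forward_timestep_embedding_f32.
static void timestep_embedding_row(float t, float * row, int dim, int max_period) {
    const int half = dim / 2;
    for (int j = 0; j < half; j++) {
        const float freq = expf(-logf((float) max_period) * (float) j / (float) half);
        row[j]        = cosf(t * freq);   // cosines occupy columns [0, half)
        row[j + half] = sinf(t * freq);   // sines occupy columns [half, 2*half)
    }
    if (dim % 2 != 0) {
        row[dim] = 0.0f;                  // odd dim: the row is padded to dim + 1 floats
    }
}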
@@ -15593,6 +15749,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor->src[0], tensor);
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
ggml_compute_forward_timestep_embedding(params, tensor->src[0], tensor);
} break;
case GGML_OP_ARGSORT:
{
ggml_compute_forward_argsort(params, tensor);
@@ -16595,6 +16759,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARANGE:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_ARGSORT:
{
GGML_ASSERT(false); // TODO: not implemented
@@ -17346,6 +17518,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
{
n_tasks = n_threads;
} break;
case GGML_OP_ARANGE:
{
n_tasks = n_threads;
} break;
case GGML_OP_TIMESTEP_EMBEDDING:
{
n_tasks = n_threads;
} break;
case GGML_OP_ARGSORT:
{
n_tasks = n_threads;

ggml.h

@@ -461,6 +461,8 @@ extern "C" {
GGML_OP_POOL_2D,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
GGML_OP_LEAKY_RELU,
@@ -1658,6 +1660,15 @@ extern "C" {
int p2,
int p3);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
GGML_API struct ggml_tensor * ggml_timestep_embedding(
struct ggml_context * ctx,
struct ggml_tensor * timesteps,
int dim,
int max_period);
// sort rows
enum ggml_sort_order {
GGML_SORT_ORDER_ASC,
@@ -1669,6 +1680,12 @@ extern "C" {
struct ggml_tensor * a,
enum ggml_sort_order order);
GGML_API struct ggml_tensor * ggml_arange(
struct ggml_context * ctx,
float start,
float stop,
float step);
// top k elements per row
GGML_API struct ggml_tensor * ggml_top_k(
struct ggml_context * ctx,
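The ggml_arange declaration above follows a half-open, numpy-style convention: stop is excluded, and both backends size the result as ceil((stop - start) / step). Two hypothetical worked values:

// ggml_arange(ctx, 0.0f, 10.0f, 3.0f)  -> ceil((10 - 0) / 3)   = 4 elements: 0, 3, 6, 9
// ggml_arange(ctx, 0.0f,  1.0f, 0.25f) -> ceil((1 - 0) / 0.25) = 4 elements: 0.0, 0.25, 0.5, 0.75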


@@ -846,7 +846,7 @@ public:
return NULL;
}
// it's performing a compute, check if backend isn't cpu
- if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_CPU) {
+ if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
// pass input tensors to gpu memory
auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);