Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)

resync and updated sdcpp for flux and sd3 support

parent 33721615b5
commit f32a874966

30 changed files with 2434248 additions and 1729 deletions
@@ -36,6 +36,10 @@
#include "ggml-vulkan.h"
#endif

#ifdef SD_USE_SYCL
#include "ggml-sycl.h"
#endif

#include "rng.hpp"
#include "util.h"

@@ -79,13 +83,42 @@ __STATIC_INLINE__ float ggml_tensor_get_f32(const ggml_tensor* tensor, int l, in
    return *(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}

__STATIC_INLINE__ int ggml_tensor_get_i32(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
    if (tensor->buffer != NULL) {
        float value;
        ggml_backend_tensor_get(tensor, &value, i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0], sizeof(int));
        return value;
    }
    GGML_ASSERT(tensor->nb[0] == sizeof(int));
    return *(int*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}

__STATIC_INLINE__ ggml_fp16_t ggml_tensor_get_f16(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
    GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
    return *(ggml_fp16_t*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}

__STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false) {
    printf("shape(%zu, %zu, %zu, %zu)\n", tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
// static struct ggml_tensor* get_tensor_from_graph(struct ggml_cgraph* gf, const char* name) {
//     struct ggml_tensor* res = NULL;
//     for (int i = 0; i < gf->n_nodes; i++) {
//         // printf("%d, %s \n", i, gf->nodes[i]->name);
//         if (strcmp(ggml_get_name(gf->nodes[i]), name) == 0) {
//             res = gf->nodes[i];
//             break;
//         }
//     }
//     for (int i = 0; i < gf->n_leafs; i++) {
//         // printf("%d, %s \n", i, gf->leafs[i]->name);
//         if (strcmp(ggml_get_name(gf->leafs[i]), name) == 0) {
//             res = gf->leafs[i];
//             break;
//         }
//     }
//     return res;
// }

__STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false, const char* mark = "") {
    printf("%s (%s): shape(%zu, %zu, %zu, %zu)\n", mark, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    fflush(stdout);
    if (shape_only) {
        return;
@@ -111,6 +144,8 @@ __STATIC_INLINE__ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_
                        printf(" [%d, %d, %d, %d] = %f\n", i, j, k, l, ggml_tensor_get_f32(tensor, l, k, j, i));
                    } else if (tensor->type == GGML_TYPE_F16) {
                        printf(" [%d, %d, %d, %d] = %i\n", i, j, k, l, ggml_tensor_get_f16(tensor, l, k, j, i));
                    } else if (tensor->type == GGML_TYPE_I32) {
                        printf(" [%d, %d, %d, %d] = %i\n", i, j, k, l, ggml_tensor_get_i32(tensor, l, k, j, i));
                    }
                    fflush(stdout);
                }
@@ -221,6 +256,23 @@ __STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input) {
    return image_data;
}

__STATIC_INLINE__ uint8_t* sd_tensor_to_mul_image(struct ggml_tensor* input, int idx) {
    int64_t width = input->ne[0];
    int64_t height = input->ne[1];
    int64_t channels = input->ne[2];
    GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32);
    uint8_t* image_data = (uint8_t*)malloc(width * height * channels);
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
            for (int k = 0; k < channels; k++) {
                float value = ggml_tensor_get_f32(input, ix, iy, k, idx);
                *(image_data + iy * width * channels + ix * channels + k) = (uint8_t)(value * 255.0f);
            }
        }
    }
    return image_data;
}

__STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data,
                                          struct ggml_tensor* output,
                                          bool scale = true) {
@@ -241,6 +293,28 @@ __STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data,
    }
}

__STATIC_INLINE__ void sd_mul_images_to_tensor(const uint8_t* image_data,
                                               struct ggml_tensor* output,
                                               int idx,
                                               float* mean = NULL,
                                               float* std = NULL) {
    int64_t width = output->ne[0];
    int64_t height = output->ne[1];
    int64_t channels = output->ne[2];
    GGML_ASSERT(channels == 3 && output->type == GGML_TYPE_F32);
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
            for (int k = 0; k < channels; k++) {
                int value = *(image_data + iy * width * channels + ix * channels + k);
                float pixel_val = value / 255.0f;
                if (mean != NULL && std != NULL)
                    pixel_val = (pixel_val - mean[k]) / std[k];
                ggml_tensor_set_f32(output, pixel_val, ix, iy, k, idx);
            }
        }
    }
}

__STATIC_INLINE__ void sd_image_f32_to_tensor(const float* image_data,
                                              struct ggml_tensor* output,
                                              bool scale = true) {
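
For illustration, a minimal sketch of how the new sd_mul_images_to_tensor helper might be used to pack a batch of RGB images with per-channel normalization; the wrapper name, buffer list, and the CLIP-style mean/std values are assumptions, not part of this commit:

// Hypothetical helper: pack n_images RGB buffers into one [width, height, 3, n_images] tensor.
static ggml_tensor* pack_image_batch(ggml_context* ctx, uint8_t** image_buffers,
                                     int n_images, int width, int height) {
    float mean[3] = {0.48145466f, 0.4578275f, 0.40821073f};  // CLIP-style per-channel mean (assumed)
    float stdv[3] = {0.26862954f, 0.26130258f, 0.27577711f}; // CLIP-style per-channel std (assumed)
    ggml_tensor* batch = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, width, height, 3, n_images);
    for (int i = 0; i < n_images; i++) {
        sd_mul_images_to_tensor(image_buffers[i], batch, i, mean, stdv);
    }
    return batch;
}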
@@ -251,7 +325,7 @@ __STATIC_INLINE__ void sd_image_f32_to_tensor(const float* image_data,
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
            for (int k = 0; k < channels; k++) {
                float value = *(image_data + iy * width * channels + ix * channels + k);
                int value = *(image_data + iy * width * channels + ix * channels + k);
                if (scale) {
                    value /= 255.f;
                }
@@ -279,6 +353,12 @@ __STATIC_INLINE__ void ggml_split_tensor_2d(struct ggml_tensor* input,
    }
}

// unclamped -> expects x in the range [0-1]
__STATIC_INLINE__ float ggml_smootherstep_f32(const float x) {
    GGML_ASSERT(x >= 0.f && x <= 1.f);
    return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f);
}

__STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                                            struct ggml_tensor* output,
                                            int x,
@@ -287,6 +367,10 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
    int64_t width = input->ne[0];
    int64_t height = input->ne[1];
    int64_t channels = input->ne[2];

    int64_t img_width = output->ne[0];
    int64_t img_height = output->ne[1];

    GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
@@ -294,16 +378,22 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                float new_value = ggml_tensor_get_f32(input, ix, iy, k);
                if (overlap > 0) { // blend colors in overlapped area
                    float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
                    if (x > 0 && ix < overlap) { // in overlapped horizontal
                        ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (ix / (1.0f * overlap)), x + ix, y + iy, k);
                        continue;
                    }
                    if (y > 0 && iy < overlap) { // in overlapped vertical
                        ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (iy / (1.0f * overlap)), x + ix, y + iy, k);
                        continue;
                    }

                    const float x_f_0 = (x > 0) ? ix / float(overlap) : 1;
                    const float x_f_1 = (x < (img_width - width)) ? (width - ix) / float(overlap) : 1;
                    const float y_f_0 = (y > 0) ? iy / float(overlap) : 1;
                    const float y_f_1 = (y < (img_height - height)) ? (height - iy) / float(overlap) : 1;

                    const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
                    const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);

                    ggml_tensor_set_f32(
                        output,
                        old_value + new_value * ggml_smootherstep_f32(y_f) * ggml_smootherstep_f32(x_f),
                        x + ix, y + iy, k);
                } else {
                    ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
                }
                ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
            }
        }
    }
@@ -347,6 +437,42 @@ __STATIC_INLINE__ void ggml_tensor_clamp(struct ggml_tensor* src, float min, flo
    }
}

__STATIC_INLINE__ struct ggml_tensor* ggml_tensor_concat(struct ggml_context* ctx,
                                                         struct ggml_tensor* a,
                                                         struct ggml_tensor* b,
                                                         int dim) {
    int64_t ne[GGML_MAX_DIMS];
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        if (d == dim) {
            ne[d] = a->ne[d] + b->ne[d];
            continue;
        }
        GGML_ASSERT(a->ne[d] == b->ne[d]);
        ne[d] = a->ne[d];
    }
    struct ggml_tensor* result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
    int64_t o[4] = {0, 0, 0, 0};
    o[dim] = a->ne[dim];

    float v;
    for (int i3 = 0; i3 < result->ne[3]; i3++) {
        for (int i2 = 0; i2 < result->ne[2]; i2++) {
            for (int i1 = 0; i1 < result->ne[1]; i1++) {
                for (int i0 = 0; i0 < result->ne[0]; i0++) {
                    if (i0 < a->ne[0] && i1 < a->ne[1] && i2 < a->ne[2] && i3 < a->ne[3]) {
                        v = ggml_tensor_get_f32(a, i0, i1, i2, i3);
                    } else {
                        v = ggml_tensor_get_f32(b, i0 - o[0], i1 - o[1], i2 - o[2], i3 - o[3]);
                    }

                    ggml_tensor_set_f32(result, v, i0, i1, i2, i3);
                }
            }
        }
    }
    return result;
}

// convert values from [0, 1] to [-1, 1]
__STATIC_INLINE__ void ggml_tensor_scale_input(struct ggml_tensor* src) {
    int64_t nelements = ggml_nelements(src);
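
For illustration, a sketch of the CPU-side ggml_tensor_concat helper above, concatenating two F32 tensors along dim 2 (the channel axis in sd's [W, H, C, N] layout); the tensor names and sizes are assumed:

// Hypothetical usage: a is [64, 64, 4, 1], b is [64, 64, 12, 1]; every dim except dim 2 must match.
struct ggml_tensor* ab = ggml_tensor_concat(ctx, a, b, 2); // [64, 64, 16, 1]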
@@ -400,7 +526,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
    ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size, tile_size, input->ne[2], 1);
    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1);
    on_processing(input_tile, NULL, true);
    int num_tiles = (input_width * input_height) / (non_tile_overlap * non_tile_overlap);
    int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap);
    LOG_INFO("processing %i tiles", num_tiles);
    pretty_progress(1, num_tiles, 0.0f);
    int tile_count = 1;
@@ -430,11 +556,13 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
    if (tile_count < num_tiles) {
        pretty_progress(num_tiles, num_tiles, last_time);
    }
    ggml_free(tiles_ctx);
}

__STATIC_INLINE__ struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
                                                         struct ggml_tensor* a) {
    return ggml_group_norm(ctx, a, 32, 1e-6f);
    const float eps = 1e-6f; // default eps parameter
    return ggml_group_norm(ctx, a, 32, eps);
}

__STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
@@ -524,6 +652,20 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_3d_nx1x1(struct ggml_context*
    return x; // [N, OC, T, OH * OW]
}

// qkv: [N, L, 3*C]
// return: ([N, L, C], [N, L, C], [N, L, C])
__STATIC_INLINE__ std::vector<struct ggml_tensor*> split_qkv(struct ggml_context* ctx,
                                                             struct ggml_tensor* qkv) {
    qkv = ggml_reshape_4d(ctx, qkv, qkv->ne[0] / 3, 3, qkv->ne[1], qkv->ne[2]); // [N, L, 3, C]
    qkv = ggml_cont(ctx, ggml_permute(ctx, qkv, 0, 3, 1, 2)); // [3, N, L, C]

    int64_t offset = qkv->nb[2] * qkv->ne[2];
    auto q = ggml_view_3d(ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], qkv->nb[1], qkv->nb[2], offset * 0); // [N, L, C]
    auto k = ggml_view_3d(ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], qkv->nb[1], qkv->nb[2], offset * 1); // [N, L, C]
    auto v = ggml_view_3d(ctx, qkv, qkv->ne[0], qkv->ne[1], qkv->ne[2], qkv->nb[1], qkv->nb[2], offset * 2); // [N, L, C]
    return {q, k, v};
}

// q: [N * n_head, n_token, d_head]
// k: [N * n_head, n_k, d_head]
// v: [N * n_head, d_head, n_k]
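
For illustration, a sketch of feeding a fused QKV projection through the new split_qkv helper; the qkv tensor and its shape are assumed:

// Hypothetical usage: qkv is [N, L, 3*C] from a single fused projection.
std::vector<struct ggml_tensor*> qkv_vec = split_qkv(ctx, qkv);
struct ggml_tensor* q = qkv_vec[0]; // [N, L, C]
struct ggml_tensor* k = qkv_vec[1]; // [N, L, C]
struct ggml_tensor* v = qkv_vec[2]; // [N, L, C]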
@@ -533,7 +675,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx
                                                        struct ggml_tensor* k,
                                                        struct ggml_tensor* v,
                                                        bool mask = false) {
#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) && !defined(SD_USE_VULKAN)
#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) && !defined(SD_USE_VULKAN) && !defined(SD_USE_SYCL)
    struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false); // [N * n_head, n_token, d_head]
#else
    float d_head = (float)q->ne[0];
@@ -550,6 +692,79 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx
    return kqv;
}

// q: [N, L_q, C] or [N*n_head, L_q, d_head]
// k: [N, L_k, C] or [N*n_head, L_k, d_head]
// v: [N, L_k, C] or [N, L_k, n_head, d_head]
// return: [N, L_q, C]
__STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* ctx,
                                                            struct ggml_tensor* q,
                                                            struct ggml_tensor* k,
                                                            struct ggml_tensor* v,
                                                            int64_t n_head,
                                                            struct ggml_tensor* mask = NULL,
                                                            bool diag_mask_inf = false,
                                                            bool skip_reshape = false) {
    int64_t L_q;
    int64_t L_k;
    int64_t C;
    int64_t N;
    int64_t d_head;
    if (!skip_reshape) {
        L_q = q->ne[1];
        L_k = k->ne[1];
        C = q->ne[0];
        N = q->ne[2];
        d_head = C / n_head;
        q = ggml_reshape_4d(ctx, q, d_head, n_head, L_q, N); // [N, L_q, n_head, d_head]
        q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, L_q, d_head]
        q = ggml_reshape_3d(ctx, q, d_head, L_q, n_head * N); // [N * n_head, L_q, d_head]

        k = ggml_reshape_4d(ctx, k, d_head, n_head, L_k, N); // [N, L_k, n_head, d_head]
        k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, L_k, d_head]
        k = ggml_reshape_3d(ctx, k, d_head, L_k, n_head * N); // [N * n_head, L_k, d_head]

        v = ggml_reshape_4d(ctx, v, d_head, n_head, L_k, N); // [N, L_k, n_head, d_head]
    } else {
        L_q = q->ne[1];
        L_k = k->ne[1];
        d_head = v->ne[0];
        N = v->ne[3];
        C = d_head * n_head;
    }

    float scale = (1.0f / sqrt((float)d_head));

    bool use_flash_attn = false;
    ggml_tensor* kqv = NULL;
    if (use_flash_attn) {
        v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3)); // [N, n_head, L_k, d_head]
        v = ggml_reshape_3d(ctx, v, d_head, L_k, n_head * N); // [N * n_head, L_k, d_head]
        LOG_DEBUG("k->ne[1] == %d", k->ne[1]);
        kqv = ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0, 0);
    } else {
        v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, L_k]
        v = ggml_reshape_3d(ctx, v, L_k, d_head, n_head * N); // [N * n_head, d_head, L_k]

        auto kq = ggml_mul_mat(ctx, k, q); // [N * n_head, L_q, L_k]
        kq = ggml_scale_inplace(ctx, kq, scale);
        if (mask) {
            kq = ggml_add(ctx, kq, mask);
        }
        if (diag_mask_inf) {
            kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
        }
        kq = ggml_soft_max_inplace(ctx, kq);

        kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, L_q, d_head]
    }

    kqv = ggml_reshape_4d(ctx, kqv, d_head, L_q, n_head, N); // [N, n_head, L_q, d_head]
    kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, L_q, n_head, d_head]
    kqv = ggml_reshape_3d(ctx, kqv, d_head * n_head, L_q, N); // [N, L_q, C]

    return kqv;
}

__STATIC_INLINE__ struct ggml_tensor* ggml_nn_layer_norm(struct ggml_context* ctx,
                                                         struct ggml_tensor* x,
                                                         struct ggml_tensor* w,
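
For illustration, a minimal sketch of calling the new ggml_nn_attention_ext helper with unfused projections; the wrapper name, weight tensors, and head count are assumed placeholders, not part of this commit:

// Hypothetical self-attention sketch: hidden is [N, L, C]; wq/wk/wv are [C, C] projection weights.
static struct ggml_tensor* self_attention_sketch(struct ggml_context* ctx,
                                                 struct ggml_tensor* hidden,
                                                 struct ggml_tensor* wq,
                                                 struct ggml_tensor* wk,
                                                 struct ggml_tensor* wv,
                                                 int64_t n_head) {
    struct ggml_tensor* q = ggml_mul_mat(ctx, wq, hidden); // [N, L, C]
    struct ggml_tensor* k = ggml_mul_mat(ctx, wk, hidden); // [N, L, C]
    struct ggml_tensor* v = ggml_mul_mat(ctx, wv, hidden); // [N, L, C]
    // no attention mask, no causal masking; the helper performs the head reshapes itself
    return ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false, false); // [N, L, C]
}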
@@ -575,7 +790,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ct
        b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
    }

    x = ggml_group_norm(ctx, x, num_groups, 1e-6f);
    const float eps = 1e-6f; // default eps parameter
    x = ggml_group_norm(ctx, x, num_groups, eps);
    if (w != NULL && b != NULL) {
        x = ggml_mul(ctx, x, w);
        // b = ggml_repeat(ctx, b, x);
@@ -585,7 +801,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ct
}

__STATIC_INLINE__ void ggml_backend_tensor_get_and_sync(ggml_backend_t backend, const struct ggml_tensor* tensor, void* data, size_t offset, size_t size) {
#ifdef SD_USE_CUBLAS
#if defined(SD_USE_CUBLAS) || defined(SD_USE_SYCL)
    if (!ggml_backend_is_cpu(backend)) {
        ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
        ggml_backend_synchronize(backend);
@@ -693,22 +909,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_timestep_embedding(
    struct ggml_context* ctx,
    struct ggml_tensor* timesteps,
    int dim,
    int max_period = 10000) {
    int max_period = 10000,
    float time_factor = 1.0f) {
    timesteps = ggml_scale(ctx, timesteps, time_factor);
    return ggml_timestep_embedding(ctx, timesteps, dim, max_period);
}

// struct GGMLComputeGraph {
//     virtual void init(struct ggml_context* ctx, ggml_type wtype) = 0;
//     virtual std::string get_desc() = 0;
//     virtual size_t get_params_mem_size() = 0;
//     virtual size_t get_params_num() = 0;
//     virtual struct ggml_cgraph* get_ggml_cgraph() = 0;
// };
__STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) {
    size_t num = 0;
    for (ggml_tensor* t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
        num++;
    }
    return num;
}

/* SDXL with LoRA requires more space */
#define MAX_PARAMS_TENSOR_NUM 15360
#define MAX_GRAPH_SIZE 15360

struct GGMLModule {
struct GGMLRunner {
protected:
    typedef std::function<struct ggml_cgraph*()> get_graph_cb_t;

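
For illustration, a sketch of the new time_factor argument, which simply pre-scales the raw timestep values before the sinusoidal embedding; the tensor, dimension, and the factor of 1000 are assumptions for an SD3/flux-style scheduler, not taken from this commit:

// Hypothetical usage: t is a 1-D F32 tensor of raw timesteps.
// time_factor = 1000.f embeds t * 1000; the default of 1.0f keeps the old behaviour.
struct ggml_tensor* t_emb = ggml_nn_timestep_embedding(ctx, t, 256, 10000, 1000.f); // [N, 256]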
@@ -775,7 +994,10 @@ protected:

        // compute the required memory
        size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
        LOG_DEBUG("%s compute buffer size: %.2f MB", get_desc().c_str(), compute_buffer_size / 1024.0 / 1024.0);
        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
                  get_desc().c_str(),
                  compute_buffer_size / 1024.0 / 1024.0,
                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
        return true;
    }

@@ -791,16 +1013,14 @@ protected:
    }

public:
    virtual size_t get_params_mem_size() = 0;
    virtual size_t get_params_num() = 0;
    virtual std::string get_desc() = 0;
    virtual std::string get_desc() = 0;

    GGMLModule(ggml_backend_t backend, ggml_type wtype = GGML_TYPE_F32)
    GGMLRunner(ggml_backend_t backend, ggml_type wtype = GGML_TYPE_F32)
        : backend(backend), wtype(wtype) {
        alloc_params_ctx();
    }

    virtual ~GGMLModule() {
    virtual ~GGMLRunner() {
        free_params_buffer();
        free_compute_buffer();
        free_params_ctx();
@@ -813,15 +1033,20 @@ public:
    }

    bool alloc_params_buffer() {
        size_t num_tensors = get_params_num();
        size_t num_tensors = ggml_tensor_num(params_ctx);
        params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
        if (params_buffer == NULL) {
            LOG_ERROR("%s alloc params backend buffer failed", get_desc().c_str());
            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
                      get_desc().c_str(),
                      num_tensors);
            return false;
        }
        size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
        LOG_DEBUG("%s params backend buffer size = % 6.2f MB (%i tensors)",
                  get_desc().c_str(), params_buffer_size / (1024.0 * 1024.0), num_tensors);
        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
                  get_desc().c_str(),
                  params_buffer_size / (1024.0 * 1024.0),
                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
                  num_tensors);
        return true;
    }

@@ -832,6 +1057,13 @@ public:
        }
    }

    size_t get_params_buffer_size() {
        if (params_buffer != NULL) {
            return ggml_backend_buffer_get_size(params_buffer);
        }
        return 0;
    }

    void free_compute_buffer() {
        if (compute_allocr != NULL) {
            ggml_gallocr_free(compute_allocr);
@@ -850,7 +1082,7 @@ public:
            return NULL;
        }
        // it's performing a compute, check if backend isn't cpu
        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
            // pass input tensors to gpu memory
            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

@@ -869,11 +1101,8 @@ public:
        alloc_compute_buffer(get_graph);
        reset_compute_ctx();
        struct ggml_cgraph* gf = get_graph();

        GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));

        cpy_data_to_backend_tensor();

        if (ggml_backend_is_cpu(backend)) {
            ggml_backend_cpu_set_n_threads(backend, n_threads);
        }
@@ -883,13 +1112,11 @@ public:
        // ggml_backend_metal_set_n_cb(backend, n_threads);
        // }
        // #endif

        ggml_backend_graph_compute(backend, gf);

#ifdef GGML_PERF
        ggml_graph_print(gf);
#endif

        if (output != NULL) {
            auto result = ggml_graph_node(gf, -1);
            if (*output == NULL && output_ctx != NULL) {
@@ -907,19 +1134,6 @@ public:
};

class GGMLBlock {
private:
    static char temp_buffer[1024 * 1024 * 10];
    ggml_context* get_temp_ctx() {
        struct ggml_init_params params;
        params.mem_size = sizeof(temp_buffer);
        params.mem_buffer = temp_buffer;
        params.no_alloc = true;

        ggml_context* temp_ctx = ggml_init(params);
        GGML_ASSERT(temp_ctx != NULL);
        return temp_ctx;
    }

protected:
    typedef std::unordered_map<std::string, struct ggml_tensor*> ParameterMap;
    typedef std::unordered_map<std::string, std::shared_ptr<GGMLBlock>> GGMLBlockMap;
@@ -942,14 +1156,6 @@ public:
        init_params(ctx, wtype);
    }

    std::tuple<size_t, size_t> get_params_info(ggml_type wtype) {
        ggml_context* temp_ctx = get_temp_ctx();
        init(temp_ctx, wtype);
        size_t num_tensors = get_params_num();
        size_t mem_size = get_params_mem_size();
        return {num_tensors, mem_size};
    }

    size_t get_params_num() {
        size_t num_tensors = params.size();
        for (auto& pair : blocks) {
@@ -981,13 +1187,11 @@ public:
        }
        for (auto& pair : blocks) {
            auto& block = pair.second;

            block->get_param_tensors(tensors, prefix + pair.first);
        }

        for (auto& pair : params) {
            struct ggml_tensor* param = pair.second;

            struct ggml_tensor* param = pair.second;
            tensors[prefix + pair.first] = pair.second;
        }
    }
@@ -1003,8 +1207,12 @@ protected:
    int64_t in_features;
    int64_t out_features;
    bool bias;
    bool force_f32;

    void init_params(struct ggml_context* ctx, ggml_type wtype) {
        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
            wtype = GGML_TYPE_F32;
        }
        params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
        if (bias) {
            params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_features);
@@ -1014,10 +1222,12 @@ protected:
public:
    Linear(int64_t in_features,
           int64_t out_features,
           bool bias = true)
           bool bias = true,
           bool force_f32 = false)
        : in_features(in_features),
          out_features(out_features),
          bias(bias) {}
          bias(bias),
          force_f32(force_f32) {}

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        struct ggml_tensor* w = params["weight"];
@@ -1029,6 +1239,40 @@ public:
    }
};

class Embedding : public UnaryBlock {
protected:
    int64_t embedding_dim;
    int64_t num_embeddings;

    void init_params(struct ggml_context* ctx, ggml_type wtype) {
        params["weight"] = ggml_new_tensor_2d(ctx, wtype, embedding_dim, num_embeddings);
    }

public:
    Embedding(int64_t num_embeddings, int64_t embedding_dim)
        : embedding_dim(embedding_dim),
          num_embeddings(num_embeddings) {
    }

    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* input_ids) {
        // input_ids: [N, n_token]
        auto weight = params["weight"];

        // There are issues with ggml batch inference, so we are expanding it here first.
        // TODO: fix ggml batch inference
        int64_t n = input_ids->ne[1];
        input_ids = ggml_reshape_1d(ctx, input_ids, input_ids->ne[0] * input_ids->ne[1]);

        input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
        auto embedding = ggml_get_rows(ctx, weight, input_ids);
        embedding = ggml_reshape_3d(ctx, embedding, embedding->ne[0], embedding->ne[1] / n, n);

        // [N, n_token, embedding_dim]
        return embedding;
    }
};

class Conv2d : public UnaryBlock {
protected:
    int64_t in_channels;
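
For illustration, a sketch of registering the new Embedding block inside a GGMLBlock subclass, mirroring how other blocks in this header are wired up; the block name and the vocabulary/hidden sizes are assumed placeholders:

// Hypothetical usage (sizes assumed): a 49408 x 768 token-embedding table.
blocks["token_embedding"] = std::shared_ptr<GGMLBlock>(new Embedding(49408, 768));
// ... later, in forward(), with input_ids of shape [N, n_token]:
auto tok_embed = std::dynamic_pointer_cast<Embedding>(blocks["token_embedding"]);
struct ggml_tensor* h = tok_embed->forward(ctx, input_ids); // [N, n_token, 768]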
@@ -1202,58 +1446,48 @@ class MultiheadAttention : public GGMLBlock {
protected:
    int64_t embed_dim;
    int64_t n_head;
    bool bias;
    bool mask;
    std::string q_proj_name;
    std::string k_proj_name;
    std::string v_proj_name;
    std::string out_proj_name;

public:
    MultiheadAttention(int64_t embed_dim,
                       int64_t n_head,
                       bool bias = true)
                       bool qkv_proj_bias = true,
                       bool out_proj_bias = true,
                       std::string q_proj_name = "q_proj",
                       std::string k_proj_name = "k_proj",
                       std::string v_proj_name = "v_proj",
                       std::string out_proj_name = "out_proj")
        : embed_dim(embed_dim),
          n_head(n_head),
          bias(bias) {
        blocks["q_proj"] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
        blocks["k_proj"] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
        blocks["v_proj"] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
        blocks["out_proj"] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, bias));
          q_proj_name(q_proj_name),
          k_proj_name(k_proj_name),
          v_proj_name(v_proj_name),
          out_proj_name(out_proj_name) {
        blocks[q_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
        blocks[k_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
        blocks[v_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, qkv_proj_bias));
        blocks[out_proj_name] = std::shared_ptr<GGMLBlock>(new Linear(embed_dim, embed_dim, out_proj_bias));
    }

    // x: [N, n_token, embed_dim]
    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, bool mask = false) {
        auto q_proj = std::dynamic_pointer_cast<Linear>(blocks["q_proj"]);
        auto k_proj = std::dynamic_pointer_cast<Linear>(blocks["k_proj"]);
        auto v_proj = std::dynamic_pointer_cast<Linear>(blocks["v_proj"]);
        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out_proj"]);

        int64_t N = x->ne[2];
        int64_t n_token = x->ne[1];
        int64_t d_head = embed_dim / n_head;
        auto q_proj = std::dynamic_pointer_cast<Linear>(blocks[q_proj_name]);
        auto k_proj = std::dynamic_pointer_cast<Linear>(blocks[k_proj_name]);
        auto v_proj = std::dynamic_pointer_cast<Linear>(blocks[v_proj_name]);
        auto out_proj = std::dynamic_pointer_cast<Linear>(blocks[out_proj_name]);

        struct ggml_tensor* q = q_proj->forward(ctx, x);
        q = ggml_reshape_4d(ctx, q, d_head, n_head, n_token, N); // [N, n_token, n_head, d_head]
        q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
        q = ggml_reshape_3d(ctx, q, d_head, n_token, n_head * N); // [N * n_head, n_token, d_head]

        struct ggml_tensor* k = k_proj->forward(ctx, x);
        k = ggml_reshape_4d(ctx, k, d_head, n_head, n_token, N); // [N, n_token, n_head, d_head]
        k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_token, d_head]
        k = ggml_reshape_3d(ctx, k, d_head, n_token, n_head); // [N * n_head, n_token, d_head]

        struct ggml_tensor* v = v_proj->forward(ctx, x);
        v = ggml_reshape_4d(ctx, v, d_head, n_head, n_token, N); // [N, n_token, n_head, d_head]
        v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, n_token]
        v = ggml_reshape_3d(ctx, v, n_token, d_head, n_head * N); // [N * n_head, d_head, n_token]

        struct ggml_tensor* kqv = ggml_nn_attention(ctx, q, k, v, mask); // [N * n_head, n_token, d_head]
        x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, mask); // [N, n_token, embed_dim]

        kqv = ggml_reshape_4d(ctx, kqv, d_head, n_token, n_head, N);
        kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_head]

        x = ggml_reshape_2d(ctx, kqv, d_head * n_head, n_token * N); // [N * n_token, d_head * n_head]

        x = out_proj->forward(ctx, x);
        x = out_proj->forward(ctx, x); // [N, n_token, embed_dim]
        return x;
    }
};

#endif // __GGML_EXTEND__HPP__
#endif // __GGML_EXTEND__HPP__
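
For illustration, a sketch of the extended MultiheadAttention constructor, whose new arguments let a model map the four projections onto differently named checkpoint tensors; the dimensions, bias choices, and names are assumed placeholders:

// Hypothetical usage: 768-dim, 12 heads, bias on the QKV projections but not on the
// output projection, stored under "to_q"/"to_k"/"to_v"/"to_out" parameter prefixes.
blocks["attn"] = std::shared_ptr<GGMLBlock>(
    new MultiheadAttention(768, 12, true, false, "to_q", "to_k", "to_v", "to_out"));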