Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)

Merge commit 'd7f5f4e578' into concedo_experimental

# Conflicts:
#	.github/ISSUE_TEMPLATE/010-bug-compilation.yml
#	.github/ISSUE_TEMPLATE/011-bug-results.yml
#	.github/labeler.yml
#	.github/workflows/build.yml
#	docs/docker.md
#	examples/simple-chat/simple-chat.cpp
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-metal/CMakeLists.txt
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	scripts/sync-ggml.last
#	tests/test-backend-ops.cpp

This commit is contained in commit ac0366ad9b.

18 changed files with 660 additions and 47 deletions
@@ -320,6 +320,13 @@
 extern "C" {
 #endif

+    // Function type used in fatal error callbacks
+    typedef void (*ggml_abort_callback_t)(const char * error_message);
+
+    // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
+    // Returns the old callback for chaining
+    GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
+
     GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
     GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);

@@ -488,6 +495,7 @@ extern "C" {
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
        GGML_OP_IM2COL_BACK,
+       GGML_OP_CONV_2D,
        GGML_OP_CONV_2D_DW,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,

@@ -1826,6 +1834,17 @@ extern "C" {
            struct ggml_tensor  * b,
            int                   stride);

+    GGML_API struct ggml_tensor * ggml_conv_2d_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // convolution kernel [KW, KH, IC, OC]
+            struct ggml_tensor  * b,   // input data [W, H, C, N]
+            int                   s0,  // stride dimension 0
+            int                   s1,  // stride dimension 1
+            int                   p0,  // padding dimension 0
+            int                   p1,  // padding dimension 1
+            int                   d0,  // dilation dimension 0
+            int                   d1); // dilation dimension 1
+
    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
        GGML_OP_POOL_AVG,

@@ -1868,6 +1887,12 @@ extern "C" {
    enum ggml_scale_mode {
        GGML_SCALE_MODE_NEAREST  = 0,
        GGML_SCALE_MODE_BILINEAR = 1,
+
+       GGML_SCALE_MODE_COUNT
+   };
+
+   enum ggml_scale_flag {
+       GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
    };

    // interpolate

@@ -1880,14 +1905,26 @@ extern "C" {

    // interpolate
    // interpolate scale to specified dimensions
-   GGML_API struct ggml_tensor * ggml_upscale_ext(
+   GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   ne0,
            int                   ne1,
            int                   ne2,
            int                   ne3,
-           enum ggml_scale_mode  mode);
+           enum ggml_scale_mode  mode),
+       "use ggml_interpolate instead");
+
+   // Up- or downsamples the input to the specified size.
+   // 2D scale modes (eg. bilinear) are applied to the first two dimensions.
+   GGML_API struct ggml_tensor * ggml_interpolate(
+           struct ggml_context * ctx,
+           struct ggml_tensor  * a,
+           int64_t               ne0,
+           int64_t               ne1,
+           int64_t               ne2,
+           int64_t               ne3,
+           uint32_t              mode); // ggml_scale_mode [ | ggml_scale_flag...]

    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
    GGML_API struct ggml_tensor * ggml_pad(
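As a side note, a minimal sketch of how the new abort-callback hook could be used from application code; the logger function and helper names below are illustrative, not part of the patch:

    // Installs a callback that logs the abort message and then forwards to
    // whatever callback was registered before (the return value enables chaining).
    #include <stdio.h>
    #include "ggml.h"

    static ggml_abort_callback_t g_prev_abort_cb = NULL; // hypothetical helper state

    static void my_abort_logger(const char * error_message) {
        fprintf(stderr, "[my-app] ggml abort: %s\n", error_message);
        if (g_prev_abort_cb) {
            g_prev_abort_cb(error_message); // keep the previously installed behavior
        }
    }

    void install_abort_logger(void) {
        // ggml_set_abort_callback returns the old callback for chaining;
        // passing NULL later would restore the default stdout/stderr behavior.
        g_prev_abort_cb = ggml_set_abort_callback(my_abort_logger);
    }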
@@ -1198,7 +1198,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     }
 }

-static void ggml_compute_forward_mul_mat(
+void ggml_compute_forward_mul_mat(
        const struct ggml_compute_params * params,
              struct ggml_tensor * dst) {

@@ -1880,6 +1880,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
            {
                ggml_compute_forward_im2col_back_f32(params, tensor);
            } break;
+       case GGML_OP_CONV_2D:
+           {
+               ggml_compute_forward_conv_2d(params, tensor);
+           } break;
        case GGML_OP_CONV_2D_DW:
            {
                ggml_compute_forward_conv_2d_dw(params, tensor);

@@ -2242,6 +2246,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_IM2COL:
        case GGML_OP_IM2COL_BACK:
+       case GGML_OP_CONV_2D:
        case GGML_OP_CONV_2D_DW:
        case GGML_OP_CONV_TRANSPOSE_1D:
        case GGML_OP_CONV_TRANSPOSE_2D:

@@ -2775,6 +2780,10 @@ struct ggml_cplan ggml_graph_plan(
                        GGML_ABORT("fatal error");
                    }
                } break;
+           case GGML_OP_CONV_2D:
+               {
+                   cur = GGML_IM2COL_WORK_SIZE;
+               } break;
            case GGML_OP_CONV_TRANSPOSE_2D:
                {
                    const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -3,6 +3,7 @@
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include "binary-ops.h"
+#include "ggml.h"
 #include "unary-ops.h"
 #include "vec.h"

@@ -6545,6 +6546,186 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }

+static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
+                              void * a, void * b, float * c) {
+    const ggml_type_traits * traits = ggml_get_type_traits(type);
+    struct ggml_tensor src1 = {};
+    src1.type = type;
+    src1.ne[0] = k;
+    src1.ne[1] = m;
+    src1.ne[2] = 1;
+    src1.ne[3] = 1;
+    src1.nb[0] = traits->type_size;
+    src1.nb[1] = k * traits->type_size;
+    src1.nb[2] = src1.nb[1];
+    src1.nb[3] = src1.nb[2];
+    src1.data = a;
+
+    struct ggml_tensor src0 = {};
+    src0.type = type;
+    src0.ne[0] = k;
+    src0.ne[1] = n;
+    src0.ne[2] = 1;
+    src0.ne[3] = 1;
+    src0.nb[0] = traits->type_size;
+    src0.nb[1] = k * traits->type_size;
+    src0.nb[2] = src0.nb[1];
+    src0.nb[3] = src0.nb[2];
+    src0.data = b;
+
+    struct ggml_tensor dst = {};
+    dst.ne[0] = n;
+    dst.ne[1] = m;
+    dst.ne[2] = 1;
+    dst.ne[3] = 1;
+    dst.nb[0] = sizeof(float);
+    dst.nb[1] = n * sizeof(float);
+    dst.nb[2] = dst.nb[1];
+    dst.nb[3] = dst.nb[2];
+    dst.data = c;
+    dst.src[0] = &src0;
+    dst.src[1] = &src1;
+
+    ggml_compute_forward_mul_mat(params, &dst);
+}
+
+// ggml_compute_forward_conv_2d
+
+static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
+                                              const ggml_tensor *         kernel,  // [KW, KH, IC, OC]
+                                              const ggml_tensor *         src,     // [W, H, C, N]
+                                              ggml_tensor *               dst,     // [OW, OH, OC, N]
+                                              ggml_type                   kernel_type) {
+
+    GGML_ASSERT(ggml_is_contiguous(kernel));
+    GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32);
+    GGML_ASSERT(kernel->type == kernel_type);
+
+    const ggml_type_traits * traits = ggml_get_type_traits(kernel_type);
+
+    const int32_t stride_x   = dst->op_params[0];
+    const int32_t stride_y   = dst->op_params[1];
+    const int32_t pad_x      = dst->op_params[2];
+    const int32_t pad_y      = dst->op_params[3];
+    const int32_t dilation_x = dst->op_params[4];
+    const int32_t dilation_y = dst->op_params[5];
+
+    const int64_t c_in  = src->ne[2];
+    const int64_t c_out = kernel->ne[3];
+    GGML_ASSERT(c_in == kernel->ne[2]);
+
+    const int64_t src_w = src->ne[0];
+    const int64_t src_h = src->ne[1];
+    const int64_t knl_w = kernel->ne[0];
+    const int64_t knl_h = kernel->ne[1];
+    const int64_t dst_w = dst->ne[0];
+    const int64_t dst_h = dst->ne[1];
+
+    const float * src_data = (float *) src->data;
+    void *        knl_data = kernel->data;
+    float *       dst_data = (float *) dst->data;
+
+    const int64_t knl_n       = knl_w * knl_h * c_in;
+    const int64_t patch_total = dst->ne[3] * dst_w * dst_h;
+
+    const int64_t space_per_patch   = knl_n * traits->type_size + c_out * sizeof(float);
+    const int64_t batch_size        = params->wsize / space_per_patch;
+    const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
+    const int64_t batch_n           = (patch_total + patches_per_batch - 1) / patches_per_batch;
+
+    GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);
+
+    void * tmp = params->wdata;
+
+    for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {
+
+        const int64_t patch_start_batch = batch_i * patches_per_batch;
+        const int64_t patch_end_batch   = std::min(patch_start_batch + patches_per_batch,
+                                                   patch_total);
+        const int64_t patch_n           = patch_end_batch - patch_start_batch;
+
+        const int64_t patch_per_thread  = (patch_n + params->nth - 1) / params->nth;
+        const int64_t patch_start       = patch_start_batch + params->ith * patch_per_thread;
+        const int64_t patch_end         = std::min(patch_start + patch_per_thread, patch_end_batch);
+
+        //im2col for a patch
+        for (int64_t p = patch_start; p < patch_end; ++p) {
+            const int64_t batch_n = p / (dst_w * dst_h);
+            const int64_t src_x   = (p / dst_w) % dst_h;
+            const int64_t src_y   = p % dst_w;
+
+            const float * src_base = (const float *)((const char *)src_data + batch_n * src->nb[3]);
+            char *        dst_row  = (char *) tmp + (p % patches_per_batch) * knl_n * traits->type_size;
+
+            for (int64_t ic = 0; ic < c_in; ++ic) {
+                for (int64_t ky = 0; ky < knl_h; ++ky) {
+                    for (int64_t kx = 0; kx < knl_w; ++kx) {
+                        const int64_t sy = src_x * stride_y + ky * dilation_y - pad_y;
+                        const int64_t sx = src_y * stride_x + kx * dilation_x - pad_x;
+
+                        int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;
+
+                        float src_val;
+                        if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
+                            src_val = 0.0f;
+                        } else {
+                            const float * src_ptr = (const float *)((const char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
+                            src_val = *src_ptr;
+                        }
+
+                        char * element_ptr = dst_row + dst_idx * traits->type_size;
+                        if (kernel_type == GGML_TYPE_F32) {
+                            *(float *) element_ptr = src_val;
+                        } else if (kernel_type == GGML_TYPE_F16) {
+                            *(ggml_fp16_t *) element_ptr = GGML_CPU_FP32_TO_FP16(src_val);
+                        }
+                    }
+                }
+            }
+        } // patches handled by this thread
+
+        ggml_barrier(params->threadpool);
+
+        float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n * traits->type_size);
+
+        GGML_ASSERT(gemm_output + patch_n * c_out <= (float*)tmp + params->wsize);
+
+        // GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
+        ggml_call_mul_mat(kernel_type, params, patch_n, c_out, knl_n, tmp, knl_data, gemm_output);
+
+        ggml_barrier(params->threadpool);
+
+        //permute back [OC, N, OH, OW] to [N, OC, OH, OW]
+        const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth;
+        const int64_t permute_start      = params->ith * permute_per_thread;
+        const int64_t permute_end        = std::min(permute_start + permute_per_thread, patch_n);
+
+        for (int64_t i = permute_start; i < permute_end; ++i) {
+            const int64_t p       = patch_start_batch + i;
+            const int64_t batch_n = p / (dst_w * dst_h);
+            const int64_t dst_y   = (p / dst_w) % dst_h;
+            const int64_t dst_x   = p % dst_w;
+
+            for (int64_t oc = 0; oc < c_out; ++oc) {
+                const float value = gemm_output[i * c_out + oc];
+                float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_n * dst->nb[3]);
+                *dst_ptr = value;
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_conv_2d(
+        const ggml_compute_params * params,
+        ggml_tensor *               dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type);
+}
+
 // ggml_compute_forward_conv_transpose_2d

 void ggml_compute_forward_conv_transpose_2d(
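Note: the CONV_2D path above writes im2col patches into the work buffer in batches and runs one GEMM per batch. As a rough sketch of the shape bookkeeping, assuming the usual convolution output-size relation (the body of ggml_calc_conv_output_size is not shown in this diff, so treat the formula as an assumption); variable names here are illustrative:

    // Sketch only: how output sizes and GEMM shapes relate for the im2col + GEMM path.
    #include <stdint.h>

    static int64_t conv_out_size(int64_t in, int64_t ks, int s, int p, int d) {
        // standard convolution output-size formula (assumed)
        return (in + 2 * p - d * (ks - 1) - 1) / s + 1;
    }

    // For a kernel [KW, KH, IC, OC] and input [W, H, IC, N]:
    //   OW = conv_out_size(W, KW, s0, p0, d0)
    //   OH = conv_out_size(H, KH, s1, p1, d1)
    // im2col produces patch_total = N*OW*OH rows of length knl_n = KW*KH*IC,
    // and the GEMM computes [patch_total, knl_n] x [knl_n, OC] -> [patch_total, OC],
    // which is then permuted back into the [OW, OH, OC, N] destination layout.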
@@ -7095,12 +7276,13 @@ static void ggml_compute_forward_upscale_f32(

     GGML_TENSOR_UNARY_OP_LOCALS

-    const float sf0 = (float)ne0/src0->ne[0];
-    const float sf1 = (float)ne1/src0->ne[1];
-    const float sf2 = (float)ne2/src0->ne[2];
-    const float sf3 = (float)ne3/src0->ne[3];
+    float sf0 = (float)ne0/src0->ne[0];
+    float sf1 = (float)ne1/src0->ne[1];
+    float sf2 = (float)ne2/src0->ne[2];
+    float sf3 = (float)ne3/src0->ne[3];

-    const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
+    const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
+    const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);

     if (mode == GGML_SCALE_MODE_NEAREST) {
         for (int64_t i3 = 0; i3 < ne3; i3++) {

@@ -7121,8 +7303,12 @@ static void ggml_compute_forward_upscale_f32(
            }
        }
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
-       // setting a pixel offset of 0 would replicate the behavior of pytorch interpolate with align_corners=True
-       const float pixel_offset = 0.5f;
+       float pixel_offset = 0.5f;
+       if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
+           pixel_offset = 0.0f;
+           sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1);
+           sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1);
+       }
+
        for (int64_t i3 = 0; i3 < ne3; i3++) {
            const int64_t i03 = i3 / sf3;
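For reference, the bilinear source-coordinate mapping used above, and its align-corners variant, restated in LaTeX (this is just the code's arithmetic, not new behavior):

    % default bilinear mapping, pixel offset o = 1/2
    x_{\mathrm{src}} = \frac{x_{\mathrm{dst}} + o}{s} - o,
    \qquad s = \frac{W_{\mathrm{dst}}}{W_{\mathrm{src}}}, \quad o = \tfrac{1}{2}

    % with GGML_SCALE_FLAG_ALIGN_CORNERS
    o = 0, \qquad s = \frac{W_{\mathrm{dst}} - 1}{W_{\mathrm{src}} - 1}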
@@ -20,6 +20,9 @@

 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

+// Work buffer size for im2col operations in CONV2D
+#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024)
+
 #ifdef __cplusplus
 extern "C" {
 #endif

@@ -65,6 +68,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);

@@ -107,6 +111,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);

 #ifdef __cplusplus
 }
@@ -138,6 +138,7 @@ void quantize_q4_0(device const float * src, device block_q4_0 & dst) {
 }

 void quantize_q4_1(device const float * src, device block_q4_1 & dst) {
+#pragma METAL fp math_mode(safe)
     float min = FLT_MAX;
     float max = -FLT_MAX;

@@ -203,6 +204,7 @@ void quantize_q5_0(device const float * src, device block_q5_0 & dst) {
 }

 void quantize_q5_1(device const float * src, device block_q5_1 & dst) {
+#pragma METAL fp math_mode(safe)
     float max = src[0];
     float min = src[0];

@@ -239,6 +241,7 @@ void quantize_q5_1(device const float * src, device block_q5_1 & dst) {
 }

 void quantize_iq4_nl(device const float * src, device block_iq4_nl & dst) {
+#pragma METAL fp math_mode(safe)
     float amax = 0.0f; // absolute max
     float max = 0.0f;

ggml/src/ggml-opencl/kernels/glu.cl (new file, 201 lines)

@@ -0,0 +1,201 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#define GELU_COEF_A     0.044715f
#define SQRT_2_OVER_PI  0.79788456080286535587989211986876f

//------------------------------------------------------------------------------
// geglu
//------------------------------------------------------------------------------
kernel void kernel_geglu(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb01,
        ulong nb11,
        int ne0,
        ulong nb1,
        int ne00_off,
        int ne10_off
) {
    src0 = (global char*)((global char*)src0 + offset0);
    src1 = (global char*)((global char*)src1 + offset1);
    dst  = (global char*)((global char*)dst  + offsetd);

    global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
    global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
    global float * dst_row  = (global float *) ((global char *) dst  + get_group_id(0)*nb1);

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const float x0 = src0_row[i0];
        const float x1 = src1_row[i0];

        const float gelu = 0.5f*x0*(1.0f + tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0)));

        dst_row[i0] = gelu*x1;
    }
}

kernel void kernel_geglu_f16(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb01,
        ulong nb11,
        int ne0,
        ulong nb1,
        int ne00_off,
        int ne10_off
) {
    src0 = (global char*)((global char*)src0 + offset0);
    src1 = (global char*)((global char*)src1 + offset1);
    dst  = (global char*)((global char*)dst  + offsetd);

    global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
    global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
    global half * dst_row  = (global half *) ((global char *) dst  + get_group_id(0)*nb1);

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const half x0 = src0_row[i0];
        const half x1 = src1_row[i0];

        const half gelu = 0.5f*x0*(1.0f + tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0)));

        dst_row[i0] = gelu*x1;
    }
}

//------------------------------------------------------------------------------
// reglu
//------------------------------------------------------------------------------
kernel void kernel_reglu(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb01,
        ulong nb11,
        int ne0,
        ulong nb1,
        int ne00_off,
        int ne10_off
) {
    src0 = (global char*)((global char*)src0 + offset0);
    src1 = (global char*)((global char*)src1 + offset1);
    dst  = (global char*)((global char*)dst  + offsetd);

    global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
    global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
    global float * dst_row  = (global float *) ((global char *) dst  + get_group_id(0)*nb1);

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const float x0 = src0_row[i0];
        const float x1 = src1_row[i0];

        dst_row[i0] = x0*x1*(x0 > 0.0f);
    }
}

kernel void kernel_reglu_f16(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb01,
        ulong nb11,
        int ne0,
        ulong nb1,
        int ne00_off,
        int ne10_off
) {
    src0 = (global char*)((global char*)src0 + offset0);
    src1 = (global char*)((global char*)src1 + offset1);
    dst  = (global char*)((global char*)dst  + offsetd);

    global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
    global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
    global half * dst_row  = (global half *) ((global char *) dst  + get_group_id(0)*nb1);

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const half x0 = src0_row[i0];
        const half x1 = src1_row[i0];

        dst_row[i0] = x0*x1*(x0 > 0.0f);
    }
}

//------------------------------------------------------------------------------
// swiglu
//------------------------------------------------------------------------------
kernel void kernel_swiglu(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb01,
        ulong nb11,
        int ne0,
        ulong nb1,
        int ne00_off,
        int ne10_off
) {
    src0 = (global char*)((global char*)src0 + offset0);
    src1 = (global char*)((global char*)src1 + offset1);
    dst  = (global char*)((global char*)dst  + offsetd);

    global float * src0_row = (global float *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
    global float * src1_row = (global float *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
    global float * dst_row  = (global float *) ((global char *) dst  + get_group_id(0)*nb1);

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const float x0 = src0_row[i0];
        const float x1 = src1_row[i0];

        const float silu = x0 / (1.0f + exp(-x0));

        dst_row[i0] = silu*x1;
    }
}

kernel void kernel_swiglu_f16(
        global char * src0,
        ulong offset0,
        global char * src1,
        ulong offset1,
        global char * dst,
        ulong offsetd,
        ulong nb01,
        ulong nb11,
        int ne0,
        ulong nb1,
        int ne00_off,
        int ne10_off
) {
    src0 = (global char*)((global char*)src0 + offset0);
    src1 = (global char*)((global char*)src1 + offset1);
    dst  = (global char*)((global char*)dst  + offsetd);

    global half * src0_row = (global half *) ((global char *) src0 + get_group_id(0)*nb01) + ne00_off;
    global half * src1_row = (global half *) ((global char *) src1 + get_group_id(0)*nb11) + ne10_off;
    global half * dst_row  = (global half *) ((global char *) dst  + get_group_id(0)*nb1);

    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
        const half x0 = src0_row[i0];
        const half x1 = src1_row[i0];

        const half silu = x0 / (1.0f + exp(-x0));

        dst_row[i0] = silu*x1;
    }
}
@@ -60,7 +60,8 @@ kernel void kernel_upscale_bilinear(
     float sf0,
     float sf1,
     float sf2,
-    float sf3
+    float sf3,
+    float pixel_offset
 ) {
     global const char * src_base = (global const char *)p_src0 + off_src0;
     global float * dst_base = (global float *)((global char *)p_dst + off_dst);

@@ -80,8 +81,6 @@ kernel void kernel_upscale_bilinear(
     int i02_src = (int)(i12_dst / sf2);
     int i03_src = (int)(i13_dst / sf3);

-    const float pixel_offset = 0.5f;
-
     float y_src_f = ((float)i11_dst + pixel_offset) / sf1 - pixel_offset;
     long y0_src = (long)floor(y_src_f);
     long y1_src = y0_src + 1;
@@ -568,14 +568,14 @@ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, co
     }
     float iscale = nmax/(max - min);
     float scale = 1/iscale;
-    float best_mad = 0;
+    float best_error = 0;
     for (int i = 0; i < n; ++i) {
         int l = nearest_int(iscale*(x[i] - min));
         L[i] = MAX(0, MIN(nmax, l));
         float diff = scale * L[i] + min - x[i];
         diff = use_mad ? fabsf(diff) : diff * diff;
         float w = weights[i];
-        best_mad += w * diff;
+        best_error += w * diff;
     }
     if (nstep < 1) {
         *the_min = -min;

@@ -601,18 +601,18 @@ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, co
             this_min = 0;
             this_scale = sum_xl / sum_l2;
         }
-        float mad = 0;
+        float cur_error = 0;
         for (int i = 0; i < n; ++i) {
             float diff = this_scale * Laux[i] + this_min - x[i];
             diff = use_mad ? fabsf(diff) : diff * diff;
             float w = weights[i];
-            mad += w * diff;
+            cur_error += w * diff;
         }
-        if (mad < best_mad) {
+        if (cur_error < best_error) {
             for (int i = 0; i < n; ++i) {
                 L[i] = Laux[i];
             }
-            best_mad = mad;
+            best_error = cur_error;
             scale = this_scale;
             min = this_min;
         }
@@ -447,6 +447,7 @@ struct vk_device_struct {

     // [src/dst 0=fp32,1=fp16]
     vk_pipeline pipeline_gelu[2];
+    vk_pipeline pipeline_gelu_erf[2];
     vk_pipeline pipeline_gelu_quick[2];
     vk_pipeline pipeline_silu[2];
     vk_pipeline pipeline_relu[2];

@@ -2777,6 +2778,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
         ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);

     CREATE_UNARY(gelu)
+    CREATE_UNARY(gelu_erf)
     CREATE_UNARY(gelu_quick)
     CREATE_UNARY(silu)
     CREATE_UNARY(relu)

@@ -5988,7 +5990,30 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
     if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
         ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
     } else {
-        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+        // Split based on number of ids, to fit in shared memory
+        const uint32_t nei0 = (uint32_t)src2->ne[0];
+        const uint32_t nei1 = (uint32_t)src2->ne[1];
+
+        GGML_ASSERT(nei0 <= 4096);
+        const uint32_t split_size = std::min(nei1, 4096u / nei0);
+
+        ggml_tensor src1_copy = *src1;
+        ggml_tensor src2_copy = *src2;
+        ggml_tensor dst_copy = *dst;
+
+        for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
+            const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
+
+            src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
+            src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
+            dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
+
+            src1_copy.ne[2] = n_tokens;
+            src2_copy.ne[1] = n_tokens;
+            dst_copy.ne[2] = n_tokens;
+
+            ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+        }
     }
 }

@@ -6505,6 +6530,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_silu[dst->type == GGML_TYPE_F16];
         case GGML_UNARY_OP_GELU:
             return ctx->device->pipeline_gelu[dst->type == GGML_TYPE_F16];
+        case GGML_UNARY_OP_GELU_ERF:
+            return ctx->device->pipeline_gelu_erf[dst->type == GGML_TYPE_F16];
         case GGML_UNARY_OP_GELU_QUICK:
             return ctx->device->pipeline_gelu_quick[dst->type == GGML_TYPE_F16];
         case GGML_UNARY_OP_RELU:

@@ -8851,6 +8878,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
         case GGML_UNARY_OP_GELU:
+        case GGML_UNARY_OP_GELU_ERF:
        case GGML_UNARY_OP_GELU_QUICK:
        case GGML_UNARY_OP_RELU:
        case GGML_UNARY_OP_TANH:

@@ -9096,6 +9124,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
        switch (ggml_get_unary_op(node)) {
        case GGML_UNARY_OP_SILU:
        case GGML_UNARY_OP_GELU:
+       case GGML_UNARY_OP_GELU_ERF:
        case GGML_UNARY_OP_GELU_QUICK:
        case GGML_UNARY_OP_RELU:
        case GGML_UNARY_OP_TANH:

@@ -9313,6 +9342,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
        switch (ggml_get_unary_op(tensor)) {
        case GGML_UNARY_OP_SILU:
        case GGML_UNARY_OP_GELU:
+       case GGML_UNARY_OP_GELU_ERF:
        case GGML_UNARY_OP_GELU_QUICK:
        case GGML_UNARY_OP_RELU:
        case GGML_UNARY_OP_TANH:

@@ -10119,6 +10149,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_GELU:
+               case GGML_UNARY_OP_GELU_ERF:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_RELU:

@@ -10151,9 +10182,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            ggml_type src0_type = op->src[0]->type;
            ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
            const vk_device& device = ggml_vk_get_device(ctx->device);
-           if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
-               // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
-               return false;
+           if (op->op == GGML_OP_MUL_MAT_ID) {
+               if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
+                   // If there's not enough shared memory for row_ids and the result tile, fallback to CPU
+                   return false;
+               }
+               // Check against size of shared memory variable
+               if (op->src[2]->ne[0] > 4096) {
+                   return false;
+               }
            }
            switch (src0_type) {
                case GGML_TYPE_F32:

@@ -10859,6 +10896,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
            case GGML_UNARY_OP_GELU:
                tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]);
                break;
+           case GGML_UNARY_OP_GELU_ERF:
+               tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]);
+               break;
            case GGML_UNARY_OP_GELU_QUICK:
                tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]);
                break;
ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp (new file, 39 lines)

@@ -0,0 +1,39 @@
#version 450

#include "generic_head.comp"
#include "types.comp"

#extension GL_EXT_control_flow_attributes : enable

layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

void main() {
    // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation
    // ref: https://www.johndcook.com/blog/python_erf/
    const float p_erf  = 0.3275911f;
    const float a1_erf = 0.254829592f;
    const float a2_erf = -0.284496736f;
    const float a3_erf = 1.421413741f;
    const float a4_erf = -1.453152027f;
    const float a5_erf = 1.061405429f;

    const float SQRT_2_INV = 0.70710678118654752440084436210484f;
    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;

    if (i >= p.KX) {
        return;
    }

    const float a = float(data_a[i]);
    const float a_div_sqr2 = a * SQRT_2_INV;
    const float sign_x = sign(a_div_sqr2);
    const float x = abs(a_div_sqr2);
    const float t = 1.0f / (1.0f + p_erf * x);
    const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x);
    const float erf_approx = sign_x * y;

    data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx));
}
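For reference, the approximation the shader evaluates (Abramowitz and Stegun 7.1.26), restated in LaTeX; this simply transcribes the arithmetic above:

    % erf approximation with t = 1 / (1 + p |x|)
    \mathrm{erf}(x) \approx \operatorname{sign}(x)\,\Bigl[\,1 - \bigl(a_1 t + a_2 t^2 + a_3 t^3 + a_4 t^4 + a_5 t^5\bigr)\, e^{-x^2}\Bigr]

    % applied to GELU with x = a / \sqrt{2}
    \mathrm{GELU}(a) = \tfrac{1}{2}\, a \,\bigl(1 + \mathrm{erf}(a/\sqrt{2})\bigr)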
@@ -588,6 +588,8 @@ void process_shaders() {

     string_to_spv("gelu_f16",       "gelu.comp",       {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("gelu_f32",       "gelu.comp",       {{"A_TYPE", "float"},     {"D_TYPE", "float"}});
+    string_to_spv("gelu_erf_f16",   "gelu_erf.comp",   {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("gelu_erf_f32",   "gelu_erf.comp",   {{"A_TYPE", "float"},     {"D_TYPE", "float"}});
     string_to_spv("gelu_quick_f16", "gelu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"},     {"D_TYPE", "float"}});
     string_to_spv("silu_f16",       "silu.comp",       {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});

ggml/src/ggml.c
104
ggml/src/ggml.c
|
@ -203,19 +203,34 @@ void ggml_print_backtrace(void) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static ggml_abort_callback_t g_abort_callback = NULL;
|
||||||
|
|
||||||
|
// Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
|
||||||
|
GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback) {
|
||||||
|
ggml_abort_callback_t ret_val = g_abort_callback;
|
||||||
|
g_abort_callback = callback;
|
||||||
|
return ret_val;
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
fprintf(stderr, "%s:%d: ", file, line);
|
char message[2048];
|
||||||
|
int offset = snprintf(message, sizeof(message), "%s:%d: ", file, line);
|
||||||
|
|
||||||
va_list args;
|
va_list args;
|
||||||
va_start(args, fmt);
|
va_start(args, fmt);
|
||||||
vfprintf(stderr, fmt, args);
|
vsnprintf(message + offset, sizeof(message) - offset, fmt, args);
|
||||||
va_end(args);
|
va_end(args);
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
if (g_abort_callback) {
|
||||||
|
g_abort_callback(message);
|
||||||
|
} else {
|
||||||
|
// default: print error and backtrace to stderr
|
||||||
|
fprintf(stderr, "%s\n", message);
|
||||||
|
ggml_print_backtrace();
|
||||||
|
}
|
||||||
|
|
||||||
ggml_print_backtrace();
|
|
||||||
abort();
|
abort();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -958,6 +973,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"CONV_TRANSPOSE_1D",
|
"CONV_TRANSPOSE_1D",
|
||||||
"IM2COL",
|
"IM2COL",
|
||||||
"IM2COL_BACK",
|
"IM2COL_BACK",
|
||||||
|
"CONV_2D",
|
||||||
"CONV_2D_DW",
|
"CONV_2D_DW",
|
||||||
"CONV_TRANSPOSE_2D",
|
"CONV_TRANSPOSE_2D",
|
||||||
"POOL_1D",
|
"POOL_1D",
|
||||||
|
@ -999,7 +1015,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"GLU",
|
"GLU",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
|
static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
|
||||||
|
|
||||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"none",
|
"none",
|
||||||
|
@ -1057,6 +1073,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"conv_transpose_1d(x)",
|
"conv_transpose_1d(x)",
|
||||||
"im2col(x)",
|
"im2col(x)",
|
||||||
"im2col_back(x)",
|
"im2col_back(x)",
|
||||||
|
"conv_2d(x)",
|
||||||
"conv_2d_dw(x)",
|
"conv_2d_dw(x)",
|
||||||
"conv_transpose_2d(x)",
|
"conv_transpose_2d(x)",
|
||||||
"pool_1d(x)",
|
"pool_1d(x)",
|
||||||
|
@ -1098,7 +1115,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"glu(x)",
|
"glu(x)",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
|
static_assert(GGML_OP_COUNT == 86, "GGML_OP_COUNT != 86");
|
||||||
|
|
||||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||||
|
|
||||||
|
@ -4304,6 +4321,44 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ggml_conv_2d_direct
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_conv_2d_direct(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
|
||||||
|
struct ggml_tensor * b, // input data [W, H, C, N]
|
||||||
|
int s0, // stride dimension 0
|
||||||
|
int s1, // stride dimension 1
|
||||||
|
int p0, // padding dimension 0
|
||||||
|
int p1, // padding dimension 1
|
||||||
|
int d0, // dilation dimension 0
|
||||||
|
int d1) {// dilation dimension 1
|
||||||
|
|
||||||
|
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
||||||
|
//GGML_ASSERT(a->type == b->type);
|
||||||
|
|
||||||
|
int64_t ne[4];
|
||||||
|
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
|
||||||
|
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
|
||||||
|
ne[2] = a->ne[3];
|
||||||
|
ne[3] = b->ne[3];
|
||||||
|
|
||||||
|
struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
|
||||||
|
|
||||||
|
ggml_set_op_params_i32(result, 0, s0);
|
||||||
|
ggml_set_op_params_i32(result, 1, s1);
|
||||||
|
ggml_set_op_params_i32(result, 2, p0);
|
||||||
|
ggml_set_op_params_i32(result, 3, p1);
|
||||||
|
ggml_set_op_params_i32(result, 4, d0);
|
||||||
|
ggml_set_op_params_i32(result, 5, d1);
|
||||||
|
|
||||||
|
result->op = GGML_OP_CONV_2D;
|
||||||
|
result->src[0] = a;
|
||||||
|
result->src[1] = b;
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
// ggml_conv_transpose_2d_p0
|
// ggml_conv_transpose_2d_p0
|
||||||
|
|
||||||
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
|
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
|
||||||
|
@ -4420,24 +4475,21 @@ struct ggml_tensor * ggml_pool_2d_back(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ggml_upscale
|
// ggml_upscale / ggml_interpolate
|
||||||
|
|
||||||
static struct ggml_tensor * ggml_upscale_impl(
|
static struct ggml_tensor * ggml_interpolate_impl(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int ne0,
|
int64_t ne0,
|
||||||
int ne1,
|
int64_t ne1,
|
||||||
int ne2,
|
int64_t ne2,
|
||||||
int ne3,
|
int64_t ne3,
|
||||||
enum ggml_scale_mode mode) {
|
uint32_t mode) {
|
||||||
GGML_ASSERT(a->ne[0] <= ne0);
|
GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT);
|
||||||
GGML_ASSERT(a->ne[1] <= ne1);
|
|
||||||
GGML_ASSERT(a->ne[2] <= ne2);
|
|
||||||
GGML_ASSERT(a->ne[3] <= ne3);
|
|
||||||
|
|
||||||
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
|
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
|
||||||
|
|
||||||
ggml_set_op_params_i32(result, 0, mode);
|
ggml_set_op_params_i32(result, 0, (int32_t)mode);
|
||||||
|
|
||||||
result->op = GGML_OP_UPSCALE;
|
result->op = GGML_OP_UPSCALE;
|
||||||
result->src[0] = a;
|
result->src[0] = a;
|
||||||
|
@ -4450,7 +4502,8 @@ struct ggml_tensor * ggml_upscale(
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int scale_factor,
|
int scale_factor,
|
||||||
enum ggml_scale_mode mode) {
|
enum ggml_scale_mode mode) {
|
||||||
return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode);
|
GGML_ASSERT(scale_factor > 1);
|
||||||
|
return ggml_interpolate_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * ggml_upscale_ext(
|
struct ggml_tensor * ggml_upscale_ext(
|
||||||
|
@ -4461,7 +4514,18 @@ struct ggml_tensor * ggml_upscale_ext(
|
||||||
int ne2,
|
int ne2,
|
||||||
int ne3,
|
int ne3,
|
||||||
enum ggml_scale_mode mode) {
|
enum ggml_scale_mode mode) {
|
||||||
return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
|
return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * ggml_interpolate(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int64_t ne0,
|
||||||
|
int64_t ne1,
|
||||||
|
int64_t ne2,
|
||||||
|
int64_t ne3,
|
||||||
|
uint32_t mode) {
|
||||||
|
return ggml_interpolate_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ggml_pad
|
// ggml_pad
|
||||||
|
|
|
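A minimal usage sketch of the new ggml_interpolate entry point, assuming an existing context ctx and an F32 image tensor img; the wrapper function name is illustrative:

    // Bilinear 2x upsampling; OR'ing GGML_SCALE_FLAG_ALIGN_CORNERS into the mode
    // selects the align-corners coordinate mapping described above.
    struct ggml_tensor * upsample_2x_align_corners(struct ggml_context * ctx, struct ggml_tensor * img) {
        return ggml_interpolate(ctx, img,
                                img->ne[0] * 2,  // new width
                                img->ne[1] * 2,  // new height
                                img->ne[2],      // channels unchanged
                                img->ne[3],      // batch unchanged
                                GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS);
    }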
@@ -250,7 +250,7 @@ bool llama_kv_cache_unified_iswa_context::next() {
 }

 bool llama_kv_cache_unified_iswa_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));

     bool res = true;

@@ -1776,7 +1776,7 @@ bool llama_kv_cache_unified_context::next() {
 }

 bool llama_kv_cache_unified_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));

     // no ubatches -> this is a KV cache update
     if (ubatches.empty()) {

@@ -218,7 +218,7 @@ bool llama_memory_hybrid_context::next() {
 }

 bool llama_memory_hybrid_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));

     bool res = true;

@@ -1071,7 +1071,15 @@ bool llama_memory_recurrent_context::next() {
 }

 bool llama_memory_recurrent_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));
+
+    // no ubatches -> this is an update
+    if (ubatches.empty()) {
+        // recurrent cache never performs updates
+        assert(status == LLAMA_MEMORY_STATUS_NO_UPDATE);
+
+        return true;
+    }

     mem->find_slot(ubatches[i_next]);

@@ -40,3 +40,20 @@ llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_me
     // if either status has an update, then the combined status has an update
     return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE;
 }
+
+bool llama_memory_status_is_fail(llama_memory_status status) {
+    switch (status) {
+        case LLAMA_MEMORY_STATUS_SUCCESS:
+        case LLAMA_MEMORY_STATUS_NO_UPDATE:
+            {
+                return false;
+            }
+        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+            {
+                return true;
+            }
+    }
+
+    return false;
+}

@@ -31,6 +31,9 @@ enum llama_memory_status {
 // useful for implementing hybrid memory types (e.g. iSWA)
 llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);

+// helper function for checking if a memory status indicates a failure
+bool llama_memory_status_is_fail(llama_memory_status status);
+
 // the interface for managing the memory context during batch processing
 // this interface is implemented per memory type. see:
 //   - llama_kv_cache_unified_context