mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00

commit 4c4ce5e808: rewritten checkpoint 1 - before coopmat

59 changed files with 9147 additions and 28724 deletions
@@ -759,7 +759,7 @@ do { \
 #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
 #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
 #else
-static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
+static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {
@@ -1377,7 +1377,10 @@ struct ggml_compute_state {
 
 inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t   v) { for (int i = 0; i < n; ++i) x[i] = v;    }
+inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
 
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
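The new ggml_vec_cpy_i32 mirrors the existing f32 copy helper and is what the i32 set kernel added below relies on. A minimal standalone usage sketch (hypothetical driver, not part of the diff):

// demo of the set/cpy helpers on a 4-element buffer
#include <stdint.h>
#include <stdio.h>

static void vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
static void vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }

int main(void) {
    int32_t a[4], b[4];
    vec_set_i32(4, a, 7);          // a = {7, 7, 7, 7}
    vec_cpy_i32(4, b, a);          // b = copy of a
    printf("%d %d\n", b[0], b[3]); // prints: 7 7
    return 0;
}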
@@ -2429,7 +2432,7 @@ bool ggml_is_numa(void) {
 #endif
 
 #if !defined(HWCAP2_I8MM)
-#define HWCAP2_I8MM 0
+#define HWCAP2_I8MM (1 << 13)
 #endif
 
 static void ggml_init_arm_arch_features(void) {
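On Linux/aarch64, HWCAP2_I8MM is bit 13 of the AT_HWCAP2 auxiliary vector, so the old fallback of 0 made any `hwcap2 & HWCAP2_I8MM` test permanently false on toolchains whose headers lack the constant; the fix supplies the real bit value. A minimal sketch of the kind of runtime probe ggml_init_arm_arch_features performs (its body is not shown in this hunk), assuming a Linux/aarch64 target:

// standalone probe for the i8mm feature bit via the auxiliary vector
#include <sys/auxv.h>
#include <stdio.h>

#ifndef HWCAP2_I8MM
#define HWCAP2_I8MM (1 << 13) // FEAT_I8MM bit, mirrors the fallback above
#endif

int main(void) {
    unsigned long hwcap2 = getauxval(AT_HWCAP2);
    printf("i8mm: %s\n", (hwcap2 & HWCAP2_I8MM) ? "yes" : "no");
    return 0;
}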
@@ -8284,6 +8287,77 @@ static void ggml_compute_forward_set_f32(
     }
 }
+
+static void ggml_compute_forward_set_i32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
+
+    // view src0 and dst with these strides and data offset inbytes during set
+    // nb0 is implicitly element_size because src0 and dst are contiguous
+    size_t nb1     = ((int32_t *) dst->op_params)[0];
+    size_t nb2     = ((int32_t *) dst->op_params)[1];
+    size_t nb3     = ((int32_t *) dst->op_params)[2];
+    size_t offset  = ((int32_t *) dst->op_params)[3];
+    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
+
+    if (!inplace) {
+        if (params->ith == 0) {
+            // memcpy needs to be synchronized across threads to avoid race conditions.
+            // => do it in INIT phase
+            memcpy(
+                ((char *)  dst->data),
+                ((char *) src0->data),
+                ggml_nbytes(dst));
+        }
+        ggml_barrier(params->threadpool);
+    }
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src1);
+    const int nc = src1->ne[0];
+
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
+    // src0 and dst as viewed during set
+    const size_t nb0 = ggml_element_size(src0);
+
+    const int im0 = (ne10 == 0 ? 0 : ne10-1);
+    const int im1 = (ne11 == 0 ? 0 : ne11-1);
+    const int im2 = (ne12 == 0 ? 0 : ne12-1);
+    const int im3 = (ne13 == 0 ? 0 : ne13-1);
+
+    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
+
+    GGML_ASSERT(nb10 == sizeof(int32_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // src0 and dst are viewed with shape of src1 and offset
+        // => same indices
+        const int i3 = ir/(ne12*ne11);
+        const int i2 = (ir - i3*ne12*ne11)/ne11;
+        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
+
+        ggml_vec_cpy_i32(nc,
+                (int32_t *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
+                (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+    }
+}
 
 static void ggml_compute_forward_set(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
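The kernel parallelizes over rows of src1: each thread takes a contiguous band of dr = ceil(nr/nth) rows, and MIN clips the last band so out-of-range rows are never touched. A standalone sketch of that partitioning with hypothetical sizes (nr = 10 rows over nth = 4 threads):

// how the [ir0, ir1) bands cover all rows exactly once
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const int nr  = 10;                 // total rows (hypothetical)
    const int nth = 4;                  // thread count (hypothetical)
    const int dr  = (nr + nth - 1)/nth; // ceil division: 3 rows per thread

    for (int ith = 0; ith < nth; ith++) {
        const int ir0 = dr*ith;
        const int ir1 = MIN(ir0 + dr, nr);
        printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        // prints bands [0,3) [3,6) [6,9) [9,10)
    }
    return 0;
}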
@@ -8295,6 +8369,10 @@ static void ggml_compute_forward_set(
             {
                 ggml_compute_forward_set_f32(params, dst);
             } break;
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_set_i32(params, dst);
+            } break;
         case GGML_TYPE_F16:
         case GGML_TYPE_BF16:
         case GGML_TYPE_Q4_0:
@@ -10475,6 +10553,40 @@ static void ggml_compute_forward_pad(
     }
 }
 
+// ggml_compute_forward_pad_reflect_1d
+
+static void ggml_compute_forward_pad_reflect_1d(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int32_t * opts = (const int32_t *) dst->op_params;
+    const int p0 = opts[0];
+    const int p1 = opts[1];
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+                float * left  = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 +         p0*nb0);
+                float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);
+
+                ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
+
+                for (int i0 = 1; i0 <= p0; i0++) { left[-i0]  = left[i0];   }
+                for (int i0 = 1; i0 <= p1; i0++) { right[i0]  = right[-i0]; }
+            }
+        }
+    }
+}
+
 // ggml_compute_forward_arange
 
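Reflect padding mirrors the row around its first and last elements without repeating them (the same convention as PyTorch's ReflectionPad1d), which is why the fill loops start at i0 = 1 and index left[-i0] and right[i0]. A standalone sketch of the per-row logic with hypothetical sizes ne00 = 4, p0 = 2, p1 = 1:

// reflect-pad one row: src {1,2,3,4} with pads (2,1) -> dst {3,2,1,2,3,4,3}
#include <stdio.h>

int main(void) {
    const int ne00 = 4, p0 = 2, p1 = 1;
    const int ne0  = ne00 + p0 + p1;      // padded width: 7
    const float src[4] = {1, 2, 3, 4};
    float dst[7];

    float * left  = dst + p0;             // first copied element
    float * right = dst + (ne0 - p1 - 1); // last copied element

    for (int i = 0; i < ne00; i++) left[i] = src[i];

    for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0];   } // mirror left edge
    for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; } // mirror right edge

    for (int i = 0; i < ne0; i++) printf("%g ", dst[i]); // 3 2 1 2 3 4 3
    printf("\n");
    return 0;
}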
@@ -12571,6 +12683,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_pad(params, tensor);
             } break;
+        case GGML_OP_PAD_REFLECT_1D:
+            {
+                ggml_compute_forward_pad_reflect_1d(params, tensor);
+            } break;
         case GGML_OP_ARANGE:
             {
                 ggml_compute_forward_arange(params, tensor);
@@ -12913,6 +13029,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             } break;
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
+        case GGML_OP_PAD_REFLECT_1D:
         case GGML_OP_ARANGE:
         case GGML_OP_TIMESTEP_EMBEDDING:
         case GGML_OP_ARGSORT: