rewritten checkpoint 1 - before coopmat

Concedo 2024-12-13 16:55:23 +08:00
commit 4c4ce5e808
59 changed files with 9147 additions and 28724 deletions


@@ -759,7 +759,7 @@ do { \
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];

    for (int i = 0; i < 8; i++) {
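The viewer truncates this hunk; the only change is the const qualifier on the load helper's input pointer. For context, the non-F16C fallback in ggml looks essentially like the sketch below (reconstructed, not quoted from the diff): it widens eight fp16 values through a scratch buffer and then does a single AVX load.

static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];

    // widen each fp16 element to fp32 in a scratch buffer, then load all 8 lanes
    for (int i = 0; i < 8; i++) {
        tmp[i] = GGML_FP16_TO_FP32(x[i]);
    }

    return _mm256_loadu_ps(tmp);
}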
@@ -1377,7 +1377,10 @@ struct ggml_compute_state {
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
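The functional addition in this hunk is ggml_vec_cpy_i32, an int32 row copy mirroring the existing f32 helper. A minimal hypothetical check of its semantics (illustrative, not part of the commit):

#include <assert.h>
#include <stdint.h>

static void check_vec_cpy_i32(void) {
    int32_t src[4] = {1, 2, 3, 4};
    int32_t dst[4] = {0};
    ggml_vec_cpy_i32(4, dst, src);
    assert(dst[3] == 4); // dst now holds {1, 2, 3, 4}
}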
@@ -2429,7 +2432,7 @@ bool ggml_is_numa(void) {
#endif
#if !defined(HWCAP2_I8MM)
#define HWCAP2_I8MM 0
#define HWCAP2_I8MM (1 << 13)
#endif
static void ggml_init_arm_arch_features(void) {
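Replacing the fallback value 0 with (1 << 13), the kernel's actual HWCAP2_I8MM bit, keeps I8MM detection working even when the libc headers predate the flag. A sketch of how such a hwcap bit is typically tested on Linux/aarch64, assuming glibc's getauxval (ggml_init_arm_arch_features does something along these lines):

#include <stdbool.h>
#include <sys/auxv.h> // getauxval, AT_HWCAP2

// HWCAP2_I8MM flags the ARMv8.6 int8 matrix multiply (I8MM) extension.
static bool cpu_has_i8mm(void) {
    return (getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0;
}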
@@ -8284,6 +8287,77 @@ static void ggml_compute_forward_set_f32(
    }
}
static void ggml_compute_forward_set_i32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(ggml_are_same_shape(src0, dst));
    GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));

    // view src0 and dst with these strides and data offset in bytes during set
    // nb0 is implicitly element_size because src0 and dst are contiguous
    size_t nb1     = ((int32_t *) dst->op_params)[0];
    size_t nb2     = ((int32_t *) dst->op_params)[1];
    size_t nb3     = ((int32_t *) dst->op_params)[2];
    size_t offset  = ((int32_t *) dst->op_params)[3];
    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];

    if (!inplace) {
        if (params->ith == 0) {
            // memcpy needs to be synchronized across threads to avoid race conditions.
            // => do it in INIT phase
            memcpy(
                ((char *)  dst->data),
                ((char *) src0->data),
                ggml_nbytes(dst));
        }
        ggml_barrier(params->threadpool);
    }

    const int ith = params->ith;
    const int nth = params->nth;

    const int nr = ggml_nrows(src1);
    const int nc = src1->ne[0];

    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)

    // src0 and dst as viewed during set
    const size_t nb0 = ggml_element_size(src0);

    const int im0 = (ne10 == 0 ? 0 : ne10-1);
    const int im1 = (ne11 == 0 ? 0 : ne11-1);
    const int im2 = (ne12 == 0 ? 0 : ne12-1);
    const int im3 = (ne13 == 0 ? 0 : ne13-1);

    GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));

    GGML_ASSERT(nb10 == sizeof(int32_t));

    // rows per thread
    const int dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

    for (int ir = ir0; ir < ir1; ++ir) {
        // src0 and dst are viewed with shape of src1 and offset
        // => same indices
        const int i3 = ir/(ne12*ne11);
        const int i2 = (ir - i3*ne12*ne11)/ne11;
        const int i1 = (ir - i3*ne12*ne11 - i2*ne11);

        ggml_vec_cpy_i32(nc,
                (int32_t *) ((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + offset),
                (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
    }
}
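This kernel is reached through the existing ggml_set family when both operands are I32. A hedged usage sketch, with context setup omitted and the function name purely illustrative:

static struct ggml_tensor * set_i32_example(struct ggml_context * ctx) {
    // Write the 4-element I32 tensor b into a at an offset of 2 elements.
    // ggml_set_1d() packs nb1/nb2/nb3/offset into op_params, which the kernel
    // above reads back; inplace is false, so src0 is first copied into dst.
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 16);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
    return ggml_set_1d(ctx, a, b, 2*sizeof(int32_t));
}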
static void ggml_compute_forward_set(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {
@@ -8295,6 +8369,10 @@ static void ggml_compute_forward_set(
            {
                ggml_compute_forward_set_f32(params, dst);
            } break;
        case GGML_TYPE_I32:
            {
                ggml_compute_forward_set_i32(params, dst);
            } break;
        case GGML_TYPE_F16:
        case GGML_TYPE_BF16:
        case GGML_TYPE_Q4_0:
@@ -10475,6 +10553,40 @@ static void ggml_compute_forward_pad(
    }
}

// ggml_compute_forward_pad_reflect_1d
static void ggml_compute_forward_pad_reflect_1d(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    const int ith = params->ith;
    const int nth = params->nth;

    const int32_t * opts = (const int32_t *) dst->op_params;
    const int p0 = opts[0];
    const int p1 = opts[1];

    GGML_TENSOR_UNARY_OP_LOCALS

    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
                float * left  = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 +         p0*nb0);
                float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);

                ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));

                for (int i0 = 1; i0 <= p0; i0++) { left[-i0]  = left[i0];  }
                for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
            }
        }
    }
}
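Reflect padding mirrors the neighbors of each edge sample without duplicating the edge itself. A standalone sketch of the same per-row logic, using a hypothetical helper for illustration:

// e.g. src = {1,2,3,4}, p0 = 2, p1 = 1  ->  dst = {3,2,1,2,3,4,3}
static void pad_reflect_1d_row(const float * src, int n, int p0, int p1, float * dst) {
    float * left  = dst + p0;         // first element of the copied row
    float * right = dst + p0 + n - 1; // last element of the copied row

    for (int i = 0; i <  n;  i++) { dst[p0 + i] = src[i];    }
    for (int i = 1; i <= p0; i++) { left[-i]    = left[i];   } // mirror across the left edge
    for (int i = 1; i <= p1; i++) { right[i]    = right[-i]; } // mirror across the right edge
}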
// ggml_compute_forward_arange
@@ -12571,6 +12683,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_pad(params, tensor);
            } break;
        case GGML_OP_PAD_REFLECT_1D:
            {
                ggml_compute_forward_pad_reflect_1d(params, tensor);
            } break;
        case GGML_OP_ARANGE:
            {
                ggml_compute_forward_arange(params, tensor);
@@ -12913,6 +13029,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_ARGSORT:
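The hunk is cut off at GGML_OP_ARGSORT. In ggml_get_n_tasks these ops presumably share one case body that fans the work out across all threads; sketched below as an assumption, not quoted from this diff:

            { // assumed shared body for the cases listed above
                n_tasks = n_threads;
            } break;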