mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-13 18:39:48 +00:00
Merge 'origin/master' into hipblas
This commit is contained in:
commit
fcbc262eb9
29 changed files with 1285 additions and 472 deletions
440
ggml.c
440
ggml.c
|
@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) {
|
|||
#define UNUSED(x) (void)(x)
|
||||
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
||||
|
||||
#define GGML_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
#include <Accelerate/Accelerate.h>
|
||||
#elif defined(GGML_USE_OPENBLAS)
|
||||
|
@ -330,7 +322,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
|
|||
// precomputed f32 table for f16 (256 KB)
|
||||
static float table_f32_f16[1 << 16];
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
|
||||
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
|
||||
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
|
||||
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
|
||||
|
@ -370,6 +362,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
|
|||
return GGML_FP32_TO_FP16(x);
|
||||
}
|
||||
|
||||
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
y[i] = GGML_FP16_TO_FP32(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
|
||||
size_t i = 0;
|
||||
#if defined(__F16C__)
|
||||
for (; i + 7 < n; i += 8) {
|
||||
__m256 x_vec = _mm256_loadu_ps(x + i);
|
||||
__m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm_storeu_si128((__m128i *)(y + i), y_vec);
|
||||
}
|
||||
for(; i + 3 < n; i += 4) {
|
||||
__m128 x_vec = _mm_loadu_ps(x + i);
|
||||
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
|
||||
_mm_storel_epi64((__m128i *)(y + i), y_vec);
|
||||
}
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
y[i] = GGML_FP32_TO_FP16(x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// timing
|
||||
//
|
||||
|
@ -1087,7 +1105,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
|
|||
const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
|
||||
const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
|
||||
const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
|
||||
const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15));
|
||||
const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
|
||||
|
||||
y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
|
||||
y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
|
||||
|
@ -1911,8 +1929,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
|
|||
const uint8_t vi = pp[l/2];
|
||||
|
||||
// extract the 5-th bit from qh
|
||||
const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
|
||||
const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
|
||||
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
|
||||
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
|
||||
|
||||
const int8_t vi0 = (vi & 0x0F) | vh0;
|
||||
const int8_t vi1 = (vi >> 4) | vh1;
|
||||
|
@ -1948,8 +1966,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in
|
|||
const uint8_t vi = pp[l/2];
|
||||
|
||||
// extract the 5-th bit from qh
|
||||
const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
|
||||
const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
|
||||
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
|
||||
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
|
||||
|
||||
const uint8_t vi0 = (vi & 0x0F) | vh0;
|
||||
const uint8_t vi1 = (vi >> 4) | vh1;
|
||||
|
@ -3180,6 +3198,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|||
}
|
||||
|
||||
*s = vaddvq_f32(sumv);
|
||||
#elif defined(__wasm_simd128__)
|
||||
v128_t sumv = wasm_f32x4_splat(0.0f);
|
||||
|
||||
uint64_t tmp[4];
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const block_q5_0 * restrict x0 = &x[i];
|
||||
const block_q8_0 * restrict y0 = &y[i];
|
||||
|
||||
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
||||
const v128_t s16b = wasm_i8x16_splat(0x10);
|
||||
|
||||
// extract the 5th bit
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x0->qh, sizeof(qh));
|
||||
|
||||
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
|
||||
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
|
||||
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
|
||||
tmp[3] = table_b2b_u[(qh >> 24) ];
|
||||
|
||||
const v128_t qhl = wasm_v128_load(tmp + 0);
|
||||
const v128_t qhh = wasm_v128_load(tmp + 2);
|
||||
|
||||
const v128_t v0 = wasm_v128_load(x0->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const v128_t v0l = wasm_v128_and (v0, m4b);
|
||||
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
||||
|
||||
// interleave
|
||||
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
|
||||
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
|
||||
|
||||
// add high bit and sub 16
|
||||
const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
|
||||
const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
|
||||
|
||||
// load y
|
||||
const v128_t v1l = wasm_v128_load(y0->qs);
|
||||
const v128_t v1h = wasm_v128_load(y0->qs + 16);
|
||||
|
||||
// int8x16 -> int16x8
|
||||
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
|
||||
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
||||
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
||||
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
||||
|
||||
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
|
||||
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
|
||||
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
||||
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
||||
|
||||
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
||||
|
||||
// dot product
|
||||
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
||||
wasm_i32x4_add(
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
||||
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
||||
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
|
||||
}
|
||||
|
||||
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
|
||||
#elif defined(__AVX2__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
@ -3220,8 +3304,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
|
|||
for (int j = 0; j < QK8_0/2; j++) {
|
||||
const uint8_t v0 = x0[j];
|
||||
|
||||
const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
|
||||
const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
|
||||
const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
|
||||
const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
|
||||
|
||||
const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
|
||||
const int x1_0 = ((v0 >> 4) | x1_0h) - 16;
|
||||
|
@ -3311,6 +3395,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|||
}
|
||||
|
||||
*s = vaddvq_f32(sumv) + summs;
|
||||
#elif defined(__wasm_simd128__)
|
||||
v128_t sumv = wasm_f32x4_splat(0.0f);
|
||||
|
||||
float summs = 0.0f;
|
||||
|
||||
uint64_t tmp[4];
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const block_q5_1 * restrict x0 = &x[i];
|
||||
const block_q8_1 * restrict y0 = &y[i];
|
||||
|
||||
summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
|
||||
|
||||
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
||||
|
||||
// extract the 5th bit
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x0->qh, sizeof(qh));
|
||||
|
||||
tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
|
||||
tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
|
||||
tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
|
||||
tmp[3] = table_b2b_u[(qh >> 24) ];
|
||||
|
||||
const v128_t qhl = wasm_v128_load(tmp + 0);
|
||||
const v128_t qhh = wasm_v128_load(tmp + 2);
|
||||
|
||||
const v128_t v0 = wasm_v128_load(x0->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const v128_t v0l = wasm_v128_and (v0, m4b);
|
||||
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
||||
|
||||
static bool x = true;
|
||||
|
||||
// interleave
|
||||
const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
|
||||
const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
|
||||
|
||||
// add high bit
|
||||
const v128_t v0lf = wasm_v128_or(v0lz, qhl);
|
||||
const v128_t v0hf = wasm_v128_or(v0hz, qhh);
|
||||
|
||||
// load y
|
||||
const v128_t v1l = wasm_v128_load(y0->qs);
|
||||
const v128_t v1h = wasm_v128_load(y0->qs + 16);
|
||||
|
||||
// int8x16 -> int16x8
|
||||
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
|
||||
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
||||
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
||||
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
||||
|
||||
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
|
||||
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
|
||||
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
||||
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
||||
|
||||
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
||||
|
||||
// dot product
|
||||
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
||||
wasm_i32x4_add(
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
||||
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
||||
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
|
||||
}
|
||||
|
||||
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
|
||||
#elif defined(__AVX2__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
@ -3354,8 +3509,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
|
|||
for (int j = 0; j < QK8_1/2; j++) {
|
||||
const uint8_t v0 = x0[j];
|
||||
|
||||
const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
|
||||
const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
|
||||
const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
|
||||
const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
|
||||
|
||||
const int x0_0 = (v0 & 0x0F) | x0_0h;
|
||||
const int x1_0 = (v0 >> 4) | x1_0h;
|
||||
|
@ -4057,6 +4212,27 @@ bool ggml_is_quantized(enum ggml_type type) {
|
|||
return GGML_IS_QUANTIZED[type];
|
||||
}
|
||||
|
||||
enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
||||
enum ggml_type wtype = GGML_TYPE_COUNT;
|
||||
|
||||
switch (ftype) {
|
||||
case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
|
||||
case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
|
||||
case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
|
||||
case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
|
||||
case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
|
||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||
}
|
||||
|
||||
GGML_ASSERT(wtype != GGML_TYPE_COUNT);
|
||||
|
||||
return wtype;
|
||||
}
|
||||
|
||||
static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
||||
return tensor->nb[0] > tensor->nb[1];
|
||||
}
|
||||
|
@ -4167,12 +4343,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
||||
}
|
||||
|
||||
// initialize cuBLAS
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
ggml_init_cublas();
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
ggml_cl_init();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
is_first_call = false;
|
||||
}
|
||||
|
@ -4253,7 +4428,7 @@ void ggml_free(struct ggml_context * ctx) {
|
|||
}
|
||||
|
||||
size_t ggml_used_mem(const struct ggml_context * ctx) {
|
||||
return ctx->objects_end->offs + ctx->objects_end->size;
|
||||
return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
|
||||
}
|
||||
|
||||
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
|
||||
|
@ -7943,7 +8118,7 @@ static void ggml_compute_forward_rms_norm(
|
|||
|
||||
// ggml_compute_forward_mul_mat
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
// helper function to determine if it is better to use BLAS or not
|
||||
// for large matrices, BLAS is faster
|
||||
static bool ggml_compute_forward_mul_mat_use_blas(
|
||||
|
@ -7959,12 +8134,9 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|||
const int64_t ne1 = dst->ne[1];
|
||||
|
||||
// TODO: find the optimal values for these
|
||||
if (
|
||||
#if !defined(GGML_USE_CUBLAS)
|
||||
ggml_is_contiguous(src0) &&
|
||||
if (ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
#endif
|
||||
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
|
||||
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
||||
|
||||
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
|
||||
return true;
|
||||
|
@ -7972,7 +8144,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|||
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static void ggml_compute_forward_mul_mat_f32(
|
||||
|
@ -7988,7 +8159,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
#endif
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
|
@ -8045,7 +8216,16 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
|
@ -8059,43 +8239,13 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
size_t x_size, y_size, d_size;
|
||||
float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
|
||||
float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
#endif
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
#if !defined(GGML_USE_CUBLAS)
|
||||
const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
#endif
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
// copy data to device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
|
||||
|
||||
// compute
|
||||
CUBLAS_CHECK(
|
||||
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha, d_X, ne00,
|
||||
d_Y, ne10,
|
||||
&beta, d_D, ne01));
|
||||
|
||||
// copy data to host
|
||||
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
|
@ -8112,12 +8262,6 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
#endif
|
||||
}
|
||||
}
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
|
||||
ggml_cuda_pool_free(d_X, x_size);
|
||||
ggml_cuda_pool_free(d_Y, y_size);
|
||||
ggml_cuda_pool_free(d_D, d_size);
|
||||
#endif
|
||||
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
||||
|
||||
return;
|
||||
|
@ -8247,7 +8391,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
|
@ -8263,37 +8416,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
size_t x_size, y_size, d_size;
|
||||
ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
|
||||
ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
#endif
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
// copy src0 while converting src1
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
|
||||
|
||||
// with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
|
||||
{
|
||||
size_t id = 0;
|
||||
for (int64_t i01 = 0; i01 < ne11; ++i01) {
|
||||
for (int64_t i00 = 0; i00 < ne10; ++i00) {
|
||||
wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
|
||||
}
|
||||
}
|
||||
|
||||
assert(id*sizeof(ggml_fp16_t) <= params->wsize);
|
||||
}
|
||||
#else
|
||||
float * const wdata = params->wdata;
|
||||
{
|
||||
size_t id = 0;
|
||||
|
@ -8305,28 +8429,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
|
||||
assert(id*sizeof(float) <= params->wsize);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
// copy data to device
|
||||
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
|
||||
|
||||
// compute
|
||||
CUBLAS_CHECK(
|
||||
cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha, d_X, CUDA_R_16F, ne00,
|
||||
d_Y, CUDA_R_16F, ne10,
|
||||
&beta, d_D, CUDA_R_32F, ne01,
|
||||
CUBLAS_COMPUTE_32F,
|
||||
CUBLAS_GEMM_DEFAULT));
|
||||
|
||||
// copy data to host
|
||||
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
|
@ -8355,12 +8459,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
}
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
|
||||
ggml_cuda_pool_free(d_X, x_size);
|
||||
ggml_cuda_pool_free(d_Y, y_size);
|
||||
ggml_cuda_pool_free(d_D, d_size);
|
||||
#endif
|
||||
/*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
|
||||
|
||||
return;
|
||||
|
@ -8513,7 +8611,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
|
@ -8527,25 +8634,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
size_t x_size, y_size, d_size, q_size;
|
||||
float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
|
||||
float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
void * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
|
||||
|
||||
const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
|
||||
GGML_ASSERT(dequantize_row_q_cuda != NULL);
|
||||
#else
|
||||
float * const wdata = params->wdata;
|
||||
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
|
||||
#endif
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
|
@ -8553,14 +8643,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
// copy and dequantize on device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2));
|
||||
|
||||
dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2));
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
|
||||
#else
|
||||
{
|
||||
|
@ -8576,24 +8659,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
const float * x = wdata;
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
// copy data to device
|
||||
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
|
||||
|
||||
// wait for dequantization
|
||||
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
|
||||
|
||||
// compute
|
||||
CUBLAS_CHECK(
|
||||
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
|
||||
ne01, ne11, ne10,
|
||||
&alpha, d_X, ne00,
|
||||
d_Y, ne10,
|
||||
&beta, d_D, ne01));
|
||||
|
||||
// copy data to host
|
||||
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
|
@ -8611,13 +8677,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
}
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
|
||||
ggml_cuda_pool_free(d_X, x_size);
|
||||
ggml_cuda_pool_free(d_Y, y_size);
|
||||
ggml_cuda_pool_free(d_D, d_size);
|
||||
ggml_cuda_pool_free(d_Q, q_size);
|
||||
#endif
|
||||
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
||||
|
||||
return;
|
||||
|
@ -11601,18 +11660,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
|
||||
size_t cur = 0;
|
||||
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
// with cuBLAS, we need memory for the full 3D / 4D data of src1
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
|
||||
#else
|
||||
// here we need memory just for single 2D matrix from src0
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
||||
#endif
|
||||
} else {
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
|
||||
}
|
||||
|
@ -11621,13 +11683,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
#endif
|
||||
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
||||
cur = 0;
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1;
|
||||
}
|
||||
#endif
|
||||
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
||||
|
@ -12899,8 +12961,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
|
|||
memcpy(&qh, &y[i].qh, sizeof(qh));
|
||||
|
||||
for (int l = 0; l < QK5_0; l += 2) {
|
||||
const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
|
||||
const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
|
||||
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
|
||||
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
|
||||
|
||||
// cast to 16 bins
|
||||
const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
|
||||
|
@ -12929,8 +12991,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
|
|||
memcpy(&qh, &y[i].qh, sizeof(qh));
|
||||
|
||||
for (int l = 0; l < QK5_1; l += 2) {
|
||||
const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
|
||||
const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
|
||||
const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
|
||||
const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
|
||||
|
||||
// cast to 16 bins
|
||||
const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue