From 3e5aa8a1c44051153d6d7b3eeca2f4b4e5fb310c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 30 Apr 2023 10:25:46 +0300 Subject: [PATCH 1/7] ggml : fix labels for GGML_OP_ALIBI --- ggml.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml.c b/ggml.c index 4d53b4628..50685f662 100644 --- a/ggml.c +++ b/ggml.c @@ -3827,6 +3827,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "DIAG_MASK_INF", "SOFT_MAX", "ROPE", + "ALIBI", "CONV_1D_1S", "CONV_1D_2S", @@ -3875,6 +3876,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "diag_mask_inf(x)", "soft_max(x)", "rope(x)", + "alibi(x)", "conv_1d_1s(x)", "conv_1d_2s(x)", From f0d70f147d969e41fa410b8af2965a27aa901eb9 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Sun, 30 Apr 2023 12:32:37 +0000 Subject: [PATCH 2/7] Various fixes to mat_mul benchmark (#1253) --- .gitignore | 2 +- Makefile | 8 ++--- examples/CMakeLists.txt | 1 + examples/benchmark/CMakeLists.txt | 4 +++ ...k-q4_0-matmult.c => benchmark-matmult.cpp} | 30 +++++++------------ 5 files changed, 20 insertions(+), 25 deletions(-) create mode 100644 examples/benchmark/CMakeLists.txt rename examples/benchmark/{benchmark-q4_0-matmult.c => benchmark-matmult.cpp} (92%) diff --git a/.gitignore b/.gitignore index 54dcebc4d..565866fd4 100644 --- a/.gitignore +++ b/.gitignore @@ -28,7 +28,7 @@ models/* /result /perplexity /embedding -/benchmark-q4_0-matmult +/benchmark-matmult /vdot /Pipfile diff --git a/Makefile b/Makefile index 4516e8556..6d89401c8 100644 --- a/Makefile +++ b/Makefile @@ -180,7 +180,7 @@ common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ clean: - rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult + rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) @@ -210,9 +210,9 @@ libllama.so: llama.o ggml.o $(OBJS) # Tests # -benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS) - ./benchmark-q4_0-matmult +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + ./$@ .PHONY: tests tests: diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index be35363f5..0973a3fa1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -35,4 +35,5 @@ else() add_subdirectory(perplexity) add_subdirectory(embedding) add_subdirectory(save-load-state) + add_subdirectory(benchmark) endif() diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt new file mode 100644 index 000000000..05deebcd1 --- /dev/null +++ b/examples/benchmark/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET benchmark) +add_executable(${TARGET} benchmark-matmult.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-matmult.cpp similarity index 92% rename from examples/benchmark/benchmark-q4_0-matmult.c rename to examples/benchmark/benchmark-matmult.cpp index 84b06766c..19cbab1c3 100644 --- a/examples/benchmark/benchmark-q4_0-matmult.c +++ b/examples/benchmark/benchmark-matmult.cpp @@ -1,11 +1,3 @@ -/* - License: MIT License - - Changelog: - - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel) - -*/ - #include #include "ggml.h" #include @@ -45,7 +37,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) { #define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN" -#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \ +#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5ld x %5ld x %5ld, nb = (%5li, %5li, %5li) - ", #TENSOR, \ TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\ TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \ { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); } @@ -98,12 +90,9 @@ int main(int argc, char ** argv) { } } - // create the ggml context printf("Starting Test\n"); - - struct ggml_context * ctx; //const int sizex = 4096; //const int sizey = 11008; @@ -125,16 +114,18 @@ int main(int argc, char ** argv) { #endif //printf("Memsize required = %i\n", sizex*sizex); - ggml_type wtype = GGML_TYPE_F32; size_t ctx_size = 0; - ctx_size += sizex*sizey*ggml_type_sizef(wtype); - ctx_size += sizex*sizey*ggml_type_sizef(wtype); ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizeof(float); - ctx_size += 1024*1024*100; + ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32); + ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0); + ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0); + ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS + ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS + ctx_size += 1024*1024*16; - printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024)); + printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024)); struct ggml_init_params params = { /*.mem_size =*/ ctx_size, @@ -217,7 +208,7 @@ int main(int argc, char ** argv) { const int dimz = sizez; long long int flops_per_dot_product = dimy + dimy; long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ; - printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); + printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000); // Let's use the F32 result from above as a reference for the q4_0 multiplication @@ -234,7 +225,6 @@ int main(int argc, char ** argv) { ggml_graph_compute(ctx, &gf31); long long int stop = ggml_time_us(); long long int usec = stop-start; - float sec = usec/1000000; float flops_per_usec = (1.0f*flops_per_matrix)/usec; printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n", i, From 6bc4400e67e6bc4faad3ad3d5e9d8a6576a9752d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 30 Apr 2023 19:07:00 +0300 Subject: [PATCH 3/7] ggml : add Q5 WASM SIMD + GGML_FTYPE --- ggml.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- ggml.h | 17 ++++++ 2 files changed, 177 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 50685f662..17c03ad40 100644 --- a/ggml.c +++ b/ggml.c @@ -330,7 +330,7 @@ static ggml_fp16_t table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) static float table_f32_f16[1 << 16]; -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(__wasm_simd128__) #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) @@ -1087,7 +1087,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id)); const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f)); const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf); - const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15)); + const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15)); y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4); y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4); @@ -3180,6 +3180,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } *s = vaddvq_f32(sumv); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint64_t tmp[4]; + + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + const v128_t s16b = wasm_i8x16_splat(0x10); + + // extract the 5th bit + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_u[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // interleave + const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + // add high bit and sub 16 + const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b); + const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -3311,6 +3377,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } *s = vaddvq_f32(sumv) + summs; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint64_t tmp[4]; + + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; + + summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_u[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + static bool x = true; + + // interleave + const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0lz, qhl); + const v128_t v0hf = wasm_v128_or(v0hz, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -4057,6 +4194,27 @@ bool ggml_is_quantized(enum ggml_type type) { return GGML_IS_QUANTIZED[type]; } +enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { + enum ggml_type wtype = GGML_TYPE_COUNT; + + switch (ftype) { + case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; + case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; + case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break; + case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; + } + + GGML_ASSERT(wtype != GGML_TYPE_COUNT); + + return wtype; +} + static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } diff --git a/ggml.h b/ggml.h index c1c5495c6..d6feacd78 100644 --- a/ggml.h +++ b/ggml.h @@ -232,6 +232,20 @@ extern "C" { GGML_TYPE_COUNT, }; + // model file types + enum ggml_ftype { + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + }; + // available tensor operations: enum ggml_op { GGML_OP_NONE = 0, @@ -385,6 +399,9 @@ extern "C" { GGML_API bool ggml_is_quantized(enum ggml_type type); + // TODO: temporary until model loading of ggml examples is refactored + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); From 76a884920aa1d2fc0dc7a7ac12dfc5ec5816377c Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sun, 30 Apr 2023 20:34:52 +0200 Subject: [PATCH 4/7] ggml : add CLBlast q5_0, q5_1, q8_0 dequant kernels (#1225) * Implement q5_0, q5_1 and q8_0 * Work around q5_0 OpenCL issue * Fix q8_0 dequant kernel * Move cl kernels into ggml-opencl.c * Use two memcpy calls for q5_0 buffer transfer --- ggml-opencl-dequant.cl | 63 ------------ ggml-opencl.c | 220 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 205 insertions(+), 78 deletions(-) delete mode 100644 ggml-opencl-dequant.cl diff --git a/ggml-opencl-dequant.cl b/ggml-opencl-dequant.cl deleted file mode 100644 index a65a79f4d..000000000 --- a/ggml-opencl-dequant.cl +++ /dev/null @@ -1,63 +0,0 @@ -#define MULTILINE_QUOTE(...) #__VA_ARGS__ -const char * clblast_dequant = MULTILINE_QUOTE( - -struct block_q4_0 -{ - float d; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -struct block_q4_1 -{ - float d; - float m; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - const float m = blocks[i].m; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = (vi & 0xf) * d + m; - result[index + 1] = (vi >> 4) * d + m; -} - -struct block_q4_2 -{ - ushort d; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &blocks[i].d);; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -); diff --git a/ggml-opencl.c b/ggml-opencl.c index b748f86b7..4389eca39 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -3,12 +3,141 @@ #define CL_TARGET_OPENCL_VERSION 110 #include +#include #include #include #include "ggml.h" -#include "ggml-opencl-dequant.cl" +#define MULTILINE_QUOTE(...) #__VA_ARGS__ +const char * clblast_dequant = MULTILINE_QUOTE( + +struct block_q4_0 +{ + float d; + uchar qs[16]; +}; + +__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*32 + l*2; + result[index + 0] = ((vi & 0xf) - 8)*d; + result[index + 1] = ((vi >> 4) - 8)*d; +} + +struct block_q4_1 +{ + float d; + float m; + uchar qs[16]; +}; + +__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + const float m = blocks[i].m; + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*32 + l*2; + result[index + 0] = (vi & 0xf) * d + m; + result[index + 1] = (vi >> 4) * d + m; +} + +struct block_q4_2 +{ + ushort d; + uchar qs[8]; +}; + +__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { + const uint i = get_global_id(0) / 16; + const uint l = get_local_id(0); + + const float d = vload_half(0, (__global half*) &blocks[i].d); + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*16 + l*2; + result[index + 0] = ((vi & 0xf) - 8)*d; + result[index + 1] = ((vi >> 4) - 8)*d; +} + + +struct block_q5_0 +{ + float d; + uint qh; + uchar qs[16]; +}; + +__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + + const uchar vi = blocks[i].qs[l]; + + const uint l2 = l * 2; + + const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; + const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; + + const uint index = i*32 + l2; + result[index + 0] = (((vi & 0xf) | vh0) - 16)*d; + result[index + 1] = (((vi >> 4) | vh1) - 16)*d; +} + +struct block_q5_1 +{ + ushort d; + ushort m; + uint qh; + uchar qs[16]; +}; + +__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = vload_half(0, (__global half*) &blocks[i].d); + const float m = vload_half(0, (__global half*) &blocks[i].m); + + const uchar vi = blocks[i].qs[l]; + + const uint l2 = l * 2; + + const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; + const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; + + const uint index = i*32 + l2; + result[index + 0] = ((vi & 0xf) | vh0)*d + m; + result[index + 1] = ((vi >> 4) | vh1)*d + m; +} + +struct block_q8_0 +{ + float d; + char qs[32]; +}; + +__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; +} + +); #define CL_CHECK(err, name) \ do { \ @@ -19,12 +148,26 @@ } \ } while (0) +#define QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; + + +typedef struct { + float d; // delta + uint32_t qh; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} cl_block_q5_0; + static cl_platform_id platform; static cl_device_id device; static cl_context context; static cl_command_queue queue; static cl_program program; -static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2; +static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0; static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; @@ -97,6 +240,12 @@ void ggml_cl_init(void) { CL_CHECK(err, "clCreateKernel"); kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); CL_CHECK(err, "clCreateKernel"); + kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err); + CL_CHECK(err, "clCreateKernel"); + kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err); + CL_CHECK(err, "clCreateKernel"); + kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err); + CL_CHECK(err, "clCreateKernel"); } static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { @@ -125,6 +274,7 @@ void ggml_cl_sgemm_wrapper( cl_kernel kernel; size_t global = n * k, local, size_qb; bool dequant; + cl_block_q5_0* cl_host_b; switch (btype) { case GGML_TYPE_F32: @@ -146,7 +296,36 @@ void ggml_cl_sgemm_wrapper( dequant = true; kernel = kernel_q4_2; local = 8; - size_qb = global * (sizeof(short) + local) / 16; + size_qb = global * (sizeof(ggml_fp16_t) + local) / 16; + break; + case GGML_TYPE_Q5_0: + dequant = true; + kernel = kernel_q5_0; + local = 16; + // For some reason OpenCL seems to be incapable of working with structs of size 22. + // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU... + // TODO Find the reason, fix and remove workaround. + const block_q5_0* b = (const block_q5_0*) host_b; + cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32); + for (size_t i = 0; i < global / 32; i++) { + cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d); + memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t)); + memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2); + } + host_b = (const float*) cl_host_b; + size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32; + break; + case GGML_TYPE_Q5_1: + dequant = true; + kernel = kernel_q5_1; + local = 16; + size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; + break; + case GGML_TYPE_Q8_0: + dequant = true; + kernel = kernel_q8_0; + local = 32; + size_qb = global * (sizeof(float) + local) / 32; break; default: fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); @@ -171,12 +350,15 @@ void ggml_cl_sgemm_wrapper( err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); CL_CHECK(err, "clSetKernelArg"); - clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); + err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); + CL_CHECK(err, "clEnqueueWriteBuffer qb"); } else { - clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); + err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); + CL_CHECK(err, "clEnqueueWriteBuffer b"); } - clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); + err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); + CL_CHECK(err, "clEnqueueWriteBuffer a"); if (dequant) { err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); CL_CHECK(err, "clEnqueueNDRangeKernel"); @@ -188,15 +370,20 @@ void ggml_cl_sgemm_wrapper( clReleaseEvent(ev_b); cl_event ev_sgemm; - CLBlastSgemm((CLBlastLayout)order, - (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, - m, n, k, - alpha, - cl_buffer_a, 0, lda, - cl_buffer_b, 0, ldb, - beta, - cl_buffer_c, 0, ldc, - &queue, &ev_sgemm); + CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order, + (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, + m, n, k, + alpha, + cl_buffer_a, 0, lda, + cl_buffer_b, 0, ldb, + beta, + cl_buffer_c, 0, ldc, + &queue, &ev_sgemm); + + if (status != CLBlastSuccess) { + fprintf(stderr, "Error: CLBlast SGEMM %d\n", status); + abort(); + } cl_event ev_c; clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); @@ -205,4 +392,7 @@ void ggml_cl_sgemm_wrapper( clWaitForEvents(1, &ev_c); clReleaseEvent(ev_sgemm); clReleaseEvent(ev_c); + if (btype == GGML_TYPE_Q5_0) { + free((void*) cl_host_b); + } } From a5d30b1f53677cb50791fec41c43e93274347303 Mon Sep 17 00:00:00 2001 From: jon-chuang <9093549+jon-chuang@users.noreply.github.com> Date: Sun, 30 Apr 2023 14:41:35 -0400 Subject: [PATCH 5/7] common : better default number of threads (#934) * commit * fix * try-catch * apply code review * improve * improve * add macos headers * done * remove color * fix windows * minor * fix * Apply suggestions from code review Co-authored-by: DannyDaemonic * remove * minor * minor --------- Co-authored-by: jon-chuang Co-authored-by: DannyDaemonic --- examples/common.cpp | 49 ++++++++++++++++++++++++++++++++++++--------- examples/common.h | 5 +++-- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 6c712c713..ad7b0bba3 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -1,13 +1,18 @@ #include "common.h" #include +#include #include #include #include #include #include #include -#include + +#if defined(__APPLE__) && defined(__MACH__) +#include +#include +#endif #if defined (_WIN32) #include @@ -25,19 +30,43 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int #define CP_UTF8 65001 #endif -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - // determine sensible default number of threads. - // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. +int32_t get_num_physical_cores() { #ifdef __linux__ std::ifstream cpuinfo("/proc/cpuinfo"); - params.n_threads = std::count(std::istream_iterator(cpuinfo), - std::istream_iterator(), - std::string("processor")); -#endif - if (params.n_threads == 0) { - params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); + std::string line; + while (std::getline(cpuinfo, line)) { + std::size_t pos = line.find("cpu cores"); + if (pos != std::string::npos) { + pos = line.find(": ", pos); + if (pos != std::string::npos) { + try { + // Extract the number and return it + return static_cast(std::stoul(line.substr(pos + 2))); + } catch (const std::invalid_argument &) { + // Ignore if we could not parse + } + } + } } +#elif defined(__APPLE__) && defined(__MACH__) + int32_t num_physical_cores; + size_t len = sizeof(num_physical_cores); + int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } + result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } +#elif defined(_WIN32) + //TODO: Implement +#endif + unsigned int n_threads = std::thread::hardware_concurrency(); + return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; +} +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; gpt_params default_params; diff --git a/examples/common.h b/examples/common.h index fce1d42a9..627696e30 100644 --- a/examples/common.h +++ b/examples/common.h @@ -13,11 +13,12 @@ // // CLI argument parsing // +int32_t get_num_physical_cores(); struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = -1; // new tokens to predict + int32_t n_threads = get_num_physical_cores(); + int32_t n_predict = -1; // new tokens to predict int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) From 6f796992869f306c48484d62a39f2a181ae2fd6f Mon Sep 17 00:00:00 2001 From: Pavol Rusnak Date: Sun, 30 Apr 2023 20:48:38 +0200 Subject: [PATCH 6/7] build: add armv{6,7,8} support to cmake (#1251) - flags copied from Makefile - updated comments in both CMakeLists.txt and Makefile to match reality --- CMakeLists.txt | 15 ++++++++++++++- Makefile | 8 +++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bbf599559..098306126 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,9 +258,22 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES # TODO: arm msvc? else() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + # Apple M1, M2, etc. + # Raspberry Pi 3, 4, Zero 2 (64-bit) add_compile_options(-mcpu=native) endif() - # TODO: armv6,7,8 version specific flags + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + # Raspberry Pi 2 + add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Raspberry Pi 3, 4, Zero 2 (32-bit) + add_compile_options(-mfp16-format=ieee -mno-unaligned-access) + endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") message(STATUS "x86 detected") diff --git a/Makefile b/Makefile index 6d89401c8..1d62a4438 100644 --- a/Makefile +++ b/Makefile @@ -135,19 +135,21 @@ ifdef LLAMA_PERF CXXFLAGS += -DGGML_PERF endif ifneq ($(filter aarch64%,$(UNAME_M)),) + # Apple M1, M2, etc. + # Raspberry Pi 3, 4, Zero 2 (64-bit) CFLAGS += -mcpu=native CXXFLAGS += -mcpu=native endif ifneq ($(filter armv6%,$(UNAME_M)),) - # Raspberry Pi 1, 2, 3 + # Raspberry Pi 1, Zero CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access endif ifneq ($(filter armv7%,$(UNAME_M)),) - # Raspberry Pi 4 + # Raspberry Pi 2 CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations endif ifneq ($(filter armv8%,$(UNAME_M)),) - # Raspberry Pi 4 + # Raspberry Pi 3, 4, Zero 2 (32-bit) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif From 7ff0dcd32091c703a12adb0c57c32c565ce17664 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 30 Apr 2023 22:28:51 +0300 Subject: [PATCH 7/7] ggml : fix UB (int << 31) --- ggml.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml.c b/ggml.c index 17c03ad40..8cc48344e 100644 --- a/ggml.c +++ b/ggml.c @@ -1911,8 +1911,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in const uint8_t vi = pp[l/2]; // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; const int8_t vi0 = (vi & 0x0F) | vh0; const int8_t vi1 = (vi >> 4) | vh1; @@ -1948,8 +1948,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in const uint8_t vi = pp[l/2]; // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; const uint8_t vi0 = (vi & 0x0F) | vh0; const uint8_t vi1 = (vi >> 4) | vh1; @@ -3286,8 +3286,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * for (int j = 0; j < QK8_0/2; j++) { const uint8_t v0 = x0[j]; - const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4; + const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; + const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16; const int x1_0 = ((v0 >> 4) | x1_0h) - 16; @@ -3491,8 +3491,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * for (int j = 0; j < QK8_1/2; j++) { const uint8_t v0 = x0[j]; - const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4; + const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; + const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; const int x0_0 = (v0 & 0x0F) | x0_0h; const int x1_0 = (v0 >> 4) | x1_0h; @@ -13057,8 +13057,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int l = 0; l < QK5_0; l += 2) { - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; // cast to 16 bins const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2; @@ -13087,8 +13087,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int l = 0; l < QK5_1; l += 2) { - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; // cast to 16 bins const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;