From 0d52a69e4bf0d6181beec7853307bdcdeec9905b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 8 Jan 2025 11:29:34 +0200 Subject: [PATCH 01/12] ci : fix cmake option (#11125) --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 602cf5220..02a193b86 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -665,7 +665,7 @@ jobs: - build: 'llvm-arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - build: 'msvc-arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=O' + defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - build: 'llvm-arm64-opencl-adreno' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' From 8cef75c743ba13ebbd6d380c531200c768a8b8aa Mon Sep 17 00:00:00 2001 From: amritahs-ibm Date: Wed, 8 Jan 2025 16:24:19 +0530 Subject: [PATCH 02/12] llamafile : ppc64le MMA INT8 implementation (#10912) This change upstreams llamafile's cpu matrix multiplication kernels for ppc64le using MMA builtins for quantised int8 datatype. This change results in 10% - 70% improvement in total speed(ie all tokens/total time), across various batch sizes. The patch is tested with Meta-Lllama-3-8B, Mistral-7B, Llama-2-7B-chat-hf models on a IBM POWER10 machine. Signed-off-by: Amrita H S --- ggml/src/ggml-cpu/llamafile/sgemm.cpp | 836 ++++++++++++++++++++++++-- 1 file changed, 770 insertions(+), 66 deletions(-) diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 8fce576c3..c22a66287 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -54,6 +54,7 @@ #include "ggml-quants.h" #include +#include #ifdef _MSC_VER #define NOINLINE __declspec(noinline) @@ -1051,6 +1052,704 @@ class tinyBLAS_Q0_AVX { } \ } \ +template +class tinyBLAS_Q0_PPC { + public: + tinyBLAS_Q0_PPC(int64_t k, + const TA *A, int64_t lda, + const TB *B, int64_t ldb, + TC *C, int64_t ldc, + int ith, int nth) + : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + } + + void matmul(int64_t m, int64_t n) { + mnpack(0, m, 0, n); + } + + private: + + template + inline void save_res(int ii, int jj, int idx, vector float* fin_res) { + for (int I = 0; I < RM; I++) { + for (int J = 0; J < RN; J++) { + *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J); + } + } + } + + template + inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array& comparray, vector float* vs, vector float* fin_res) { + vector signed int vec_C[4]; + vector float CA[4] = {0}; + vector float res[4] = {0}; + __builtin_mma_disassemble_acc(vec_C, ACC); + for (int i = 0; i < 4; i++) { + CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0)); + res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]); + fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]); + } + } + + template + void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) { + int64_t i, j; + TA *aoffset = NULL; + VA *vecOffset = NULL; + TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; + TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; + __vector_pair C1, C2, C3, C4, C5, C6, C7, C8; + VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0}; + VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0}; + VB t1, t2, t3, t4, t5, t6, t7, t8; + vector unsigned char xor_vector; + uint8_t flip_vec = 0x80; + xor_vector = vec_splats(flip_vec); + vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}; + vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}; + vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}; + vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + + aoffset = const_cast(a); + vecOffset = vec; + j = (rows >> 3); + if (j > 0) { + do { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (cols >> 3); + if (i > 0) { + do { + C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs); + C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs); + C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs); + C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs); + C5 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset5->qs); + C6 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset6->qs); + C7 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset7->qs); + C8 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset8->qs); + + __builtin_vsx_disassemble_pair(c1, &C1); + __builtin_vsx_disassemble_pair(c2, &C2); + __builtin_vsx_disassemble_pair(c3, &C3); + __builtin_vsx_disassemble_pair(c4, &C4); + __builtin_vsx_disassemble_pair(c5, &C5); + __builtin_vsx_disassemble_pair(c6, &C6); + __builtin_vsx_disassemble_pair(c7, &C7); + __builtin_vsx_disassemble_pair(c8, &C8); + + t1 = vec_perm(c1[0], c2[0], swiz1); + t2 = vec_perm(c1[0], c2[0], swiz2); + t3 = vec_perm(c3[0], c4[0], swiz1); + t4 = vec_perm(c3[0], c4[0], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset); + vec_xst(t6, 0, vecOffset+16); + vec_xst(t7, 0, vecOffset+32); + vec_xst(t8, 0, vecOffset+48); + + t1 = vec_perm(c1[1], c2[1], swiz1); + t2 = vec_perm(c1[1], c2[1], swiz2); + t3 = vec_perm(c3[1], c4[1], swiz1); + t4 = vec_perm(c3[1], c4[1], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+64); + vec_xst(t6, 0, vecOffset+80); + vec_xst(t7, 0, vecOffset+96); + vec_xst(t8, 0, vecOffset+112); + + t1 = vec_perm(c5[0], c6[0], swiz1); + t2 = vec_perm(c5[0], c6[0], swiz2); + t3 = vec_perm(c7[0], c8[0], swiz1); + t4 = vec_perm(c7[0], c8[0], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+128); + vec_xst(t6, 0, vecOffset+144); + vec_xst(t7, 0, vecOffset+160); + vec_xst(t8, 0, vecOffset+176); + + t1 = vec_perm(c5[1], c6[1], swiz1); + t2 = vec_perm(c5[1], c6[1], swiz2); + t3 = vec_perm(c7[1], c8[1], swiz1); + t4 = vec_perm(c7[1], c8[1], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+192); + vec_xst(t6, 0, vecOffset+208); + vec_xst(t7, 0, vecOffset+224); + vec_xst(t8, 0, vecOffset+240); + + aoffset1 += lda; + aoffset2 += lda; + aoffset3 += lda; + aoffset4 += lda; + aoffset5 += lda; + aoffset6 += lda; + aoffset7 += lda; + aoffset8 += lda; + vecOffset += 256; + i--; + } while(i > 0); + } + j--; + } while(j > 0); + } + + if (rows & 4) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (cols >> 3); + if (i > 0) { + do { + C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs); + C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs); + C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs); + C4 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset4->qs); + + __builtin_vsx_disassemble_pair(c1, &C1); + __builtin_vsx_disassemble_pair(c2, &C2); + __builtin_vsx_disassemble_pair(c3, &C3); + __builtin_vsx_disassemble_pair(c4, &C4); + + t1 = vec_perm(c1[0], c2[0], swiz1); + t2 = vec_perm(c1[0], c2[0], swiz2); + t3 = vec_perm(c3[0], c4[0], swiz1); + t4 = vec_perm(c3[0], c4[0], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset); + vec_xst(t6, 0, vecOffset+16); + vec_xst(t7, 0, vecOffset+32); + vec_xst(t8, 0, vecOffset+48); + + t1 = vec_perm(c1[1], c2[1], swiz1); + t2 = vec_perm(c1[1], c2[1], swiz2); + t3 = vec_perm(c3[1], c4[1], swiz1); + t4 = vec_perm(c3[1], c4[1], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+64); + vec_xst(t6, 0, vecOffset+80); + vec_xst(t7, 0, vecOffset+96); + vec_xst(t8, 0, vecOffset+112); + + aoffset1 += lda; + aoffset2 += lda; + aoffset3 += lda; + aoffset4 += lda; + vecOffset += 128; + i--; + } while(i > 0); + } + } + if (rows & 3) { + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + i = (cols >> 3); + if (i > 0) { + do { + switch(rows) { + case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs); + __builtin_vsx_disassemble_pair(c3, &C3); + case 2: C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs); + __builtin_vsx_disassemble_pair(c2, &C2); + case 1: C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs); + __builtin_vsx_disassemble_pair(c1, &C1); + break; + } + t1 = vec_perm(c1[0], c2[0], swiz1); + t2 = vec_perm(c1[0], c2[0], swiz2); + t3 = vec_perm(c3[0], c4[0], swiz1); + t4 = vec_perm(c3[0], c4[0], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset); + vec_xst(t6, 0, vecOffset+16); + vec_xst(t7, 0, vecOffset+32); + vec_xst(t8, 0, vecOffset+48); + + t1 = vec_perm(c1[1], c2[1], swiz1); + t2 = vec_perm(c1[1], c2[1], swiz2); + t3 = vec_perm(c3[1], c4[1], swiz1); + t4 = vec_perm(c3[1], c4[1], swiz2); + t5 = vec_perm(t1, t3, swiz3); + t6 = vec_perm(t1, t3, swiz4); + t7 = vec_perm(t2, t4, swiz3); + t8 = vec_perm(t2, t4, swiz4); + if (flip == true) { + t5 = vec_xor(t5, xor_vector); + t6 = vec_xor(t6, xor_vector); + t7 = vec_xor(t7, xor_vector); + t8 = vec_xor(t8, xor_vector); + } + vec_xst(t5, 0, vecOffset+64); + vec_xst(t6, 0, vecOffset+80); + vec_xst(t7, 0, vecOffset+96); + vec_xst(t8, 0, vecOffset+112); + + aoffset1 += lda; + aoffset2 += lda; + aoffset3 += lda; + vecOffset += 128; + i--; + } while(i > 0); + } + } + } + + void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + int m_rem = MIN(m - m0, 8); + int n_rem = MIN(n - n0, 8); + // TO-DO: KERNEL_16x8 and KERNEL_8x16 are having some performance + // issues. After resolving them, below code will be enabled. + /*if (m_rem >= 16 && n_rem >= 8) { + mc = 16; + nc = 8; + gemm<16,8>(m0, m, n0, n); + } else if(m_rem >= 8 && n_rem >= 16) { + mc = 8; + nc = 16; + gemm<8,16>(m0, m, n0, n); + }*/ + if (m_rem >= 8 && n_rem >= 8) { + mc = 8; + nc = 8; + gemm<8,8>(m0, m, n0, n); + } else if (m_rem >= 4 && n_rem >= 8) { + mc = 4; + nc = 8; + gemm<4,8>(m0, m, n0, n); + } else if (m_rem >= 8 && n_rem >= 4) { + mc = 8; + nc = 4; + gemm<8,4>(m0, m, n0, n); + } else if (m_rem >= 4 && n_rem >= 4) { + mc = 4; + nc = 4; + gemm_small<4, 4>(m0, m, n0, n); + } else if ((m_rem < 4) && (n_rem > 4)) { + nc = 4; + switch(m_rem) { + case 1: + mc = 1; + gemm_small<1, 4>(m0, m, n0, n); + break; + case 2: + mc = 2; + gemm_small<2, 4>(m0, m, n0, n); + break; + case 3: + mc = 3; + gemm_small<3, 4>(m0, m, n0, n); + break; + default: + return; + } + } else if ((m_rem > 4) && (n_rem < 4)) { + mc = 4; + switch(n_rem) { + case 1: + nc = 1; + gemm_small<4, 1>(m0, m, n0, n); + break; + case 2: + nc = 2; + gemm_small<4, 2>(m0, m, n0, n); + break; + case 3: + nc = 3; + gemm_small<4, 3>(m0, m, n0, n); + break; + default: + return; + } + } else { + switch((m_rem << 4) | n_rem) { + case 0x43: + mc = 4; + nc = 3; + gemm_small<4, 3>(m0, m, n0, n); + break; + case 0x42: + mc = 4; + nc = 2; + gemm_small<4, 2>(m0, m, n0, n); + break; + case 0x41: + mc = 4; + nc = 1; + gemm_small<4, 1>(m0, m, n0, n); + break; + case 0x34: + mc = 3; + nc = 4; + gemm_small<3, 4>(m0, m, n0, n); + break; + case 0x33: + mc = 3; + nc = 3; + gemm_small<3, 3>(m0, m, n0, n); + break; + case 0x32: + mc = 3; + nc = 2; + gemm_small<3, 2>(m0, m, n0, n); + break; + case 0x31: + mc = 3; + nc = 1; + gemm_small<3, 1>(m0, m, n0, n); + break; + case 0x24: + mc = 2; + nc = 4; + gemm_small<2, 4>(m0, m, n0, n); + break; + case 0x23: + mc = 2; + nc = 3; + gemm_small<2, 3>(m0, m, n0, n); + break; + case 0x22: + mc = 2; + nc = 2; + gemm_small<2, 2>(m0, m, n0, n); + break; + case 0x21: + mc = 2; + nc = 1; + gemm_small<2, 1>(m0, m, n0, n); + break; + case 0x14: + mc = 1; + nc = 4; + gemm_small<1, 4>(m0, m, n0, n); + break; + case 0x13: + mc = 1; + nc = 3; + gemm_small<1, 3>(m0, m, n0, n); + break; + case 0x12: + mc = 1; + nc = 2; + gemm_small<1, 2>(m0, m, n0, n); + break; + case 0x11: + mc = 1; + nc = 1; + gemm_small<1, 1>(m0, m, n0, n); + break; + default: + return; + } + } + mp = m0 + (m - m0) / mc * mc; + np = n0 + (n - n0) / nc * nc; + mnpack(mp, m, n0, np); + mnpack(m0, m, np, n); + } + + void KERNEL_4x8(int64_t ii, int64_t jj) { + vec_t vec_A[8], vec_B[16] = {0}; + acc_t acc_0, acc_1; + std::array comparray; + vector float fin_res[8] = {0}; + vector float vs[8] = {0}; + for (int l = 0; l < k; l++) { + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + packNormal((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false); + packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true); + for(int x = 0; x < 8; x++) { + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x], vec_B[x+8]); + } + for (int I = 0; I<4; I++) { + for (int J = 0; J<4; J++) { + *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d)); + *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d)); + } + } + auto aoffset = A+(ii*lda)+l; + for (int i = 0; i < 4; i++) { + comparray[i] = 0; + int ca = 0; + const int8_t *at = aoffset->qs; + for (int j = 0; j < 32; j++) + ca += (int)*at++; + comparray[i] = ca; + aoffset += lda; + } + compute<4>(&acc_0, 0, 0, comparray, vs, fin_res); + compute<4>(&acc_1, 0, 4, comparray, vs, fin_res); + } + save_res<4, 4>(ii, jj, 0, fin_res); + save_res<4, 4>(ii, jj+4, 4, fin_res); + } + + void KERNEL_8x4(int64_t ii, int64_t jj) { + vec_t vec_A[16], vec_B[8] = {0}; + acc_t acc_0, acc_1; + std::array comparray; + vector float fin_res[8] = {0}; + vector float vs[8] = {0}; + for (int l = 0; l < k; l++) { + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + packNormal((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false); + packNormal((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true); + for(int x = 0; x < 8; x++) { + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]); + } + for (int I = 0; I<8; I++) { + for (int J = 0; J<4; J++) { + *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d)); + } + } + auto aoffset = A+(ii*lda)+l; + for (int i = 0; i < 8; i++) { + comparray[i] = 0; + int ca = 0; + const int8_t *at = aoffset->qs; + for (int j = 0; j < 32; j++) + ca += (int)*at++; + comparray[i] = ca; + aoffset += lda; + } + compute<8>(&acc_0, 0, 0, comparray, vs, fin_res); + compute<8>(&acc_1, 4, 4, comparray, vs, fin_res); + } + save_res<4, 4>(ii, jj, 0, fin_res); + save_res<4, 4>(ii+4, jj, 4, fin_res); + } + + void KERNEL_8x8(int64_t ii, int64_t jj) { + vec_t vec_A[16], vec_B[16] = {0}; + acc_t acc_0, acc_1, acc_2, acc_3; + std::array comparray; + vector float fin_res[16] = {0}; + vector float vs[16] = {0}; + for (int l = 0; l < k; l++) { + __builtin_mma_xxsetaccz(&acc_0); + __builtin_mma_xxsetaccz(&acc_1); + __builtin_mma_xxsetaccz(&acc_2); + __builtin_mma_xxsetaccz(&acc_3); + packNormal((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false); + packNormal((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true); + for(int x = 0; x < 8; x++) { + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_1, vec_A[x+8], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_2, vec_A[x], vec_B[x+8]); + __builtin_mma_xvi8ger4pp(&acc_3, vec_A[x+8], vec_B[x+8]); + } + for (int I = 0; I<8; I++) { + for (int J = 0; J<4; J++) { + *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d)); + *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d)); + } + } + auto aoffset = A+(ii*lda)+l; + for (int i = 0; i < 8; i++) { + comparray[i] = 0; + int ca = 0; + const int8_t *at = aoffset->qs; + for (int j = 0; j < 32; j++) + ca += (int)*at++; + comparray[i] = ca; + aoffset += lda; + } + compute<8>(&acc_0, 0, 0, comparray, vs, fin_res); + compute<8>(&acc_1, 4, 4, comparray, vs, fin_res); + compute<8>(&acc_2, 0, 8, comparray, vs, fin_res); + compute<8>(&acc_3, 4, 12, comparray, vs, fin_res); + } + save_res<4, 4>(ii, jj, 0, fin_res); + save_res<4, 4>(ii+4, jj, 4, fin_res); + save_res<4, 4>(ii, jj+4, 8, fin_res); + save_res<4, 4>(ii+4, jj+4, 12, fin_res); + } + + template + void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + vec_t vec_A[8], vec_B[8] = {0}; + vector signed int vec_C[4]; + acc_t acc_0; + + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + std::array comparray; + vector float res[4] = {0}; + vector float fin_res[4] = {0}; + vector float vs[4] = {0}; + vector float CA[4] = {0}; + __builtin_prefetch((A+(ii*lda)+0)->qs, 0, 1); // prefetch first value + __builtin_prefetch((B+(jj*ldb)+0)->qs, 0, 1); // prefetch first value + for (int l = 0; l < k; l++) { + __builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead + __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead + __builtin_mma_xxsetaccz(&acc_0); + packNormal((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false); + packNormal((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true); + for(int x = 0; x < 8; x+=4) { + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]); + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+1], vec_B[x+1]); + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+2], vec_B[x+2]); + __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x+3], vec_B[x+3]); + } + for (int I = 0; Id) * unhalf((B+((jj+J)*ldb)+l)->d)); + } + } + __builtin_mma_disassemble_acc(vec_C, &acc_0); + auto aoffset = A+(ii*lda)+l; + for (int i = 0; i < RM; i++) { + comparray[i] = 0; + int ca = 0; + const int8_t *at = aoffset->qs; + for (int j = 0; j < 32; j++) + ca += (int)*at++; + comparray[i] = ca; + aoffset += lda; + } + + for (int i = 0; i < RM; i++) { + CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0)); + res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]); + fin_res[i] = vec_madd(res[i], vs[i], fin_res[i]); + } + } + save_res(ii, jj, 0, fin_res); + } + } + + template + inline void kernel(int64_t ii, int64_t jj) { + if constexpr(RM == 4 && RN == 8) { + KERNEL_4x8(ii,jj); + } else if constexpr(RM == 8 && RN == 4) { + KERNEL_8x4(ii,jj); + } else if constexpr(RM == 8 && RN == 8) { + KERNEL_8x8(ii,jj); + } else { + static_assert(false, "RN/RM values not supported"); + } + } + + template + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + kernel(ii, jj); + } + } + + const TA *const A; + const TB *const B; + TC *C; + TA *At; + TB *Bt; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; + const int ith; + const int nth; +}; + template class tinyBLAS_PPC { public: @@ -1070,13 +1769,17 @@ class tinyBLAS_PPC { void (tinyBLAS_PPC::*kernel)(int64_t, int64_t); - void READ_BLOCK(const float* a, int64_t lda, int rows, int cols, float* vec) { + template + void packTranspose(const TA* a, int64_t lda, int rows, int cols, TA* vec) { int64_t i, j; - float *aoffset = NULL, *boffset = NULL; - float *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; - float *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; - - aoffset = const_cast(a); + TA *aoffset = NULL, *boffset = NULL; + TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL; + TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL; + __vector_pair C1, C2, C3, C4, C5, C6, C7, C8; + VA c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0}; + VA c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0}; + VA t1, t2, t3, t4, t5, t6, t7, t8; + aoffset = const_cast(a); boffset = vec; j = (rows >> 3); if (j > 0) { @@ -1092,9 +1795,6 @@ class tinyBLAS_PPC { aoffset += 8 * lda; i = (cols >> 3); if (i > 0) { - __vector_pair C1, C2, C3, C4, C5, C6, C7, C8; - vector float c1[2], c2[2], c3[2], c4[2], c5[2], c6[2], c7[2], c8[2]; - vector float t1, t2, t3, t4, t5, t6, t7, t8; do { C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1); C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2); @@ -1174,21 +1874,19 @@ class tinyBLAS_PPC { } while(i > 0); } if (cols & 4) { - vector float c1, c2, c3, c4, c5, c6, c7, c8; - vector float t1, t2, t3, t4, t5, t6, t7, t8; - c1 = vec_xl(0, aoffset1); - c2 = vec_xl(0, aoffset2); - c3 = vec_xl(0, aoffset3); - c4 = vec_xl(0, aoffset4); - c5 = vec_xl(0, aoffset5); - c6 = vec_xl(0, aoffset6); - c7 = vec_xl(0, aoffset7); - c8 = vec_xl(0, aoffset8); + c1[0] = vec_xl(0, aoffset1); + c2[0] = vec_xl(0, aoffset2); + c3[0] = vec_xl(0, aoffset3); + c4[0] = vec_xl(0, aoffset4); + c5[0] = vec_xl(0, aoffset5); + c6[0] = vec_xl(0, aoffset6); + c7[0] = vec_xl(0, aoffset7); + c8[0] = vec_xl(0, aoffset8); - t1 = vec_mergeh(c1, c2); - t2 = vec_mergeh(c3, c4); - t3 = vec_mergeh(c5, c6); - t4 = vec_mergeh(c7, c8); + t1 = vec_mergeh(c1[0], c2[0]); + t2 = vec_mergeh(c3[0], c4[0]); + t3 = vec_mergeh(c5[0], c6[0]); + t4 = vec_mergeh(c7[0], c8[0]); t5 = vec_xxpermdi(t1, t2, 0); t6 = vec_xxpermdi(t3, t4, 0); t7 = vec_xxpermdi(t1, t2, 3); @@ -1198,10 +1896,10 @@ class tinyBLAS_PPC { vec_xst(t7, 0, boffset+8); vec_xst(t8, 0, boffset+12); - t1 = vec_mergel(c1, c2); - t2 = vec_mergel(c3, c4); - t3 = vec_mergel(c5, c6); - t4 = vec_mergel(c7, c8); + t1 = vec_mergel(c1[0], c2[0]); + t2 = vec_mergel(c3[0], c4[0]); + t3 = vec_mergel(c5[0], c6[0]); + t4 = vec_mergel(c7[0], c8[0]); t5 = vec_xxpermdi(t1, t2, 0); t6 = vec_xxpermdi(t3, t4, 0); t7 = vec_xxpermdi(t1, t2, 3); @@ -1223,9 +1921,6 @@ class tinyBLAS_PPC { aoffset += 4 * lda; i = (cols >> 3); if (i > 0) { - __vector_pair C1, C2, C3, C4; - vector float c1[2], c2[2], c3[2], c4[2]; - vector float t1, t2, t3, t4, t5, t6, t7, t8; do { C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1); C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2); @@ -1272,22 +1967,20 @@ class tinyBLAS_PPC { } if (cols & 4) { - vector float c1, c2, c3, c4; - vector float t1, t2, t3, t4; - c1 = vec_xl(0, aoffset1); - c2 = vec_xl(0, aoffset2); - c3 = vec_xl(0, aoffset3); - c4 = vec_xl(0, aoffset4); + c1[0] = vec_xl(0, aoffset1); + c2[0] = vec_xl(0, aoffset2); + c3[0] = vec_xl(0, aoffset3); + c4[0] = vec_xl(0, aoffset4); - t1 = vec_mergeh(c1, c2); - t2 = vec_mergeh(c3, c4); + t1 = vec_mergeh(c1[0], c2[0]); + t2 = vec_mergeh(c3[0], c4[0]); t3 = vec_xxpermdi(t1, t2, 0); t4 = vec_xxpermdi(t1, t2, 3); vec_xst(t3, 0, boffset); vec_xst(t4, 0, boffset+4); - t1 = vec_mergel(c1, c2); - t2 = vec_mergel(c3, c4); + t1 = vec_mergel(c1[0], c2[0]); + t2 = vec_mergel(c3[0], c4[0]); t3 = vec_xxpermdi(t1, t2, 0); t4 = vec_xxpermdi(t1, t2, 3); vec_xst(t3, 0, boffset+8); @@ -1299,21 +1992,19 @@ class tinyBLAS_PPC { aoffset2 = aoffset1 + lda; aoffset3 = aoffset2 + lda; if (cols & 4) { - vector float c1, c2, c3, c4 = {0}; - vector float t1, t2, t3, t4; - c1 = vec_xl(0, aoffset1); - c2 = vec_xl(0, aoffset2); - c3 = vec_xl(0, aoffset3); + c1[0] = vec_xl(0, aoffset1); + c2[0] = vec_xl(0, aoffset2); + c3[0] = vec_xl(0, aoffset3); - t1 = vec_mergeh(c1, c2); - t2 = vec_mergeh(c3, c4); + t1 = vec_mergeh(c1[0], c2[0]); + t2 = vec_mergeh(c3[0], c4[0]); t3 = vec_xxpermdi(t1, t2, 0); t4 = vec_xxpermdi(t1, t2, 3); vec_xst(t3, 0, boffset); vec_xst(t4, 0, boffset+4); - t1 = vec_mergel(c1, c2); - t2 = vec_mergel(c3, c4); + t1 = vec_mergel(c1[0], c2[0]); + t2 = vec_mergel(c3[0], c4[0]); t3 = vec_xxpermdi(t1, t2, 0); t4 = vec_xxpermdi(t1, t2, 3); vec_xst(t3, 0, boffset+8); @@ -1321,14 +2012,13 @@ class tinyBLAS_PPC { } } } - void KERNEL_4x4(int64_t ii, int64_t jj) { vec_t vec_A[4], vec_B[4], vec_C[4]; acc_t acc_0; __builtin_mma_xxsetaccz(&acc_0); for (int l = 0; l < k; l+=4) { - READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); + packTranspose(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B); __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[2], vec_B[2]); @@ -1343,8 +2033,8 @@ class tinyBLAS_PPC { __builtin_mma_xxsetaccz(&acc_0); __builtin_mma_xxsetaccz(&acc_1); for (int64_t l = 0; l < k; l+=4) { - READ_BLOCK(A+(ii*lda)+l, lda, 4, 4, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 4, (float*)vec_B); + packTranspose(A+(ii*lda)+l, lda, 4, 4, (TA*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, 8, 4, (TA*)vec_B); __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], (vec_t)vec_B[0]); __builtin_mma_xvf32gerpp(&acc_1, vec_A[0], (vec_t)vec_B[1]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], (vec_t)vec_B[2]); @@ -1364,8 +2054,8 @@ class tinyBLAS_PPC { __builtin_mma_xxsetaccz(&acc_0); __builtin_mma_xxsetaccz(&acc_1); for (int64_t l = 0; l < k; l+=4) { - READ_BLOCK(A+(ii*lda)+l, lda, 8, 4, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); + packTranspose(A+(ii*lda)+l, lda, 8, 4, (TA*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B); __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[0], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[1], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[2], vec_B[1]); @@ -1387,8 +2077,8 @@ class tinyBLAS_PPC { __builtin_mma_xxsetaccz(&acc_2); __builtin_mma_xxsetaccz(&acc_3); for (int l = 0; l < k; l+=8) { - READ_BLOCK(A+(ii*lda)+l, lda, 8, 8, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, 8, 8, (float*)vec_B); + packTranspose(A+(ii*lda)+l, lda, 8, 8, (TA*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, 8, 8, (TA*)vec_B); for(int x = 0; x < 16; x+=2) { __builtin_mma_xvf32gerpp(&acc_0, (vec_t)vec_A[x], vec_B[x]); __builtin_mma_xvf32gerpp(&acc_1, (vec_t)vec_A[x], vec_B[x+1]); @@ -1571,15 +2261,15 @@ class tinyBLAS_PPC { vec_t vec_A[4], vec_B[4]; for (int l=0; l= 4 && RM == 1) { - float* a = const_cast(A+(ii)*lda+l); - READ_BLOCK(B+(jj*ldb)+l, ldb, 4, 4, (float*)vec_B); + TA* a = const_cast(A+(ii)*lda+l); + packTranspose(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B); vec_A[0] = (vec_t)vec_xl(0,a); - vec_A[1] = (vec_t)vec_splats(*((float*)&vec_A+1)); - vec_A[2] = (vec_t)vec_splats(*((float*)&vec_A+2)); - vec_A[3] = (vec_t)vec_splats(*((float*)&vec_A+3)); + vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1)); + vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2)); + vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3)); } else { - READ_BLOCK(A+(ii*lda)+l, lda, RM, 4, (float*)vec_A); - READ_BLOCK(B+(jj*ldb)+l, ldb, RN, 4, (float*)vec_B); + packTranspose(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A); + packTranspose(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B); } __builtin_mma_xvf32gerpp(&acc_0, vec_A[0], vec_B[0]); __builtin_mma_xvf32gerpp(&acc_0, vec_A[1], vec_B[1]); @@ -1589,7 +2279,7 @@ class tinyBLAS_PPC { __builtin_mma_disassemble_acc(vec_C, &acc_0); for (int I = 0; I < RM; I++) { for (int J = 0; J < RN; J++) { - *((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&vec_C[I]+J); + *((TC*)(C+ii+((jj+J)*ldc)+I)) = *((TC*)&vec_C[I]+J); } } } @@ -1812,6 +2502,20 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64 params->ith, params->nth}; tb.matmul(m, n); return true; + +#elif defined(__MMA__) + if (n < 8 && n != 4) + return false; + if (m < 8 && m != 4) + return false; + tinyBLAS_Q0_PPC tb{ + k, (const block_q8_0 *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + params->ith, params->nth}; + tb.matmul(m, n); + return true; + #else return false; #endif From a3c1232c3f475f0a77b9cc5225516ac31c567a06 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 8 Jan 2025 12:55:36 +0200 Subject: [PATCH 03/12] arg : option to exclude arguments from specific examples (#11136) * arg : option to exclude arguments from specific examples ggml-ci * readme : remove old args [no ci] --- common/arg.cpp | 17 +++++++++++++---- common/arg.h | 3 +++ examples/server/README.md | 3 --- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index c81b15217..27886b84e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -22,6 +22,11 @@ common_arg & common_arg::set_examples(std::initializer_list return *this; } +common_arg & common_arg::set_excludes(std::initializer_list excludes) { + this->excludes = std::move(excludes); + return *this; +} + common_arg & common_arg::set_env(const char * env) { help = help + "\n(env: " + env + ")"; this->env = env; @@ -37,6 +42,10 @@ bool common_arg::in_example(enum llama_example ex) { return examples.find(ex) != examples.end(); } +bool common_arg::is_exclude(enum llama_example ex) { + return excludes.find(ex) != excludes.end(); +} + bool common_arg::get_value_from_env(std::string & output) { if (env == nullptr) return false; char * value = std::getenv(env); @@ -420,7 +429,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example */ auto add_opt = [&](common_arg arg) { - if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) { ctx_arg.options.push_back(std::move(arg)); } }; @@ -649,7 +658,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.prompt = value; } - )); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), @@ -673,7 +682,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt.pop_back(); } } - )); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", @@ -700,7 +709,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.prompt = ss.str(); fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); } - )); + ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), diff --git a/common/arg.h b/common/arg.h index a6700d323..49ab8667b 100644 --- a/common/arg.h +++ b/common/arg.h @@ -12,6 +12,7 @@ struct common_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; + std::set excludes = {}; std::vector args; const char * value_hint = nullptr; // help text or example for arg value const char * value_hint_2 = nullptr; // for second arg value @@ -53,9 +54,11 @@ struct common_arg { ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} common_arg & set_examples(std::initializer_list examples); + common_arg & set_excludes(std::initializer_list excludes); common_arg & set_env(const char * env); common_arg & set_sparam(); bool in_example(enum llama_example ex); + bool is_exclude(enum llama_example ex); bool get_value_from_env(std::string & output); bool has_value_from_env(); std::string to_string(); diff --git a/examples/server/README.md b/examples/server/README.md index 3ce16945a..1f0a27d96 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -45,10 +45,7 @@ The project is under active development, and we are [looking for feedback and co | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | -| `-p, --prompt PROMPT` | prompt to start generation with | | `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | -| `-f, --file FNAME` | a file containing the prompt (default: none) | -| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--no-escape` | do not process escape sequences | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | From 80ccf5d725571035b454659e3c1b4b2b07b65e71 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 8 Jan 2025 12:07:20 +0100 Subject: [PATCH 04/12] ci : pin dependency to specific version (#11137) * ci : pin dependency to specific version * will this fix ec? --- .github/workflows/docker.yml | 2 +- .github/workflows/editorconfig.yml | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 41f1a89ee..f5af72d0b 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -100,7 +100,7 @@ jobs: # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example - name: Free Disk Space (Ubuntu) if: ${{ matrix.config.free_disk_space == true }} - uses: jlumbroso/free-disk-space@main + uses: jlumbroso/free-disk-space@v1.3.1 with: # this might remove tools that are actually needed, # if set to "true" but frees about 6 GB diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index ae86e9927..f02b7c219 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -23,5 +23,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: editorconfig-checker/action-editorconfig-checker@main + - uses: editorconfig-checker/action-editorconfig-checker@v2 + with: + version: v3.0.3 - run: editorconfig-checker From c792dcf4880461c2b5f3960584db241ac71a893a Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Sun, 5 Jan 2025 09:50:37 +0200 Subject: [PATCH 05/12] ggml : allow loading backend with env variable (ggml/1059) ref: #1058 --- ggml/src/ggml-backend-reg.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 7ddd178b5..955ed505f 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -574,4 +574,9 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); ggml_backend_load_best("cpu", silent, dir_path); + // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend + const char * backend_path = std::getenv("GGML_BACKEND_PATH"); + if (backend_path) { + ggml_backend_load(backend_path); + } } From 99a3755a3c518119d0156766122f7b4b796ea576 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 8 Jan 2025 13:40:30 +0200 Subject: [PATCH 06/12] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index b67445ecd..a0921f1a9 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -a2af72be7baf5b1f4a33d34e77e509e5e85b7cd7 +c8bd0fee71dc8328d93be301bbee06bc10d30429 From c07d437bbd417f42b122e767ad42b3298767dca0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 8 Jan 2025 16:19:36 +0200 Subject: [PATCH 07/12] llama : avoid hardcoded QK_K (#11061) ggml-ci --- src/llama-quant.cpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 038cf58dd..466e7bc61 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -7,14 +7,12 @@ #include #include #include +#include #include #include #include #include -// TODO: replace with ggml API call -#define QK_K 256 - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { new_type = qs.params->output_tensor_type; } else { - int nx = tensor->ne[0]; - if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + const int64_t nx = tensor->ne[0]; + const int64_t qk_k = ggml_blck_size(new_type); + + if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) { new_type = GGML_TYPE_Q8_0; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || @@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K; //} bool convert_incompatible_tensor = false; - if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || - new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS || - new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || - new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || - new_type == GGML_TYPE_IQ1_M) { - int nx = tensor->ne[0]; - int ny = tensor->ne[1]; - if (nx % QK_K != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type)); + { + const int64_t nx = tensor->ne[0]; + const int64_t ny = tensor->ne[1]; + const int64_t qk_k = ggml_blck_size(new_type); + + if (nx % qk_k != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); convert_incompatible_tensor = true; } else { ++qs.n_k_quantized; } } + if (convert_incompatible_tensor) { switch (new_type) { case GGML_TYPE_TQ1_0: From 4d2b3d88041705b20c30b3219838aa435e7ffbde Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 8 Jan 2025 15:59:53 +0100 Subject: [PATCH 08/12] lora : improve compat with `mergekit-extract-lora` (#11131) * (wip) support mergekit-extracted lora * support mergekit-extract-lora * use lora->get_scale * correct comment * correct norm name & condition * add some hints --- convert_lora_to_gguf.py | 34 +++++++++++++++++++++++++++++++--- src/llama-adapter.cpp | 24 ++++++++++++++++++------ src/llama-adapter.h | 7 +++++++ src/llama.cpp | 21 ++++++++++++++++++--- 4 files changed, 74 insertions(+), 12 deletions(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index ed1014cae..6dea14a23 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -226,6 +226,9 @@ def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight") + # models produced by mergekit-extract-lora have token embeddings in the adapter + base_name = base_name.replace(".lora_embedding_A", ".weight") + base_name = base_name.replace(".lora_embedding_B", ".weight") return base_name @@ -260,6 +263,10 @@ def parse_args() -> argparse.Namespace: "--base", type=Path, help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", ) + parser.add_argument( + "--base-model-id", type=str, + help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')", + ) parser.add_argument( "lora_path", type=Path, help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", @@ -290,6 +297,7 @@ if __name__ == '__main__': dir_base_model: Path | None = args.base dir_lora: Path = args.lora_path + base_model_id: str | None = args.base_model_id lora_config = dir_lora / "adapter_config.json" input_model = dir_lora / "adapter_model.safetensors" @@ -313,7 +321,10 @@ if __name__ == '__main__': lparams: dict[str, Any] = json.load(f) # load base model - if dir_base_model is None: + if base_model_id is not None: + logger.info(f"Loading base model from Hugging Face: {base_model_id}") + hparams = load_hparams_from_hf(base_model_id) + elif dir_base_model is None: if "base_model_name_or_path" in lparams: model_id = lparams["base_model_name_or_path"] logger.info(f"Loading base model from Hugging Face: {model_id}") @@ -371,11 +382,16 @@ if __name__ == '__main__': if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) - is_lora_a = ".lora_A.weight" in name - is_lora_b = ".lora_B.weight" in name + # note: mergekit-extract-lora also adds token embeddings to the adapter + is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name + is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name if not is_lora_a and not is_lora_b: if ".base_layer.weight" in name: continue + # mergekit-extract-lora add these layernorm to the adapter, we need to keep them + if "_layernorm" in name or ".norm" in name: + yield (base_name, tensor) + continue logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor") if ".embed_tokens.weight" in name or ".lm_head.weight" in name: logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning") @@ -407,9 +423,21 @@ if __name__ == '__main__': if name == "lm_head.weight" and len(dest) == 0: raise ValueError("lm_head is present in adapter, but is ignored in base model") for dest_name, dest_data in dest: + # mergekit-extract-lora add these layernorm to the adapter + if "_norm" in dest_name: + assert dest_data.dim() == 1 + yield (dest_name, dest_data) + continue + + # otherwise, we must get the lora_A and lora_B tensors assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() + # note: mergekit-extract-lora flip and transpose A and B + # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd() + if "token_embd.weight" in dest_name: + lora_a = lora_a.T + yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 9fd7edea3..d4879b778 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -242,6 +242,10 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char } else { ab_map[name].b = cur; } + } else if (str_endswith(name, "_norm.weight")) { + // TODO: add support for norm vector + // for now, we don't really care because most adapters still work fine without it + continue; } else { throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix"); } @@ -251,6 +255,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char for (auto & it : ab_map) { const std::string & name = it.first; llama_lora_weight & w = it.second; + bool is_token_embd = str_endswith(name, "token_embd.weight"); if (!w.a || !w.b) { throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component"); @@ -259,16 +264,23 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char // device buft and device ctx auto * model_tensor = llama_model_get_tensor(model, name.c_str()); if (!model_tensor) { - throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model"); + throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)"); } struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer)); // validate tensor shape - if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { - throw std::runtime_error("tensor '" + name + "' has incorrect shape"); - } - if (w.a->ne[1] != w.b->ne[0]) { - throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); + if (is_token_embd) { + // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd() + if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) { + throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)"); + } + } else { + if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) { + throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)"); + } + if (w.a->ne[1] != w.b->ne[0]) { + throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)"); + } } // save tensor to adapter diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 5f1870cc8..3448656b1 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -45,6 +45,13 @@ struct llama_lora_weight { struct ggml_tensor * a = nullptr; struct ggml_tensor * b = nullptr; + // get actual scale based on rank and alpha + float get_scale(float alpha, float adapter_scale) { + const float rank = (float) b->ne[0]; + const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale; + return scale; + } + llama_lora_weight() = default; llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {} }; diff --git a/src/llama.cpp b/src/llama.cpp index 8ea6686c9..97e716cd6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2545,6 +2545,21 @@ static struct ggml_tensor * llm_build_inp_embd( ggml_set_input(lctx.inp_tokens); inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); + + // apply lora for embedding tokens if needed + for (auto & it : lctx.lora_adapters) { + struct llama_lora_weight * lora = it.first->get_weight(tok_embd); + if (lora == nullptr) { + continue; + } + const float adapter_scale = it.second; + const float scale = lora->get_scale(it.first->alpha, adapter_scale); + struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat( + ctx, lora->b, // non-transposed lora_b + ggml_get_rows(ctx, lora->a, lctx.inp_tokens) + ), scale); + inpL = ggml_add(ctx, inpL, inpL_delta); + } } else { lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens); inpL = lctx.inp_embd; @@ -2617,9 +2632,8 @@ static struct ggml_tensor * llm_build_lora_mm( if (lora == nullptr) { continue; } - const float alpha = it.first->alpha; - const float rank = (float) lora->b->ne[0]; - const float scale = alpha ? it.second * alpha / rank : it.second; + const float adapter_scale = it.second; + const float scale = lora->get_scale(it.first->alpha, adapter_scale); struct ggml_tensor * ab_cur = ggml_mul_mat( ctx0, lora->b, ggml_mul_mat(ctx0, lora->a, cur) @@ -3967,6 +3981,7 @@ struct llm_build_context { // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { + cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il); From f7cd13301c2a88f97073fd119072b4cc92c08df1 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 8 Jan 2025 16:09:20 +0100 Subject: [PATCH 09/12] ci : use actions from ggml-org (#11140) --- .github/workflows/build.yml | 2 +- .github/workflows/docker.yml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 02a193b86..c85999b89 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1237,7 +1237,7 @@ jobs: - name: Create release id: create_release - uses: anzz1/action-create-release@v1 + uses: ggml-org/action-create-release@v1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f5af72d0b..d71f1eb38 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -97,10 +97,9 @@ jobs: GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' - # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example - name: Free Disk Space (Ubuntu) if: ${{ matrix.config.free_disk_space == true }} - uses: jlumbroso/free-disk-space@v1.3.1 + uses: ggml-org/free-disk-space@v1.3.1 with: # this might remove tools that are actually needed, # if set to "true" but frees about 6 GB From 1bf839b1e8b9d043306c65eddd9021fe4337733e Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Wed, 8 Jan 2025 18:47:05 +0000 Subject: [PATCH 10/12] Enhance user input handling for llama-run (#11138) The main motivation for this change is it was not handing ctrl-c/ctrl-d correctly. Modify `read_user_input` to handle EOF, "/bye" command, and empty input cases. Introduce `get_user_input` function to manage user input loop and handle different return cases. Signed-off-by: Eric Curtin --- examples/run/run.cpp | 63 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 2888fcfed..61420e441 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -11,6 +11,8 @@ # include #endif +#include + #include #include #include @@ -25,6 +27,13 @@ #include "json.hpp" #include "llama-cpp.h" +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32) +[[noreturn]] static void sigint_handler(int) { + printf("\n"); + exit(0); // not ideal, but it's the only way to guarantee exit in all cases +} +#endif + GGML_ATTRIBUTE_FORMAT(1, 2) static std::string fmt(const char * fmt, ...) { va_list ap; @@ -801,7 +810,20 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str static int read_user_input(std::string & user) { std::getline(std::cin, user); - return user.empty(); // Should have data in happy path + if (std::cin.eof()) { + printf("\n"); + return 1; + } + + if (user == "/bye") { + return 1; + } + + if (user.empty()) { + return 2; + } + + return 0; // Should have data in happy path } // Function to generate a response based on the prompt @@ -868,7 +890,25 @@ static bool is_stdout_a_terminal() { #endif } -// Function to tokenize the prompt +// Function to handle user input +static int get_user_input(std::string & user_input, const std::string & user) { + while (true) { + const int ret = handle_user_input(user_input, user); + if (ret == 1) { + return 1; + } + + if (ret == 2) { + continue; + } + + break; + } + + return 0; +} + +// Main chat loop function static int chat_loop(LlamaData & llama_data, const std::string & user) { int prev_len = 0; llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); @@ -876,7 +916,8 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) { while (true) { // Get user input std::string user_input; - while (handle_user_input(user_input, user)) { + if (get_user_input(user_input, user) == 1) { + return 0; } add_message("user", user.empty() ? user_input : user, llama_data); @@ -917,7 +958,23 @@ static std::string read_pipe_data() { return result.str(); } +static void ctrl_c_handling() { +#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset(&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined(_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif +} + int main(int argc, const char ** argv) { + ctrl_c_handling(); Opt opt; const int ret = opt.init(argc, argv); if (ret == 2) { From 8a1d9c25fafbaf4182dd0b785dd6303ee40d55bc Mon Sep 17 00:00:00 2001 From: Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:54:58 -0600 Subject: [PATCH 11/12] gguf-py : move scripts directory (#11116) * Moved scripts dir and fixed pyproject.toml * updated readme * fixed README urls * bump pypi gguf to v0.14.0 * retrigger ci * empty commit - trigger ci --- gguf-py/README.md | 8 ++++---- gguf-py/{ => gguf}/scripts/__init__.py | 0 gguf-py/{ => gguf}/scripts/gguf_convert_endian.py | 0 gguf-py/{ => gguf}/scripts/gguf_dump.py | 0 gguf-py/{ => gguf}/scripts/gguf_hash.py | 0 gguf-py/{ => gguf}/scripts/gguf_new_metadata.py | 0 gguf-py/{ => gguf}/scripts/gguf_set_metadata.py | 0 gguf-py/pyproject.toml | 11 +++++------ 8 files changed, 9 insertions(+), 10 deletions(-) rename gguf-py/{ => gguf}/scripts/__init__.py (100%) rename gguf-py/{ => gguf}/scripts/gguf_convert_endian.py (100%) rename gguf-py/{ => gguf}/scripts/gguf_dump.py (100%) rename gguf-py/{ => gguf}/scripts/gguf_hash.py (100%) rename gguf-py/{ => gguf}/scripts/gguf_new_metadata.py (100%) rename gguf-py/{ => gguf}/scripts/gguf_set_metadata.py (100%) diff --git a/gguf-py/README.md b/gguf-py/README.md index 24af96a17..37a75923b 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -15,13 +15,13 @@ pip install gguf [examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model. -[scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console. +[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console. -[scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key. +[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key. -[scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files. +[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files. -[scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values. +[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values. ## Development Maintainers who participate in development of this package are advised to install it in editable mode: diff --git a/gguf-py/scripts/__init__.py b/gguf-py/gguf/scripts/__init__.py similarity index 100% rename from gguf-py/scripts/__init__.py rename to gguf-py/gguf/scripts/__init__.py diff --git a/gguf-py/scripts/gguf_convert_endian.py b/gguf-py/gguf/scripts/gguf_convert_endian.py similarity index 100% rename from gguf-py/scripts/gguf_convert_endian.py rename to gguf-py/gguf/scripts/gguf_convert_endian.py diff --git a/gguf-py/scripts/gguf_dump.py b/gguf-py/gguf/scripts/gguf_dump.py similarity index 100% rename from gguf-py/scripts/gguf_dump.py rename to gguf-py/gguf/scripts/gguf_dump.py diff --git a/gguf-py/scripts/gguf_hash.py b/gguf-py/gguf/scripts/gguf_hash.py similarity index 100% rename from gguf-py/scripts/gguf_hash.py rename to gguf-py/gguf/scripts/gguf_hash.py diff --git a/gguf-py/scripts/gguf_new_metadata.py b/gguf-py/gguf/scripts/gguf_new_metadata.py similarity index 100% rename from gguf-py/scripts/gguf_new_metadata.py rename to gguf-py/gguf/scripts/gguf_new_metadata.py diff --git a/gguf-py/scripts/gguf_set_metadata.py b/gguf-py/gguf/scripts/gguf_set_metadata.py similarity index 100% rename from gguf-py/scripts/gguf_set_metadata.py rename to gguf-py/gguf/scripts/gguf_set_metadata.py diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 9c3956256..92d7f22ec 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,12 +1,11 @@ [tool.poetry] name = "gguf" -version = "0.13.0" +version = "0.14.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ {include = "gguf"}, {include = "gguf/py.typed"}, - {include = "scripts"}, ] readme = "README.md" homepage = "https://ggml.ai" @@ -33,7 +32,7 @@ requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.poetry.scripts] -gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint" -gguf-dump = "scripts:gguf_dump_entrypoint" -gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint" -gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint" +gguf-convert-endian = "gguf.scripts:gguf_convert_endian_entrypoint" +gguf-dump = "gguf.scripts:gguf_dump_entrypoint" +gguf-set-metadata = "gguf.scripts:gguf_set_metadata_entrypoint" +gguf-new-metadata = "gguf.scripts:gguf_new_metadata_entrypoint" From 8d59d911711b8f1ba9ec57c4b192ccd2628af033 Mon Sep 17 00:00:00 2001 From: hydai Date: Thu, 9 Jan 2025 04:03:28 +0800 Subject: [PATCH 12/12] fix: add missing msg in static_assert (#11143) Signed-off-by: hydai --- ggml/src/ggml-cuda/concat.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu index 2f42b8a95..aafbaf803 100644 --- a/ggml/src/ggml-cuda/concat.cu +++ b/ggml/src/ggml-cuda/concat.cu @@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) uint64_t nb1, uint64_t nb2, uint64_t nb3){ - static_assert(dim >= 0 && dim <= 3); + static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]"); const int64_t i3 = blockIdx.z; const int64_t i2 = blockIdx.y;