diff --git a/.ecrc b/.ecrc
new file mode 100644
index 000000000..b682057dd
--- /dev/null
+++ b/.ecrc
@@ -0,0 +1,5 @@
+{
+    "Disable": {
+        "IndentSize": true
+    }
+}
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..df8aaf504
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,16 @@
+# https://EditorConfig.org
+
+# Top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file, utf-8 charset
+[*]
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[Makefile]
+indent_style = tab
diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md
index 0d508802d..8fd955356 100644
--- a/.github/ISSUE_TEMPLATE/custom.md
+++ b/.github/ISSUE_TEMPLATE/custom.md
@@ -22,9 +22,9 @@
 Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
 
 # Current Behavior
 
-Please provide a detailed written description of what `llama.cpp` did, instead. 
+Please provide a detailed written description of what `llama.cpp` did, instead.
 
-# Environment and Context 
+# Environment and Context
 
 Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
@@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin'
 llama_model_load: .......................................................................................... done
 llama_model_load: model size = 4869.09 MB / num tensors = 723
 
-system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 
+system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
 
 main: prompt: 'Please close your issue when it has been answered.'
 main: number of tokens in prompt = 11
@@ -166,14 +166,14 @@ main: total time = 246406.42 ms
 
 Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
 
-      3636882.89 msec task-clock                #   14.677 CPUs utilized 
-           13509      context-switches          #    3.714 /sec 
-            2436      cpu-migrations            #    0.670 /sec 
-        10476679      page-faults               #    2.881 K/sec 
+      3636882.89 msec task-clock                #   14.677 CPUs utilized
+           13509      context-switches          #    3.714 /sec
+            2436      cpu-migrations            #    0.670 /sec
+        10476679      page-faults               #    2.881 K/sec
   13133115082869      cycles                    #    3.611 GHz                      (16.77%)
      29314462753      stalled-cycles-frontend   #    0.22% frontend cycles idle     (16.76%)
   10294402631459      stalled-cycles-backend    #   78.39% backend cycles idle      (16.74%)
-  23479217109614      instructions              #    1.79  insn per cycle 
+  23479217109614      instructions              #    1.79  insn per cycle
                                                 #    0.44  stalled cycles per insn  (16.76%)
    2353072268027      branches                  #  647.002 M/sec                    (16.77%)
       1998682780      branch-misses             #    0.08% of all branches          (16.76%)
diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml
new file mode 100644
index 000000000..b4e535acf
--- /dev/null
+++ b/.github/workflows/editorconfig.yml
@@ -0,0 +1,17 @@
+name: EditorConfig Checker
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  editorconfig:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: editorconfig-checker/action-editorconfig-checker@main
+      - run: editorconfig-checker
diff --git a/Makefile b/Makefile
index f5fa3b870..a44295312 100644
--- a/Makefile
+++ b/Makefile
@@ -34,6 +34,7 @@ endif
 CFLAGS   = -I.              -Ofast -DNDEBUG -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples -Ofast -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =
+BONUSCFLAGS =
 
 #lets try enabling everything
 CFLAGS  += -pthread -s
@@ -71,7 +72,8 @@ endif
 # feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	# Use all CPU extensions that are available:
-	CFLAGS += -mf16c -mfma -mavx2 -mavx -msse3
+	CFLAGS += -mf16c -mavx -msse3
+	BONUSCFLAGS += -mfma -mavx2
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
@@ -122,17 +124,19 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 endif
 
 OPENBLAS_BUILD =
+CLBLAST_BUILD =
+OPENBLAS_NOAVX2_BUILD =
+
 ifeq ($(OS),Windows_NT)
 	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas.dll $(LDFLAGS)
-else
-	OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. If you want to use openblas, please install it seperately, then link it manually with LLAMA_OPENBLAS=1. This is just a reminder, not an error.'
-endif
-
-CLBLAST_BUILD =
-ifeq ($(OS),Windows_NT)
 	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/OpenCL.lib lib/clblast.lib -shared -o koboldcpp_clblast.dll $(LDFLAGS)
+	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas_noavx2.dll $(LDFLAGS)
 else
-	CLBLAST_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. If you want to use CLBlast, please install it seperately, then link it manually with LLAMA_CLBLAST=1. This is just a reminder, not an error.'
+    ifndef LLAMA_OPENBLAS
+    ifndef LLAMA_CLBLAST
+    OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with CLBlast support. This is just a reminder, not an error.'
+    endif
+    endif
 endif
 
 #
@@ -150,22 +154,28 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )
 
-default: llamalib llamalib_openblas llamalib_clblast
+default: llamalib llamalib_openblas llamalib_openblas_noavx2 llamalib_clblast
 
 #
 # Build library
 #
 
 ggml.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS) -c ggml.c -o ggml.o
+	$(CC)  $(CFLAGS) $(BONUSCFLAGS) -c ggml.c -o ggml.o
 
 ggml_openblas.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas.o
+	$(CC)  $(CFLAGS) $(BONUSCFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas.o
+
+ggml_openblas_noavx2.o: ggml.c ggml.h
+	$(CC)  $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas_noavx2.o
 
 ggml_clblast.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS) -DGGML_USE_OPENBLAS -DGGML_USE_CLBLAST -c ggml.c -o ggml_clblast.o
+	$(CC)  $(CFLAGS) $(BONUSCFLAGS) -DGGML_USE_OPENBLAS -DGGML_USE_CLBLAST -c ggml.c -o ggml_clblast.o
 
 ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
-	$(CC)  $(CFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o
+	$(CC)  $(CFLAGS) $(BONUSCFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o
+
+ggml_v1_noavx2.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
+	$(CC)  $(CFLAGS) -c otherarch/ggml_v1.c -o ggml_v1_noavx2.o
 
 llama.o: llama.cpp llama.h llama_internal.h
@@ -198,6 +208,9 @@ llamalib: ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 llamalib_openblas: ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_BUILD)
 
+llamalib_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
+	$(OPENBLAS_NOAVX2_BUILD)
+
 llamalib_clblast: ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(CLBLAST_BUILD)
diff --git a/examples/Miku.sh b/examples/Miku.sh
index 352478a15..c4cbf80f2 100755
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -19,15 +19,15 @@
 GEN_OPTIONS=(--batch_size 1024 --top_p 0.5)
 
 if [ -n "$N_THREAD" ]; then
-	GEN_OPTIONS+=(--threads "$N_THREAD")
+    GEN_OPTIONS+=(--threads "$N_THREAD")
 fi
 
 ./main "${GEN_OPTIONS[@]}" \
-	--model "$MODEL" \
-	--n_predict "$N_PREDICTS" \
-	--color --interactive \
-	--reverse-prompt "${USER_NAME}:" \
-	--prompt "
+    --model "$MODEL" \
+    --n_predict "$N_PREDICTS" \
+    --color --interactive \
+    --reverse-prompt "${USER_NAME}:" \
+    --prompt "
 This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
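A note on the BONUSCFLAGS split in the Makefile hunks above: `-mavx2` and `-mfma` do more than permit those instructions, they also make the compiler predefine the `__AVX2__` and `__FMA__` macros that ggml's SIMD paths test, so compiling the same ggml.c twice, once with and once without `$(BONUSCFLAGS)`, yields the regular and the `_noavx2` objects from one source tree. A minimal standalone sketch of that mechanism (the `build_variant` helper and the probe file name are illustrative, not from the repo):

/* probe.c: one translation unit, two ABI variants depending on flags.
 * GCC/Clang define __AVX2__ / __FMA__ only when -mavx2 / -mfma are
 * passed, so the preprocessor picks a different path per object file. */
#include <stdio.h>

static const char * build_variant(void) {
#if defined(__AVX2__) && defined(__FMA__)
    return "avx2+fma (built with $(BONUSCFLAGS))";
#elif defined(__AVX__)
    return "avx-only (noavx2 compatibility build)";
#else
    return "scalar";
#endif
}

int main(void) {
    /* cc -mf16c -mavx -msse3 probe.c              -> avx-only
       cc -mf16c -mavx -msse3 -mfma -mavx2 probe.c -> avx2+fma */
    printf("%s\n", build_variant());
    return 0;
}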
diff --git a/examples/common.cpp b/examples/common.cpp
index f909eed24..91d96efae 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -22,9 +22,9 @@ extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
-extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags, 
-                                                                   const wchar_t * lpWideCharStr, int cchWideChar, 
-                                                                   char * lpMultiByteStr, int cbMultiByte, 
+extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
+                                                                   const wchar_t * lpWideCharStr, int cchWideChar,
+                                                                   char * lpMultiByteStr, int cbMultiByte,
                                                                    const char * lpDefaultChar, bool * lpUsedDefaultChar);
 #define CP_UTF8 65001
 #endif
@@ -328,9 +328,9 @@ void win32_console_init(bool enable_color) {
 
 // Convert a wide Unicode string to an UTF8 string
 void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
-    int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL); 
-    std::string strTo(size_needed, 0); 
-    WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL); 
-    str = strTo; 
+    int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
+    std::string strTo(size_needed, 0);
+    WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
+    str = strTo;
 }
 #endif
diff --git a/examples/embedding/README.md b/examples/embedding/README.md
index 21d8be65f..fe8f5dcc6 100644
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -1,3 +1,3 @@
-# embedding 
-
-TODO 
+# embedding
+
+TODO
diff --git a/examples/main/README.md b/examples/main/README.md
index 4701aa558..f09e7ba97 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -1,3 +1,3 @@
-# main 
-
-TODO 
+# main
+
+TODO
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index bf756c16d..ba153cb82 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -168,7 +168,7 @@ int main(int argc, char ** argv) {
     }
 
     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) { 
+    if (params.antiprompt.size() != 0 || params.interactive_start) {
        params.interactive = true;
     }
 
diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md
index a932275c2..eacfb17c6 100644
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -1,3 +1,3 @@
-# perplexity 
-
-TODO 
+# perplexity
+
+TODO
diff --git a/ggml.c b/ggml.c
index 2d3f78fcf..5c52ef53d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -127,9 +127,9 @@ typedef void* thread_ret_t;
 
 #ifdef GGML_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
-#elif GGML_USE_OPENBLAS
-#include <cblas.h>
 #endif
+#include <cblas.h>
+
 
 #undef MIN
 #undef MAX
@@ -228,12 +228,12 @@ static inline float fp32_from_bits(uint32_t w) {
 }
 
 static inline uint32_t fp32_to_bits(float f) {
-	union {
-		float as_value;
-		uint32_t as_bits;
-	} fp32;
-	fp32.as_value = f;
-	return fp32.as_bits;
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32;
+    fp32.as_value = f;
+    return fp32.as_bits;
 }
 
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
@@ -1881,7 +1881,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
         sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
 #endif
 #else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); 
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
         const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
         const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
diff --git a/ggml_blas_adapter.c b/ggml_blas_adapter.c
index f3413875e..03c3c594b 100644
--- a/ggml_blas_adapter.c
+++ b/ggml_blas_adapter.c
@@ -4,6 +4,7 @@
 //windows binaries for clblast obtained from https://github.com/CNugteren/CLBlast (apache license)
 //windows binaries for opencl obtained from https://github.com/KhronosGroup/OpenCL-SDK (apache license)
 
+#if GGML_USE_OPENBLAS
 #include <cblas.h>
 #include <stdio.h>
 #include <string.h>
@@ -104,21 +105,16 @@ static void ggml_cl_sgemm_wrapper(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE
 }
 #endif
 
-
-static void do_blas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
-OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc)
-{
-#if GGML_USE_CLBLAST
-    ggml_cl_sgemm_wrapper(Order, TransA, TransB,
-                          M, N, K,
-                          alpha, A, lda,
-                          B, ldb,
-                          beta, C, ldc);
-#else
-    cblas_sgemm(Order, TransA, TransB,
-                M, N, K,
-                alpha, A, lda,
-                B, ldb,
-                beta, C, ldc);
-#endif
-}
\ No newline at end of file
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+#if GGML_USE_CLBLAST
+#define do_blas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) ({\
+    ggml_cl_sgemm_wrapper(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);\
+})
+#else
+#define do_blas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) ({\
+    cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);\
+})
+#endif
+#endif
\ No newline at end of file
diff --git a/koboldcpp.py b/koboldcpp.py
index 85b14e395..187e8279b 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -36,11 +36,14 @@ class generation_outputs(ctypes.Structure):
 handle = None
 use_blas = False # if true, uses OpenBLAS for acceleration. libopenblas.dll must exist in the same dir.
 use_clblast = False #uses CLBlast instead
+use_noavx2 = False #uses OpenBLAS with no AVX2 instructions
 
 def init_library():
-    global handle, use_blas, use_clblast
+    global handle, use_blas, use_clblast, use_noavx2
     libname = ""
-    if use_blas:
+    if use_noavx2:
+        libname = "koboldcpp_openblas_noavx2.dll"
+    elif use_blas:
         libname = "koboldcpp_openblas.dll"
     elif use_clblast:
         libname = "koboldcpp_clblast.dll"
@@ -309,7 +312,7 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
             sys.exit(0)
 
 def main(args):
-    global use_blas, use_clblast
+    global use_blas, use_clblast, use_noavx2
     if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas.dll")):
         print("Warning: libopenblas.dll or koboldcpp_openblas.dll not found. Non-BLAS library will be used. Ignore this if you have manually linked with OpenBLAS.")
         use_blas = False
@@ -322,6 +325,14 @@ def main(args):
         else:
             print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast.dll will be required.")
             use_clblast = True
+    elif args.noavx2:
+        if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas_noavx2.dll")):
+            print("Warning: libopenblas.dll or koboldcpp_openblas_noavx2.dll not found. This mode cannot be used.")
+        elif os.name == 'nt':
+            print("Attempting to use non-AVX2 compatibility OpenBLAS library.")
+            use_noavx2 = True
+        else:
+            print("Non-AVX2 compatibility OpenBLAS mode is only available on Windows. On other OSes, please rebuild manually without the AVX2 flags.")
     elif not args.noblas:
         print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
         use_blas = True
@@ -409,8 +420,10 @@ if __name__ == '__main__':
     parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads)
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutil to determine thread count based on physical cores.", action='store_true')
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
-    parser.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
-    parser.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
+    compatgroup = parser.add_mutually_exclusive_group()
+    compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
+    compatgroup.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Cannot be combined with --noblas or --useclblast.", action='store_true')
+    compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     args = parser.parse_args()
     main(args)
diff --git a/make_pyinstaller.bat b/make_pyinstaller.bat
index eb2b3bfe1..e065fdd42 100644
--- a/make_pyinstaller.bat
+++ b/make_pyinstaller.bat
@@ -1 +1 @@
-pyinstaller --noconfirm --onefile --clean --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." "./koboldcpp.py" -n "koboldcpp.exe"
\ No newline at end of file
+pyinstaller --noconfirm --onefile --clean --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." "./koboldcpp.py" -n "koboldcpp.exe"
\ No newline at end of file
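A portability note on the `do_blas_sgemm` rewrite in ggml_blas_adapter.c above: the `({ ... })` statement-expression form is a GNU extension (GCC/Clang only), and as a macro the arguments lose type checking. A rough sketch of the same compile-time dispatch written as a `static inline` function instead, with the two backends reduced to stubs and the thirteen-argument signature trimmed so the sketch compiles standalone (the stub names and shortened parameter list are assumptions for illustration, not the adapter's real API):

/* sgemm_dispatch.c: portable equivalent of the macro-based dispatch. */
#include <stdio.h>

typedef int blasint;

/* stand-ins for cblas_sgemm and ggml_cl_sgemm_wrapper */
static void cblas_sgemm_stub(blasint M, blasint N, blasint K) {
    printf("cblas_sgemm %d x %d x %d\n", M, N, K);
}
static void ggml_cl_sgemm_wrapper_stub(blasint M, blasint N, blasint K) {
    printf("ggml_cl_sgemm_wrapper %d x %d x %d\n", M, N, K);
}

/* same #if selection as the macro version, but type-checked and portable */
static inline void do_blas_sgemm(blasint M, blasint N, blasint K) {
#if defined(GGML_USE_CLBLAST)
    ggml_cl_sgemm_wrapper_stub(M, N, K);   /* CLBlast (OpenCL) path */
#else
    cblas_sgemm_stub(M, N, K);             /* OpenBLAS/Accelerate path */
#endif
}

int main(void) {
    /* cc sgemm_dispatch.c && ./a.out          -> cblas_sgemm ...
       cc -DGGML_USE_CLBLAST sgemm_dispatch.c  -> ggml_cl_sgemm_wrapper ... */
    do_blas_sgemm(64, 64, 64);
    return 0;
}

One plausible reason the diff keeps the macro form anyway: the macro body is never parsed for types when no BLAS backend is compiled in, so nothing needs the CBLAS enums or `OPENBLAS_CONST`, which fits the new `#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)` guard around it.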