diff --git a/README.md b/README.md
index e8219ab60..9fdc93243 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,14 @@ What does it mean? You get llama.cpp with a fancy UI, persistent stories, editin
 ## Usage
 - **[Download the latest .exe release here](https://github.com/LostRuins/koboldcpp/releases/latest)** or clone the git repo.
 - Windows binaries are provided in the form of **koboldcpp.exe**, which is a pyinstaller wrapper for a few **.dll** files and **koboldcpp.py**. If you feel concerned, you may prefer to rebuild it yourself with the provided makefiles and scripts.
-- Weights are not included, you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places).
+- Weights are not included, you can use the official llama.cpp `quantize.exe` to generate them from your official weight files (or download them from other places such as [TheBloke's Huggingface](https://huggingface.co/TheBloke)).
 - To run, execute **koboldcpp.exe** or drag and drop your quantized `ggml_model.bin` file onto the .exe, and then connect with Kobold or Kobold Lite. If you're not on windows, then run the script **KoboldCpp.py** after compiling the libraries.
+- Launching with no command line arguments displays a GUI containing a subset of configurable settings. Generally you don't have to change much besides the `Presets` and `GPU Layers`. Read the `--help` for more info about each setting.
 - By default, you can connect to http://localhost:5001
 - You can also run it using the command line `koboldcpp.exe [ggml_model.bin] [port]`. For info, please check `koboldcpp.exe --help`
-- Big context still too slow? Try the `--smartcontext` flag to reduce prompt processing frequency. Also, you can try to run with your GPU using CLBlast, with `--useclblast` flag for a speedup
-- Want even more speedup? Combine `--useclblast` with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine number of layers to offload.
+- Default context size too small? Try `--contextsize 3072` to get 1.5x your context size without much perplexity gain. Note that you'll have to increase the max context in the Kobold Lite UI as well (click and edit the number text field).
+- Big context too slow? Try the `--smartcontext` flag to reduce prompt processing frequency. Also, you can try running with your GPU using CLBlast, with the `--useclblast` flag, for a speedup.
+- Want even more speedup? Combine `--useclblast` with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine the number of layers to offload, and reduce by a few if you run out of memory.
 - If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in a non-avx2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`.
 
 For more information, be sure to run the program with the `--help` flag.
@@ -48,12 +50,14 @@ For more information, be sure to run the program with the `--help` flag.
 - See https://github.com/ggerganov/llama.cpp/pull/1828/files
 
 ## CuBLAS?
+- If you're on Windows with an Nvidia GPU, you can get CUDA support out of the box using the `--usecublas` flag; make sure you select the correct .exe with CUDA support.
 - You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC. Note that support for CuBLAS is limited.
 
 ## Considerations
 - For Windows: No installation, single file executable, (It Just Works)
 - Since v1.0.6, requires libopenblas, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without BLAS.
 - Since v1.15, requires CLBlast if enabled, the prebuilt windows binaries are included in this repo. If not found, it will fall back to a mode without CLBlast.
+- Since v1.33, you can set the context size to be above what the model supports officially. It does increase perplexity but should still work well below 4096, even on untuned models (this applies to GPT-NeoX, GPT-J, and LLAMA models). Customize this with `--ropeconfig`.
 - **I plan to keep backwards compatibility with ALL past llama.cpp AND alpaca.cpp models**. But you are also encouraged to reconvert/update your models if possible for best results.
 
 ## License
diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
old mode 100644
new mode 100755
index f43c836f5..b4999ff5a
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import json
 import os
 import re
diff --git a/convert.py b/convert.py
old mode 100644
new mode 100755
index 7a2705e5c..e3f1096e1
--- a/convert.py
+++ b/convert.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import argparse
 import concurrent.futures
 import copy
diff --git a/examples/baby-llama/CMakeLists.txt b/examples/baby-llama/CMakeLists.txt
index d2ce36367..7b70227a5 100644
--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
index 037696194..3f3415350 100644
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/embd-input/CMakeLists.txt b/examples/embd-input/CMakeLists.txt
index 2b623953e..5bbb1ea02 100644
--- a/examples/embd-input/CMakeLists.txt
+++ b/examples/embd-input/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET embdinput)
 add_library(${TARGET} embd-input-lib.cpp embd-input.h)
+install(TARGETS ${TARGET} LIBRARY)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
@@ -8,6 +9,7 @@ endif()
 
 set(TARGET embd-input-test)
 add_executable(${TARGET} embd-input-test.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt
index db73b6b44..0c752c7bb 100644
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET embedding)
 add_executable(${TARGET} embedding.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index c364242fb..cc1888948 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET main)
 add_executable(${TARGET} main.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/metal/CMakeLists.txt b/examples/metal/CMakeLists.txt
index a8c4284a5..f16d49165 100644
--- a/examples/metal/CMakeLists.txt
+++ b/examples/metal/CMakeLists.txt
@@ -1,3 +1,4 @@
 set(TEST_TARGET metal)
 add_executable(${TEST_TARGET} metal.cpp)
+install(TARGETS ${TEST_TARGET} RUNTIME)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt
index 61b17b828..af00b4e16 100644
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET perplexity)
 add_executable(${TARGET} perplexity.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt
index 7bebc11a1..c5c394058 100644
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index 475fc8be8..47d0be72e 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
index 08dbe5c2b..eadd13cdf 100644
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
index 07ba76ad3..812a24b09 100644
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -2,6 +2,7 @@ set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp json.hpp httplib.h)
+install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
index 1568f7364..0ac9cb03a 100644
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET simple)
 add_executable(${TARGET} simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
diff --git a/examples/train-text-from-scratch/CMakeLists.txt b/examples/train-text-from-scratch/CMakeLists.txt
index 1a44c4961..4459516d0 100644
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ b/examples/train-text-from-scratch/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET train-text-from-scratch)
 add_executable(${TARGET} train-text-from-scratch.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/klite.embd b/klite.embd
index 62e639187..165f0e878 100644
--- a/klite.embd
+++ b/klite.embd
@@ -1,6 +1,6 @@
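The CMakeLists.txt hunks above all follow the same pattern: each example target gains an `install(TARGETS ... RUNTIME)` rule (or `LIBRARY` for the `embdinput` library) so that CMake's install step copies the built binaries to the install prefix, using the default destinations (`bin/` for executables, `lib/` for libraries). A minimal sketch of that pattern, using a hypothetical `example-tool` target rather than anything from this patch:

```cmake
# Minimal, self-contained sketch of the install() pattern the patch adds.
# "example-tool" is a hypothetical target name, not part of the patch.
cmake_minimum_required(VERSION 3.15)
project(example-tool LANGUAGES CXX)

set(TARGET example-tool)
add_executable(${TARGET} main.cpp)
target_compile_features(${TARGET} PRIVATE cxx_std_11)
# RUNTIME with no DESTINATION falls back to the default location: <prefix>/bin
install(TARGETS ${TARGET} RUNTIME)
```

With such a rule in place, something like `cmake -B build && cmake --build build && cmake --install build --prefix /usr/local` should place the executable under `/usr/local/bin`.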