From 0b832d53ba0ffcc759c8d62ede3772dd62321f8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 3 Jun 2024 16:28:58 +0200 Subject: [PATCH 01/10] make: fix debug options not being applied to NVCC (#7714) --- Makefile | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index c643fe0cc..dca2aafd7 100644 --- a/Makefile +++ b/Makefile @@ -135,12 +135,16 @@ MK_NVCCFLAGS = -std=c++11 ifdef LLAMA_FAST MK_CFLAGS += -Ofast HOST_CXXFLAGS += -Ofast +ifndef LLAMA_DEBUG MK_NVCCFLAGS += -O3 +endif # LLAMA_DEBUG else MK_CFLAGS += -O3 MK_CXXFLAGS += -O3 +ifndef LLAMA_DEBUG MK_NVCCFLAGS += -O3 -endif +endif # LLAMA_DEBUG +endif # LLAMA_FAST ifndef LLAMA_NO_CCACHE CCACHE := $(shell which ccache) @@ -201,9 +205,10 @@ ifdef LLAMA_SCHED_MAX_COPIES endif ifdef LLAMA_DEBUG - MK_CFLAGS += -O0 -g - MK_CXXFLAGS += -O0 -g - MK_LDFLAGS += -g + MK_CFLAGS += -O0 -g + MK_CXXFLAGS += -O0 -g + MK_LDFLAGS += -g + MK_NVCCFLAGS += -O0 -g ifeq ($(UNAME_S),Linux) MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS From a5735e4426b19a3ebd0c653ad8ac01420458ee95 Mon Sep 17 00:00:00 2001 From: "Masaya, Kato" <62578291+msy-kato@users.noreply.github.com> Date: Tue, 4 Jun 2024 00:14:15 +0900 Subject: [PATCH 02/10] ggml : use OpenMP as a thread pool (#7606) * ggml: Added OpenMP for multi-threads processing * ggml : Limit the number of threads used to avoid deadlock * update shared state n_threads in parallel region * clear numa affinity for main thread even with openmp * enable openmp by default * fix msvc build * disable openmp on macos * ci : disable openmp with thread sanitizer * Update ggml.c Co-authored-by: Georgi Gerganov --------- Co-authored-by: slaren Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 10 ++++ CMakeLists.txt | 12 ++++ Makefile | 8 +++ ggml.c | 111 ++++++++++++++++++++++++------------ 4 files changed, 103 insertions(+), 38 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7b616281b..e824136a5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -294,12 +294,22 @@ jobs: - name: Build id: cmake_build + if: ${{ matrix.sanitizer != 'THREAD' }} run: | mkdir build cd build cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} cmake --build . --config ${{ matrix.build_type }} -j $(nproc) + - name: Build (no OpenMP) + id: cmake_build_no_openmp + if: ${{ matrix.sanitizer == 'THREAD' }} + run: | + mkdir build + cd build + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF + cmake --build . 
--config ${{ matrix.build_type }} -j $(nproc) + - name: Test id: cmake_test run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f07f9a53..620305ca7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,6 +126,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_RPC "llama: use RPC" OFF) +option(LLAMA_OPENMP "llama: use OpenMP" ON) option(LLAMA_SYCL "llama: use SYCL" OFF) option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") @@ -296,6 +297,17 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + message(STATUS "OpenMP found") + add_compile_definitions(GGML_USE_OPENMP) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() +endif() + if (LLAMA_BLAS) if (LLAMA_STATIC) set(BLA_STATIC ON) diff --git a/Makefile b/Makefile index dca2aafd7..43b94db01 100644 --- a/Makefile +++ b/Makefile @@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin) LLAMA_METAL := 1 endif + LLAMA_NO_OPENMP := 1 + ifneq ($(UNAME_P),arm) SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null) ifeq ($(SYSCTL_M),1) @@ -405,6 +407,12 @@ ifndef LLAMA_NO_ACCELERATE endif endif # LLAMA_NO_ACCELERATE +ifndef LLAMA_NO_OPENMP + MK_CPPFLAGS += -DGGML_USE_OPENMP + MK_CFLAGS += -fopenmp + MK_CXXFLAGS += -fopenmp +endif # LLAMA_NO_OPENMP + ifdef LLAMA_OPENBLAS MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) diff --git a/ggml.c b/ggml.c index f479dc3e1..000d9db7e 100644 --- a/ggml.c +++ b/ggml.c @@ -5,6 +5,7 @@ #include "ggml-quants.h" #include "ggml.h" + #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) @@ -28,6 +29,10 @@ #include #endif +#ifdef GGML_USE_OPENMP +#include +#endif + #ifdef GGML_USE_METAL #include #endif @@ -1756,7 +1761,7 @@ struct ggml_compute_state_shared { int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - const int n_threads; + int n_threads; // synchronization primitives atomic_int n_active; // num active threads @@ -19670,6 +19675,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa return cplan; } +static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) { + enum ggml_status compute_status = GGML_STATUS_SUCCESS; + +#ifdef GGML_USE_OPENMP + if (n_threads > 1) { + #pragma omp parallel num_threads(n_threads) + { + #pragma omp single + { + // update the number of threads from the actual number of threads that we got from OpenMP + n_threads = omp_get_num_threads(); + workers[0].shared->n_threads = n_threads; + workers[0].shared->n_active = n_threads; + } + ggml_graph_compute_thread(&workers[omp_get_thread_num()]); + } + } else { + ggml_graph_compute_thread(&workers[0]); + } +#else + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + UNUSED(rc); + } + } + + // this is a work thread too + ggml_graph_compute_thread(&workers[0]); + + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < 
n_threads; j++) { + const int rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); + UNUSED(rc); + } + } +#endif + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); + + for (int j = 0; j < n_threads; j++) { + if (workers[j].ec != GGML_STATUS_SUCCESS) { + compute_status = workers[j].ec; + break; + } + } + return compute_status; +} + enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { { GGML_ASSERT(cplan); @@ -19680,7 +19738,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } } - const int n_threads = cplan->n_threads; + int n_threads = cplan->n_threads; + +#if defined(GGML_USE_OPENMP) + n_threads = MIN(n_threads, omp_get_max_threads()); +#endif struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, @@ -19696,47 +19758,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl /*.current_chunk; =*/ 0, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - - // create thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - .ec = GGML_STATUS_SUCCESS, - }; - - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - } - - workers[0].ith = 0; - workers[0].shared = &state_shared; - workers[0].ec = GGML_STATUS_SUCCESS; - const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); - // this is a work thread too - ggml_graph_compute_thread(&workers[0]); - enum ggml_status compute_status = workers[0].ec; - - // don't leave affinity set on the main thread - clear_numa_thread_affinity(); - - // join or kill thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - if (workers[j].ec != GGML_STATUS_SUCCESS) - compute_status = workers[j].ec; - } + for (int j = 0; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + .ec = GGML_STATUS_SUCCESS, + }; } + enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads); + // performance stats (graph) { int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; From bde7cd3cd949c1a85d3a199498ac98e78039d46f Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Mon, 3 Jun 2024 20:03:26 +0300 Subject: [PATCH 03/10] llama : offload to RPC in addition to other backends (#7640) * llama : offload to RPC in addition to other backends * - fix copy_tensor being called on the src buffer instead of the dst buffer - always initialize views in the view_src buffer - add RPC backend to Makefile build - add endpoint to all RPC object names * add rpc-server to Makefile * Update llama.cpp Co-authored-by: slaren --------- Co-authored-by: slaren --- Makefile | 29 ++++++++++++++--- ggml-alloc.c | 6 ++-- ggml-backend.c | 10 +++--- ggml-backend.h | 2 +- ggml-rpc.cpp | 4 +-- llama.cpp | 88 +++++++++++++++++++++++++++++--------------------- 6 files changed, 86 insertions(+), 53 deletions(-) diff --git a/Makefile b/Makefile index 43b94db01..b527f6f35 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,10 @@ ifeq ($(UNAME_S),Darwin) endif endif +ifdef LLAMA_RPC + BUILD_TARGETS += rpc-server +endif + default: $(BUILD_TARGETS) test: 
$(TEST_TARGETS) @@ -429,6 +433,11 @@ ifdef LLAMA_BLIS MK_LDFLAGS += -lblis -L/usr/local/lib endif # LLAMA_BLIS +ifdef LLAMA_RPC + MK_CPPFLAGS += -DGGML_USE_RPC + OBJS += ggml-rpc.o +endif # LLAMA_RPC + ifdef LLAMA_CUBLAS # LLAMA_CUBLAS is deprecated and will be removed in the future LLAMA_CUDA := 1 @@ -654,11 +663,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h endif endif # LLAMA_METAL +OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o +COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h +COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o + ifndef LLAMA_NO_LLAMAFILE sgemm.o: sgemm.cpp sgemm.h ggml.h $(CXX) $(CXXFLAGS) -c $< -o $@ endif +ifdef LLAMA_RPC +ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +endif # LLAMA_RPC + GF_CC := $(CC) include scripts/get-flags.mk @@ -738,14 +762,9 @@ unicode.o: unicode.cpp unicode.h unicode-data.o: unicode-data.cpp unicode-data.h $(CXX) $(CXXFLAGS) -c $< -o $@ -OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o - llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h -COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o - common.o: common/common.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ diff --git a/ggml-alloc.c b/ggml-alloc.c index 0146946eb..73a3c1575 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * // this tensor was allocated without ggml-backend return; } - ggml_backend_view_init(galloc->buffers[buffer_id], tensor); + ggml_backend_view_init(tensor); } } else { if (tensor->data == NULL) { @@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx, if (t->view_src == NULL) { ggml_tallocr_alloc(&tallocr, t); } else if (t->buffer == NULL) { - ggml_backend_view_init(buffer, t); + ggml_backend_view_init(t); } } else { if (t->view_src != NULL && t->buffer == NULL) { // view of a pre-allocated tensor - ggml_backend_view_init(buffer, t); + ggml_backend_view_init(t); } } } diff --git a/ggml-backend.c b/ggml-backend.c index 9e35ce98d..05737ed69 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_buffer_t dst_buf = dst->view_src ? 
dst->view_src->buffer : dst->buffer; if (dst_buf->iface.cpy_tensor) { - return src->buffer->iface.cpy_tensor(dst_buf, src, dst); + return dst_buf->iface.cpy_tensor(dst_buf, src, dst); } return false; } @@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, // utils -void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +void ggml_backend_view_init(struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); - tensor->buffer = buffer; + tensor->buffer = tensor->view_src->buffer; tensor->data = (char *)tensor->view_src->data + tensor->view_offs; - ggml_backend_buffer_init_tensor(buffer, tensor); + ggml_backend_buffer_init_tensor(tensor->buffer, tensor); } void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { @@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src); - ggml_backend_view_init(dst->view_src->buffer, dst); + ggml_backend_view_init(dst); } else { ggml_backend_tensor_copy(src, dst); diff --git a/ggml-backend.h b/ggml-backend.h index 744b6a774..c582b0685 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -225,7 +225,7 @@ extern "C" { // Tensor initialization GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); - GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor); #ifdef __cplusplus diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index 49a20df4b..679ce4f28 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer if (remote_ptr != 0) { ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_rpc_buffer_interface, - new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"}, + new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"}, remote_size); return buffer; } else { @@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context { /* .endpoint = */ endpoint, - /* .name = */ "RPC", + /* .name = */ "RPC[" + std::string(endpoint) + "]", }; ggml_backend_t backend = new ggml_backend { diff --git a/llama.cpp b/llama.cpp index b19c6ff34..b0bf70e20 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2371,13 +2371,34 @@ struct llama_context { struct llama_control_vector cvec; }; +static size_t llama_get_device_count(const llama_model & model) { + size_t count = 1; +#if defined(GGML_USE_CUDA) + count = ggml_backend_cuda_get_device_count(); +#elif defined(GGML_USE_SYCL) + count = ggml_backend_sycl_get_device_count(); +#elif defined(GGML_USE_VULKAN) + count = ggml_backend_vk_get_device_count(); +#endif +#if defined(GGML_USE_RPC) + count += model.rpc_servers.size(); +#endif + return count; + GGML_UNUSED(model); +} + static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) { ggml_backend_buffer_type_t buft = 
nullptr; -#ifdef GGML_USE_RPC - std::string endpoint = model.rpc_servers[gpu]; - buft = ggml_backend_rpc_buffer_type(endpoint.c_str()); -#elif defined(GGML_USE_METAL) +#if defined(GGML_USE_RPC) + int dev_count = (int)llama_get_device_count(model); + int rpc_count = (int)model.rpc_servers.size(); + if (gpu >= dev_count - rpc_count) { + const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str(); + return ggml_backend_rpc_buffer_type(endpoint); + } +#endif +#if defined(GGML_USE_METAL) buft = ggml_backend_metal_buffer_type(); #elif defined(GGML_USE_CUDA) buft = ggml_backend_cuda_buffer_type(gpu); @@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo GGML_UNUSED(tensor_split); } -static size_t llama_get_device_count(const llama_model & model) { -#if defined(GGML_USE_RPC) - return model.rpc_servers.size(); -#elif defined(GGML_USE_CUDA) - return ggml_backend_cuda_get_device_count(); -#elif defined(GGML_USE_SYCL) - return ggml_backend_sycl_get_device_count(); -#elif defined(GGML_USE_VULKAN) - return ggml_backend_vk_get_device_count(); -#else - return 1; -#endif - GGML_UNUSED(model); -} - static size_t llama_get_device_memory(const llama_model & model, int device) { #if defined(GGML_USE_RPC) - size_t total; - size_t free; - std::string endpoint = model.rpc_servers[device]; - ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total); - return free; -#elif defined(GGML_USE_CUDA) + int dev_count = (int)llama_get_device_count(model); + int rpc_count = (int)model.rpc_servers.size(); + if (device >= dev_count - rpc_count) { + size_t total; + size_t free; + const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str(); + ggml_backend_rpc_get_device_memory(endpoint, &free, &total); + return free; + } +#endif +#if defined(GGML_USE_CUDA) size_t total; size_t free; ggml_backend_cuda_get_device_memory(device, &free, &total); @@ -16160,7 +16171,7 @@ struct llama_model * llama_load_model_from_file( return true; }; } - if (params.rpc_servers != nullptr) { + if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') { // split the servers set them into model->rpc_servers std::string servers(params.rpc_servers); size_t pos = 0; @@ -16323,17 +16334,7 @@ struct llama_context * llama_new_context_with_model( if (!hparams.vocab_only) { // initialize backends -#if defined(GGML_USE_RPC) - for (auto & server : model->rpc_servers) { - ggml_backend_t backend = ggml_backend_rpc_init(server.c_str()); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str()); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } -#elif defined(GGML_USE_METAL) +#if defined(GGML_USE_METAL) if (model->n_gpu_layers > 0) { ctx->backend_metal = ggml_backend_metal_init(); if (ctx->backend_metal == nullptr) { @@ -16425,6 +16426,19 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#endif +#if defined(GGML_USE_RPC) + if (model->n_gpu_layers > 0) { + for (const auto & endpoint : model->rpc_servers) { + ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str()); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str()); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } + } #endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { From 6d1616944d9efd342ed2a4fd318722adfc9febcd Mon Sep 17 00:00:00 2001 
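The device-index convention introduced by the RPC patch above ([PATCH 03/10]) is that local backends come first and the RPC endpoints are appended at the end, so an index in [dev_count - rpc_count, dev_count) selects rpc_servers[gpu - dev_count + rpc_count]. A minimal standalone sketch with hypothetical device counts and endpoint names (not part of the patch) makes the arithmetic concrete:

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        // Hypothetical setup: 2 local CUDA devices plus 3 RPC endpoints, so
        // llama_get_device_count() would report 5 devices in total.
        const std::vector<std::string> rpc_servers = {"host-a:50052", "host-b:50052", "host-c:50052"};
        const int local_count = 2; // e.g. what ggml_backend_cuda_get_device_count() might return
        const int rpc_count   = (int) rpc_servers.size();
        const int dev_count   = local_count + rpc_count;

        for (int gpu = 0; gpu < dev_count; ++gpu) {
            if (gpu >= dev_count - rpc_count) {
                // same index arithmetic as llama_default_buffer_type_offload()
                printf("device %d -> RPC endpoint %s\n", gpu, rpc_servers[gpu - dev_count + rpc_count].c_str());
            } else {
                printf("device %d -> local GPU %d\n", gpu, gpu);
            }
        }
        return 0;
    }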
From: Georgi Gerganov Date: Tue, 4 Jun 2024 10:01:09 +0300 Subject: [PATCH 04/10] ggml : prevent builds with -ffinite-math-only (#7726) This enforces a check that -fno-finite-math-only was set and that the operating compiling mode is not in finite maths mode. This is because during rewriting of silu and softmax for cpu #7154 there emerged an issue where the result that was observed when >1 slot was nondeterministic as found by @JohannesGaessler. @LostRuins narrowed the problem down to -ffinite-math-only which was theorised to be due to SiLU, instead of flushing small values to 0, returns NaN or some other garbage. @jart proposed a fix that @ggerganov then implemented in this fix ref https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2145661825 --- cmake/arm64-windows-llvm.cmake | 2 +- ggml.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake index 46fba6514..802379680 100644 --- a/cmake/arm64-windows-llvm.cmake +++ b/cmake/arm64-windows-llvm.cmake @@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ ) set( CMAKE_C_COMPILER_TARGET ${target} ) set( CMAKE_CXX_COMPILER_TARGET ${target} ) -set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" ) +set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/ggml.c b/ggml.c index 000d9db7e..8869e146a 100644 --- a/ggml.c +++ b/ggml.c @@ -2272,6 +2272,11 @@ inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } +#if __FINITE_MATH_ONLY__ +#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix" +#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461" +#endif + #if defined(__ARM_NEON) && defined(__aarch64__) // adapted from arm limited optimized routine From 3b38d48609280aa5f8ab7ea135a4351b2a5ee240 Mon Sep 17 00:00:00 2001 From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> Date: Tue, 4 Jun 2024 09:17:17 +0200 Subject: [PATCH 05/10] Per token attributes (#7685) * Add per token attributes enum * Using phi-3 for testing 'rstrip' * Using jina-v2 for testing 'lstrip' * Brute force test for 'lstrip' and 'rstrip' * Implement 'rstrip' and 'lstrip' * Update phi-3 GGUF file (obsolete since 917dc8c) * Replace llama_token_type with llama_token_attribs --- llama.cpp | 149 +++++++++++++++++++++++---------- llama.h | 18 +++- models/ggml-vocab-phi-3.gguf | Bin 725846 -> 726019 bytes tests/test-tokenizer-random.py | 50 +++++++---- 4 files changed, 155 insertions(+), 62 deletions(-) diff --git a/llama.cpp b/llama.cpp index b0bf70e20..a3e944874 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2149,12 +2149,12 @@ struct llama_control_vector { struct llama_vocab { using id = int32_t; using token = std::string; - using ttype = llama_token_type; + using tattr = llama_token_attr; struct token_data { token text; float score; - ttype type; + tattr attr; }; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -4750,7 +4750,20 @@ static void llm_load_vocab( auto & token_data = vocab.id_to_token[i]; token_data.text = std::move(word); token_data.score = scores ? scores[i] : 0.0f; - token_data.type = toktypes ? 
(llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL; + token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; + + if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file + switch(toktypes[i]) { + case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break; + case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break; + case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break; + case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break; + case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break; + case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break; + case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; + default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; + } + } } GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size()); @@ -4841,7 +4854,7 @@ static void llm_load_vocab( // build special tokens cache { for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) { - if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) { + if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) { vocab.cache_special_tokens.push_back(id); } } @@ -4871,6 +4884,59 @@ static void llm_load_vocab( LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0); } + + // Handle per token attributes + //NOTE: Each model customizes per token attributes. + //NOTE: Per token attributes are missing from the GGUF file. + //TODO: Extract attributes from GGUF file. + { + auto _contains_any = [] (const std::string &str, const std::vector &substrs) -> bool { + for (auto substr : substrs) { + if (str.find(substr) < std::string::npos) { + return true; + } + } + return false; + }; + + auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) { + uint32_t current = vocab.id_to_token.at(id).attr; + current = value ? 
(current | attr) : (current & ~attr); + vocab.id_to_token[id].attr = (llama_token_attr) current; + }; + + auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) { + _set_tokenid_attr(vocab.token_to_id.at(token), attr, value); + }; + + std::string model_name; + std::string tokenizer_pre; + + ml.get_key(LLM_KV_GENERAL_NAME, model_name, false); + ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); + + // model name to lowercase + std::transform(model_name.begin(), model_name.end(), model_name.begin(), + [] (const std::string::value_type x) { + return std::tolower(x); + } + ); + + // set attributes by model/tokenizer name + if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) { + _set_token_attr("", LLAMA_TOKEN_ATTR_LSTRIP, true); + } else if (_contains_any(model_name, {"phi-3", "phi3"})) { + for (auto id : vocab.cache_special_tokens) { + _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true); + } + for (auto token : {""}) { + _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true); + } + for (auto token : {"", "", "<|endoftext|>"}) { + _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false); + } + } + } } static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { @@ -12620,27 +12686,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) { static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL; } static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN; } static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL; } static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE; } static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED; } static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { @@ -13258,7 +13324,8 @@ struct fragment_buffer_variant { static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) { // for each special token for (const llama_vocab::id special_id : vocab.cache_special_tokens) { - const auto & special_token = vocab.id_to_token[special_id].text; + const auto & data = vocab.id_to_token[special_id]; + const auto & special_token = data.text; // for each text fragment std::forward_list::iterator it = buffer.begin(); @@ -13295,13 +13362,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< if (match > raw_text_base_offset) { // left const int64_t left_reminder_offset = raw_text_base_offset + 0; - const int64_t left_reminder_length = match - raw_text_base_offset; - buffer.emplace_after(it, raw_text, 
left_reminder_offset, left_reminder_length); + int64_t left_reminder_length = match - raw_text_base_offset; + + if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) { + while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) { + left_reminder_length--; + } + } + + if (left_reminder_length > 0) { + buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length); + it++; + } #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); #endif - it++; } // special token @@ -13310,16 +13386,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< // right if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) { - const int64_t right_reminder_offset = match + special_token.length(); - const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length()); - buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length); + int64_t right_reminder_offset = match + special_token.length(); + int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length()); + + if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) { + while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) { + right_reminder_offset++; + right_reminder_length--; + } + } + + if (right_reminder_length > 0) { + buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length); + it++; + } #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); #endif - it++; - if (source == 0) { buffer.erase_after(buffer.before_begin()); } else { @@ -13365,9 +13450,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & // tokenizer.encode('', add_special_tokens=True) returns [1] // tokenizer.encode('', add_special_tokens=False) returns [] - static const bool rtrim = true; //TODO: as param bool is_prev_special = false; - bool special_token_rtrim = false; if (add_special && vocab.special_add_bos != 0) { GGML_ASSERT(vocab.special_bos_id != -1); @@ -13377,25 +13460,8 @@ static std::vector llama_tokenize_internal(const llama_vocab & for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { - // without adding this leading whitespace, we do not get the same results as the original tokenizer - - // TODO: It's likely possible to get rid of this string copy entirely - // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer - // and passing 'add space prefix' as bool argument - // auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); - if (special_token_rtrim) { - size_t num_whitespaces = 0; - while (isspace(raw_text[num_whitespaces])) { - num_whitespaces++; - } - if (num_whitespaces == raw_text.size()) { - continue; // skip if all whitespaces - } - raw_text = raw_text.substr(num_whitespaces); - } - if (vocab.add_space_prefix) { if (!output.size() || is_prev_special) { // prefix with space if first token raw_text = " " + raw_text; @@ -13411,11 +13477,6 @@ static std::vector llama_tokenize_internal(const llama_vocab & } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); is_prev_special = true; - // phi-3 special tokens 
without rtrim, works fine for llama-spm too - special_token_rtrim = rtrim - && fragment.token != vocab.special_bos_id - && fragment.token != vocab.special_unk_id - && fragment.token != vocab.special_eos_id; } } @@ -18221,9 +18282,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token) return model->vocab.id_to_token[token].score; } -llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { +llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) { GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); - return model->vocab.id_to_token[token].type; + return model->vocab.id_to_token[token].attr; } bool llama_token_is_eog(const struct llama_model * model, llama_token token) { diff --git a/llama.h b/llama.h index 95105c28e..a78ccdaf5 100644 --- a/llama.h +++ b/llama.h @@ -97,7 +97,7 @@ extern "C" { LLAMA_ROPE_TYPE_GLM = 4, }; - enum llama_token_type { + enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, LLAMA_TOKEN_TYPE_UNKNOWN = 2, @@ -107,6 +107,20 @@ extern "C" { LLAMA_TOKEN_TYPE_BYTE = 6, }; + enum llama_token_attr { + LLAMA_TOKEN_ATTR_UNDEFINED = 0, + LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 1, + LLAMA_TOKEN_ATTR_UNUSED = 1 << 2, + LLAMA_TOKEN_ATTR_NORMAL = 1 << 3, + LLAMA_TOKEN_ATTR_CONTROL = 1 << 4, // SPECIAL? + LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 5, + LLAMA_TOKEN_ATTR_BYTE = 1 << 6, + LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 7, + LLAMA_TOKEN_ATTR_LSTRIP = 1 << 8, + LLAMA_TOKEN_ATTR_RSTRIP = 1 << 9, + LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 10, + }; + // model file types enum llama_ftype { LLAMA_FTYPE_ALL_F32 = 0, @@ -821,7 +835,7 @@ extern "C" { LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); + LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token); // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) 
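As a usage sketch for the bitflag API introduced above: unlike the old llama_token_type, several llama_token_attr flags can be set on one token and tested independently. The helper below is hypothetical (not part of the patch) and only assumes a loaded model:

    #include "llama.h"

    // Hypothetical helper: true if `token` is a special token around which the
    // tokenizer strips adjacent whitespace (LSTRIP/RSTRIP set in llm_load_vocab).
    static bool token_strips_whitespace(const struct llama_model * model, llama_token token) {
        const enum llama_token_attr attr = llama_token_get_attr(model, token);
        const bool is_special = attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED);
        const bool strips     = attr & (LLAMA_TOKEN_ATTR_LSTRIP  | LLAMA_TOKEN_ATTR_RSTRIP);
        return is_special && strips;
    }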
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf index f8022a385e4aa48ca10d40d0f079a25365a7be78..745be416a798a1e7a2effaa935bc903edaaf303d 100644 GIT binary patch delta 576 zcmcb%Os9E;j-b1Hs2ei_0!U31tWws5@(VIDjrEH13sUuplM{0?^V0S5i!#$Q^AdC7 zHZQ#0~&Qc{Zyafln?5I05>&w&|a zQe(#jx1hMPxFj{V#*Q5>0(2ajPMAhh9O7m*b{uf+rNya5HFn#S*u7Sj-b1Hs2ei_0*Fr(tlD@)fw7sLshypP5r{!FGZ3=?F)I+WZD(ghKMD)h@nC3_C{s) zm(1Hc)!AP&PoJR9F5h0F&Az=vo8#;I=?kSfk5Atq&1o?`Lx$6klL=@gD6E*buan{Y n&OCju3@1 Iterator[str]: 'a', # Phi-3 fail '<|endoftext|>', # Phi-3 fail 'a\na', # TODO: Bert fail + 'a b', # rstrip phi-3 + 'a b', # lstrip jina-v2 ] -def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]: - special_tokens = set(tokenizer.all_special_tokens) - special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "", ""]) - special_tokens = list(sorted(special_tokens)) +def generator_vocab_words(vocab: list[str]) -> Iterator[str]: + """Brute force check all vocab words""" + yield from vocab + + +def generator_added_lr_strip(tokenizer) -> Iterator[str]: + WHITESPACES = ["", " ", " ", " "] + special_tokens = list(tokenizer.all_special_tokens) + added_tokens = list(tokenizer.added_tokens_encoder) + all_tokens = list(sorted(set(special_tokens + added_tokens))) + for token in all_tokens: + for lstrip in WHITESPACES: + for rstrip in WHITESPACES: + yield lstrip + token + rstrip + yield "a" + lstrip + token + rstrip + yield lstrip + token + rstrip + "z" + yield "a" + lstrip + token + rstrip + "z" + + +def generator_random_added_tokens(tokenizer, iterations=100) -> Iterator[str]: + special_tokens = list(tokenizer.all_special_tokens) + added_tokens = list(tokenizer.added_tokens_encoder) + separations = [" ", "\n", "\t", "-", "!", "one", "1", "", ""] + all_tokens = list(sorted(set(special_tokens + added_tokens + separations))) rand = random.Random() for m in range(iterations): rand.seed(m) - words = rand.choices(special_tokens, k=500) + words = rand.choices(all_tokens, k=500) if words[0] == tokenizer.bos_token: # skip spam warning of double BOS while len(words) > 1 and words[1] == tokenizer.bos_token: # leave one starting BOS words.pop(0) @@ -175,11 +197,6 @@ def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]: yield "".join(words) -def generator_vocab_words(vocab: list[str]) -> Iterator[str]: - """Brute force check all vocab words""" - yield from vocab - - def generator_random_chars(iterations=100) -> Iterator[str]: """Brute force random text with simple characters""" @@ -274,8 +291,8 @@ def test_compare_tokenizer(func_tokenize1: Callable, func_tokenize2: Callable, g ids2 = func_tokenize2(text) if ids1 != ids2: i = find_first_mismatch(ids1, ids2) - ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1] - ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1] + ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1] + ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1] logger.info(" TokenIDs: " + str(ids1)) logger.info(" Expected: " + str(ids2)) raise Exception() @@ -309,8 +326,9 @@ def main(argv: list[str] = None): vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True))) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text()) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases()) - test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer, 10_000)) 
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab)) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_added_lr_strip(tokenizer)) + test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_added_tokens(tokenizer, 10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000)) test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000)) @@ -322,14 +340,14 @@ def main(argv: list[str] = None): if __name__ == "__main__": # main() - path_tokenizers = "./models/tokenizers/" + path_tokenizers = "./models/tokenizers/" path_vocab_format = "./models/ggml-vocab-%s.gguf" # import os # tokenizers = os.listdir(path_tokenizers) tokenizers = [ - # "llama-spm", # SPM - # "phi-3", # SPM + "llama-spm", # SPM + "phi-3", # SPM "jina-v2-en", # WPM "bert-bge", # WPM ] From b226c1227bcf6412076ecf787421135fd2c42ef0 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 4 Jun 2024 19:21:26 +0800 Subject: [PATCH 06/10] refine .gitignore (#7688) This adds tags and android ndk into the git ignore list --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 049efd703..5223c6963 100644 --- a/.gitignore +++ b/.gitignore @@ -34,9 +34,11 @@ ggml-metal-embed.metal lcov-report/ gcovr-report/ +tags build* !build.zig cmake-build-* +android-ndk-* out/ tmp/ From 987d743d6bc4cee4bde6820733ea33a2abc0afac Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Tue, 4 Jun 2024 12:09:15 +0000 Subject: [PATCH 07/10] Improve hipBLAS support in CMake (#7696) * Improve hipBLAS support in CMake This improves the detection of the correct CMAKE_PREFIX_PATH when using different distributions or a self-built ROCm SDK. 
* Set ROCM_PATH correctly --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 620305ca7..76ea27412 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -557,12 +557,17 @@ if (LLAMA_VULKAN) endif() if (LLAMA_HIPBLAS) - if ($ENV{ROCM_PATH}) - set(ROCM_PATH $ENV{ROCM_PATH}) + if (NOT EXISTS $ENV{ROCM_PATH}) + if (NOT EXISTS /opt/rocm) + set(ROCM_PATH /usr) + else() + set(ROCM_PATH /opt/rocm) + endif() else() - set(ROCM_PATH /opt/rocm) + set(ROCM_PATH $ENV{ROCM_PATH}) endif() list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") # CMake on Windows doesn't support the HIP language yet if(WIN32) From adc9ff384121f4d550d28638a646b336d051bf42 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 4 Jun 2024 14:32:42 +0200 Subject: [PATCH 08/10] llama-bench : allow using a different printer for stderr with -oe (#7722) compare-commits.sh : hide stdout, use -oe to print markdown --- examples/llama-bench/llama-bench.cpp | 145 +++++++++++++++++---------- scripts/compare-commits.sh | 16 +-- 2 files changed, 101 insertions(+), 60 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index c00890447..5d3cbd842 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -140,10 +140,11 @@ static std::string get_gpu_info() { } // command line params -enum output_formats {CSV, JSON, MARKDOWN, SQL}; +enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL}; static const char * output_format_str(output_formats format) { switch (format) { + case NONE: return "none"; case CSV: return "csv"; case JSON: return "json"; case MARKDOWN: return "md"; @@ -152,6 +153,23 @@ static const char * output_format_str(output_formats format) { } } +static bool output_format_from_str(const std::string & s, output_formats & format) { + if (s == "none") { + format = NONE; + } else if (s == "csv") { + format = CSV; + } else if (s == "json") { + format = JSON; + } else if (s == "md") { + format = MARKDOWN; + } else if (s == "sql") { + format = SQL; + } else { + return false; + } + return true; +} + static const char * split_mode_str(llama_split_mode mode) { switch (mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; @@ -190,31 +208,33 @@ struct cmd_params { int reps; bool verbose; output_formats output_format; + output_formats output_format_stderr; }; static const cmd_params cmd_params_defaults = { - /* model */ {"models/7B/ggml-model-q4_0.gguf"}, - /* n_prompt */ {512}, - /* n_gen */ {128}, - /* n_pg */ {}, - /* n_batch */ {2048}, - /* n_ubatch */ {512}, - /* type_k */ {GGML_TYPE_F16}, - /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {cpu_get_num_math()}, - /* n_gpu_layers */ {99}, - /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, - /* main_gpu */ {0}, - /* no_kv_offload */ {false}, - /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, - /* use_mmap */ {true}, - /* embeddings */ {false}, - /* numa */ GGML_NUMA_STRATEGY_DISABLED, - /* reps */ 5, - /* verbose */ false, - /* output_format */ MARKDOWN + /* model */ {"models/7B/ggml-model-q4_0.gguf"}, + /* n_prompt */ {512}, + /* n_gen */ {128}, + /* n_pg */ {}, + /* n_batch */ {2048}, + /* n_ubatch */ {512}, + /* type_k */ {GGML_TYPE_F16}, + /* type_v */ {GGML_TYPE_F16}, + /* n_threads */ {cpu_get_num_math()}, + /* n_gpu_layers */ {99}, + /* rpc_servers */ {""}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, + /* main_gpu */ {0}, 
+ /* no_kv_offload */ {false}, + /* flash_attn */ {false}, + /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, + /* use_mmap */ {true}, + /* embeddings */ {false}, + /* numa */ GGML_NUMA_STRATEGY_DISABLED, + /* reps */ 5, + /* verbose */ false, + /* output_format */ MARKDOWN, + /* output_format_stderr */ NONE, }; static void print_usage(int /* argc */, char ** argv) { @@ -243,6 +263,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); @@ -284,6 +305,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.verbose = cmd_params_defaults.verbose; params.output_format = cmd_params_defaults.output_format; + params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.reps = cmd_params_defaults.reps; for (int i = 1; i < argc; i++) { @@ -493,18 +515,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - if (argv[i] == std::string("csv")) { - params.output_format = CSV; - } else if (argv[i] == std::string("json")) { - params.output_format = JSON; - } else if (argv[i] == std::string("md")) { - params.output_format = MARKDOWN; - } else if (argv[i] == std::string("sql")) { - params.output_format = SQL; - } else { + invalid_param = !output_format_from_str(argv[i], params.output_format); + } else if (arg == "-oe" || arg == "--output-err") { + if (++i >= argc) { invalid_param = true; break; } + invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); } else if (arg == "-v" || arg == "--verbose") { params.verbose = true; } else { @@ -1278,6 +1295,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text (void) user_data; } +static std::unique_ptr create_printer(output_formats format) { + switch (format) { + case NONE: + return nullptr; + case CSV: + return std::unique_ptr(new csv_printer()); + case JSON: + return std::unique_ptr(new json_printer()); + case MARKDOWN: + return std::unique_ptr(new markdown_printer()); + case SQL: + return std::unique_ptr(new sql_printer()); + } + GGML_ASSERT(false); +} + int main(int argc, char ** argv) { // try to set locale for unicode characters in markdown setlocale(LC_CTYPE, ".UTF-8"); @@ -1304,26 +1337,18 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // initialize printer - std::unique_ptr p; - switch (params.output_format) { - case CSV: - p.reset(new csv_printer()); - break; - case JSON: - p.reset(new json_printer()); - break; - case MARKDOWN: - p.reset(new markdown_printer()); - break; - case SQL: - p.reset(new sql_printer()); - break; - default: - assert(false); - exit(1); + std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p_err = create_printer(params.output_format_stderr); + + if (p) { + p->fout = stdout; + p->print_header(params); + } + + if (p_err) { + p_err->fout = stderr; + p_err->print_header(params); } - p->fout = stdout; - p->print_header(params); std::vector params_instances = get_cmd_params_instances(params); @@ 
-1381,7 +1406,15 @@ int main(int argc, char ** argv) { t.samples_ns.push_back(t_ns); } - p->print_test(t); + if (p) { + p->print_test(t); + fflush(p->fout); + } + + if (p_err) { + p_err->print_test(t); + fflush(p_err->fout); + } llama_print_timings(ctx); @@ -1390,7 +1423,13 @@ int main(int argc, char ** argv) { llama_free_model(lmodel); - p->print_footer(); + if (p) { + p->print_footer(); + } + + if (p_err) { + p_err->print_footer(); + } llama_backend_free(); diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index fd0ee88b2..a45cd3962 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -10,16 +10,18 @@ set -x bench_args="${@:3}" -rm -f llama-bench.sqlite +rm -f llama-bench.sqlite > /dev/null # to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...) -git checkout $1 -make clean && make -j32 $make_opts llama-bench -./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite +git checkout $1 > /dev/null +make clean > /dev/null +make -j$(nproc) $make_opts llama-bench > /dev/null +./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite -git checkout $2 -make clean && make -j32 $make_opts llama-bench -./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite +git checkout $2 > /dev/null +make clean > /dev/null +make -j$(nproc) $make_opts llama-bench > /dev/null +./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite ./scripts/compare-llama-bench.py -b $1 -c $2 From 5ca0944a153b65724d51b2f484139aa25ccb7a8b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 4 Jun 2024 19:43:01 +0300 Subject: [PATCH 09/10] readme : remove obsolete Zig instructions (#7471) --- README.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/README.md b/README.md index 8680460aa..1f2d9b1f2 100644 --- a/README.md +++ b/README.md @@ -364,17 +364,6 @@ In order to build llama.cpp you have four different options. cmake --build build --config Debug ``` -- Using `Zig` (version 0.11 or later): - - Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C, - it's also possible to cross compile for other operating systems and architectures: - - ```bash - zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c - ``` - - The `zig targets` command will give you valid options to use. - - Using `gmake` (FreeBSD): 1. 
Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics) From 0cd6bd3483fa66124b76a8a8ac794d9ee18c70c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 4 Jun 2024 21:23:05 +0300 Subject: [PATCH 10/10] llama : remove beam search (#7736) --- Makefile | 6 +- examples/CMakeLists.txt | 1 - examples/beam-search/CMakeLists.txt | 5 - examples/beam-search/beam-search.cpp | 188 -------------------- llama.cpp | 254 --------------------------- llama.h | 42 +---- 6 files changed, 2 insertions(+), 494 deletions(-) delete mode 100644 examples/beam-search/CMakeLists.txt delete mode 100644 examples/beam-search/beam-search.cpp diff --git a/Makefile b/Makefile index b527f6f35..27eb69871 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests @@ -914,10 +914,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b40ee4ccb..53002f8e1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,7 +15,6 @@ else() add_subdirectory(baby-llama) add_subdirectory(batched) add_subdirectory(batched-bench) - add_subdirectory(beam-search) add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt deleted file mode 100644 index f0e37468b..000000000 --- a/examples/beam-search/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET beam-search) -add_executable(${TARGET} beam-search.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp deleted file mode 100644 index 3d34378a5..000000000 --- a/examples/beam-search/beam-search.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "common.h" -#include "llama.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include -#include -#endif - -// Used for debugging to 
print out beam tokens. -struct ostream_beam_view { - llama_context * ctx; - llama_beam_view beam_view; -}; - -static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) { - os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; - for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { - os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); - } - return os << ')'; -} - -// Put here anything you want back in beam_search_callback(). -struct beam_search_callback_data { - llama_context * ctx; - std::vector response; -}; - -// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. -// For example, eob can be flagged due to maximum token length, stop words, etc. -static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { - return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]); -} - -// Function matching type llama_beam_search_callback_fn_t. -// Custom callback example is called each time the beams lengths increase: -// * Show progress by printing ',' following by number of convergent beam tokens if any. -// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. -// This is also called when the stop condition is met. -// Collect tokens into std::vector response which is pointed to by callback_data. -static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { - auto& callback_data = *static_cast(callback_data_ptr); - // Mark beams as EOS as needed. - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - llama_beam_view& beam_view = beams_state.beam_views[i]; - if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) { - beam_view.eob = true; - } - } - printf(","); // Show progress - if (const size_t n = beams_state.common_prefix_length) { - callback_data.response.resize(callback_data.response.size() + n); - assert(0u < beams_state.n_beams); - const llama_token * tokens = beams_state.beam_views[0].tokens; - std::copy(tokens, tokens + n, callback_data.response.end() - n); - printf("%zu", n); - } - fflush(stdout); -#if 1 // DEBUG: print current beams for this iteration - std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n"; - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - std::cout << "beams["< 3 ) - { - params.prompt = argv[3]; - } - - if ( params.prompt.empty() ) - { - params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n"; - } - - //--------------------------------- - // Init LLM : - //--------------------------------- - - llama_backend_init(); - llama_numa_init(params.numa); - - llama_model * model; - llama_context * ctx; - - std::tie(model, ctx) = llama_init_from_gpt_params( params ); - - if ( model == NULL ) - { - fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); - return 1; - } - - //--------------------------------- - // Tokenize the prompt : - //--------------------------------- - - std::vector tokens_list = llama_tokenize(ctx, params.prompt, true); - - const size_t max_context_size = llama_n_ctx( ctx ); - const size_t max_tokens_list_size = max_context_size - 4 ; - - if (tokens_list.size() > max_tokens_list_size) - { - fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" , - __func__ , tokens_list.size() , max_tokens_list_size ); - return 1; - } - - fprintf( stderr, "\n\n" 
); - - // Print the tokens from the prompt : - - for( auto id : tokens_list ) - { - std::cout << llama_token_to_piece(ctx, id); - } - std::cout << std::flush; - - int n_past = 0; - - if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0))) - { - fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); - return 1; - } - n_past += tokens_list.size(); - - beam_search_callback_data callback_data{ctx, {}}; - size_t const beam_width = static_cast(params.n_beams); - int const n_predict = 256; - llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict); - - std::cout << "\n\n"; - for (llama_token const token_id : callback_data.response) { - std::cout << llama_token_to_piece(ctx,token_id); - } - std::cout << std::endl; - - llama_free( ctx ); - llama_free_model( model ); - - llama_backend_free(); - - return 0; -} diff --git a/llama.cpp b/llama.cpp index a3e944874..92c33f53e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14711,260 +14711,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } -// -// Beam search -// - -struct llama_beam { - std::vector tokens; - float p; // Cumulative beam probability (renormalized relative to all beams) - bool eob; // Initialize end-of-beam to false. Callback sets this to true. - // Sort beams by probability. In case of ties, prefer beams at eob. - bool operator<(const llama_beam & rhs) const { - return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob); - } - // Shift off first n tokens and discard them. - void shift_tokens(const size_t n) { - if (n) { - std::copy(tokens.begin() + n, tokens.end(), tokens.begin()); - tokens.resize(tokens.size() - n); - } - } - llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; } -}; - -// A struct for calculating logit-related info. -struct llama_logit_info { - const float * const logits; - const int n_vocab; - const float max_l; - const float normalizer; - struct sum_exp { - float max_l; - float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } - }; - llama_logit_info(llama_context * ctx) - : logits(llama_get_logits(ctx)) - , n_vocab(llama_n_vocab(llama_get_model(ctx))) - , max_l(*std::max_element(logits, logits + n_vocab)) - , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l})) - { } - llama_token_data get_token_data(const llama_token token_id) const { - constexpr auto p = std::numeric_limits::quiet_NaN(); // never used - return {token_id, logits[token_id], p}; - } - // Return top k token_data by logit. 
-    std::vector<llama_token_data> top_k(size_t k) {
-        std::vector<llama_token_data> min_heap;  // min-heap by logit
-        const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
-        min_heap.reserve(k_min);
-        for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
-            min_heap.push_back(get_token_data(token_id));
-        }
-        auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
-        std::make_heap(min_heap.begin(), min_heap.end(), comp);
-        for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
-            if (min_heap.front().logit < logits[token_id]) {
-                std::pop_heap(min_heap.begin(), min_heap.end(), comp);
-                min_heap.back().id = token_id;
-                min_heap.back().logit = logits[token_id];
-                std::push_heap(min_heap.begin(), min_heap.end(), comp);
-            }
-        }
-        return min_heap;
-    }
-    float probability_from_logit(float logit) const {
-        return normalizer * std::exp(logit - max_l);
-    }
-};
-
-struct llama_beam_search_data {
-    llama_context * ctx;
-    size_t n_beams;
-    int n_past;
-    int n_predict;
-    std::vector<llama_beam> beams;
-    std::vector<llama_beam> next_beams;
-
-    // Re-calculated on each loop iteration
-    size_t common_prefix_length;
-
-    // Used to communicate to/from callback on beams state.
-    std::vector<llama_beam_view> beam_views;
-
-    llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
-      : ctx(ctx)
-      , n_beams(n_beams)
-      , n_past(n_past)
-      , n_predict(n_predict)
-      , beam_views(n_beams) {
-        beams.reserve(n_beams);
-        next_beams.reserve(n_beams);
-    }
-
-    // Collapse beams to a single beam given by index.
-    void collapse_beams(const size_t beam_idx) {
-        if (0u < beam_idx) {
-            std::swap(beams[0], beams[beam_idx]);
-        }
-        beams.resize(1);
-    }
-
-    // Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
-    // The repetitive patterns below reflect the 2 stages of heaps:
-    //  * Gather elements until the vector is full, then call std::make_heap() on it.
-    //  * If the heap is full and a new element is found that should be included, pop the
-    //    least element to the back(), replace it with the new, then push it into the heap.
-    void fill_next_beams_by_top_probabilities(llama_beam & beam) {
-        // Min-heaps use a greater-than comparator.
-        const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
-        if (beam.eob) {
-            // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
-            if (next_beams.size() < n_beams) {
-                next_beams.push_back(std::move(beam));
-                if (next_beams.size() == n_beams) {
-                    std::make_heap(next_beams.begin(), next_beams.end(), comp);
-                }
-            } else if (next_beams.front().p < beam.p) {
-                std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                next_beams.back() = std::move(beam);
-                std::push_heap(next_beams.begin(), next_beams.end(), comp);
-            }
-        } else {
-            // beam is not at end-of-sentence, so branch with next top_k tokens.
-            if (!beam.tokens.empty()) {
-                llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
-            }
-            llama_logit_info logit_info(ctx);
-            std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
-
-            // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
-            // call in loop() will conclusively fill in the kv slot once the beams converge at this position.
-            llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
-            size_t i=0;
-            if (next_beams.size() < n_beams) {
-                for (; next_beams.size() < n_beams ; ++i) {
-                    llama_beam next_beam = beam;
-                    next_beam.tokens.push_back(next_tokens[i].id);
-                    next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
-                    next_beams.push_back(std::move(next_beam));
-                }
-                std::make_heap(next_beams.begin(), next_beams.end(), comp);
-            } else {
-                for (; next_beams.front().p == 0.0f ; ++i) {
-                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = beam;
-                    next_beams.back().tokens.push_back(next_tokens[i].id);
-                    next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
-                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
-                }
-            }
-            for (; i < n_beams ; ++i) {
-                const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
-                if (next_beams.front().p < next_p) {
-                    std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-                    next_beams.back() = beam;
-                    next_beams.back().tokens.push_back(next_tokens[i].id);
-                    next_beams.back().p = next_p;
-                    std::push_heap(next_beams.begin(), next_beams.end(), comp);
-                }
-            }
-        }
-    }
-
-    // Find common_prefix_length based on beams.
-    // Requires beams is not empty.
-    size_t find_common_prefix_length() {
-        size_t common_prefix_length = beams[0].tokens.size();
-        for (size_t i = 1 ; i < beams.size() ; ++i) {
-            common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
-            for (size_t j = 0 ; j < common_prefix_length ; ++j) {
-                if (beams[0].tokens[j] != beams[i].tokens[j]) {
-                    common_prefix_length = j;
-                    break;
-                }
-            }
-        }
-        return common_prefix_length;
-    }
-
-    // Construct beams_state to send back to caller via the callback function.
-    // Side effect: set common_prefix_length = find_common_prefix_length();
-    llama_beams_state get_beams_state(const bool last_call) {
-        for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beam_views[i] = beams[i].view();
-        }
-        common_prefix_length = find_common_prefix_length();
-        return {beam_views.data(), beams.size(), common_prefix_length, last_call};
-    }
-
-    // Loop:
-    //  * while i < n_predict, AND
-    //  * any of the beams have not yet reached end-of-beam (eob), AND
-    //  * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
-    //    (since all other beam probabilities can only decrease)
-    void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
-        beams.push_back({{}, 1.0f, false});  // Start with one empty beam w/ probability = 1.0 and !eob.
-        const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
-        for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
-                       !beams[top_beam_index()].eob ; ++i) {
-            callback(callback_data, get_beams_state(false));  // Sets common_prefix_length
-            update_beams_from_beam_views();   // Update values (p,eob) that callback may have changed.
-            if (common_prefix_length) {
-                llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
-                n_past += common_prefix_length;
-            }
-            // Zero-out next_beam probabilities to place them last in following min-heap.
-            std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
-            for (llama_beam & beam : beams) {
-                beam.shift_tokens(common_prefix_length);
-                fill_next_beams_by_top_probabilities(beam);
-            }
-            // next_beams become the beams of next/final iteration. Swap them to re-use memory.
-            beams.swap(next_beams);
-            renormalize_beam_probabilities(beams);
-        }
-        collapse_beams(top_beam_index());
-        callback(callback_data, get_beams_state(true));
-    }
-
-    // As beams grow, the cumulative probabilities decrease.
-    // Renormalize them to avoid floating point underflow.
-    static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
-        const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
-        const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
-        std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
-    }
-
-    // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
-    size_t top_beam_index() {
-        return std::max_element(beams.begin(), beams.end()) - beams.begin();
-    }
-
-    // Copy (p,eob) for each beam which may have been changed by the callback.
-    void update_beams_from_beam_views() {
-        for (size_t i = 0 ; i < beams.size() ; ++i) {
-            beams[i].p   = beam_views[i].p;
-            beams[i].eob = beam_views[i].eob;
-        }
-    }
-};
-
-void llama_beam_search(llama_context * ctx,
-                       llama_beam_search_callback_fn_t callback, void * callback_data,
-                       size_t n_beams, int n_past, int n_predict) {
-    assert(ctx);
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
-
-    beam_search_data.loop(callback, callback_data);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-}
-
 //
 // quantization
 //
diff --git a/llama.h b/llama.h
index a78ccdaf5..b2a302dad 100644
--- a/llama.h
+++ b/llama.h
@@ -1056,49 +1056,9 @@ extern "C" {
                          llama_token   token);
 
     //
-    // Beam search
+    // Model split
     //
 
-    struct llama_beam_view {
-        const llama_token * tokens;
-
-        size_t n_tokens;
-        float  p;        // Cumulative beam probability (renormalized relative to all beams)
-        bool   eob;      // Callback should set this to true when a beam is at end-of-beam.
-    };
-
-    // Passed to beam_search_callback function.
-    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
-    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
-    // These pointers are valid only during the synchronous callback, so should not be saved.
-    struct llama_beams_state {
-        struct llama_beam_view * beam_views;
-
-        size_t n_beams;               // Number of elements in beam_views[].
-        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
-        bool   last_call;             // True iff this is the last callback invocation.
-    };
-
-    // Type of pointer to the beam_search_callback function.
-    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
-    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
-    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-
-    /// @details Deterministically returns entire sentence constructed by a beam search.
-    /// @param ctx Pointer to the llama_context.
-    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
-    /// @param callback_data A pointer that is simply passed back to callback.
-    /// @param n_beams Number of beams to use.
-    /// @param n_past Number of tokens already evaluated.
-    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-    LLAMA_API void llama_beam_search(
-                   struct llama_context * ctx,
-        llama_beam_search_callback_fn_t   callback,
-                                  void *   callback_data,
-                                  size_t   n_beams,
-                                 int32_t   n_past,
-                                 int32_t   n_predict);
-
     /// @details Build a split GGUF final path for this chunk.
     ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
     //  Returns the split_path length.
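For readers skimming the removal above, the core trick inside the deleted llama_logit_info::top_k and fill_next_beams_by_top_probabilities is the same: keep a min-heap of the k best candidates so only the currently-smallest kept entry ever has to be compared against a new one, instead of sorting the whole vocabulary. The following standalone sketch mirrors that pattern outside of llama.cpp; the token_data struct and the toy logits are hypothetical and not part of this patch or the llama.cpp API.

#include <algorithm>
#include <cstdio>
#include <vector>

struct token_data { int id; float logit; };

// Keep the k largest logits; the heap front always holds the smallest kept value.
static std::vector<token_data> top_k(const std::vector<float> & logits, size_t k) {
    const size_t k_min = std::min(k, logits.size());
    std::vector<token_data> heap;
    heap.reserve(k_min);
    for (size_t id = 0; id < k_min; ++id) {
        heap.push_back({(int) id, logits[id]});
    }
    // a greater-than comparator turns std::make_heap into a min-heap by logit
    auto comp = [](const token_data & a, const token_data & b) { return a.logit > b.logit; };
    std::make_heap(heap.begin(), heap.end(), comp);
    for (size_t id = k_min; id < logits.size(); ++id) {
        if (heap.front().logit < logits[id]) {
            std::pop_heap(heap.begin(), heap.end(), comp);   // move smallest kept entry to back()
            heap.back() = {(int) id, logits[id]};            // replace it with the new candidate
            std::push_heap(heap.begin(), heap.end(), comp);  // restore the heap property
        }
    }
    return heap;
}

int main() {
    const std::vector<float> logits = {0.1f, 2.5f, -1.0f, 3.0f, 0.7f};
    for (const token_data & td : top_k(logits, 3)) {
        printf("id=%d logit=%.2f\n", td.id, td.logit);  // the three largest logits, in heap order
    }
    return 0;
}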
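The removed renormalize_beam_probabilities addresses a practical numerical issue: cumulative beam probabilities shrink with every generated token and would eventually underflow to zero, so they are rescaled by their sum after each step, which preserves the ordering while keeping the values in a usable range. A minimal standalone sketch of that rescaling follows; the beam struct and the sample values are hypothetical, not llama.cpp API.

#include <cstdio>
#include <numeric>
#include <vector>

struct beam { float p; };  // cumulative (relative) probability of one beam

// Divide every beam probability by their sum; ordering is preserved,
// but the values stay well away from floating point underflow.
static void renormalize(std::vector<beam> & beams) {
    const float sum = std::accumulate(beams.begin(), beams.end(), 0.0f,
                                      [](float s, const beam & b) { return s + b.p; });
    const float inv_sum = 1.0f / sum;
    for (beam & b : beams) {
        b.p *= inv_sum;
    }
}

int main() {
    std::vector<beam> beams = {{1e-30f}, {3e-30f}, {6e-30f}};
    renormalize(beams);
    for (const beam & b : beams) {
        printf("%.2f\n", b.p);  // prints 0.10, 0.30, 0.60
    }
    return 0;
}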