diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake index 46fba6514..802379680 100644 --- a/cmake/arm64-windows-llvm.cmake +++ b/cmake/arm64-windows-llvm.cmake @@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ ) set( CMAKE_C_COMPILER_TARGET ${target} ) set( CMAKE_CXX_COMPILER_TARGET ${target} ) -set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" ) +set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b40ee4ccb..53002f8e1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,7 +15,6 @@ else() add_subdirectory(baby-llama) add_subdirectory(batched) add_subdirectory(batched-bench) - add_subdirectory(beam-search) add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt deleted file mode 100644 index f0e37468b..000000000 --- a/examples/beam-search/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET beam-search) -add_executable(${TARGET} beam-search.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp deleted file mode 100644 index 3d34378a5..000000000 --- a/examples/beam-search/beam-search.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "common.h" -#include "llama.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include -#include -#endif - -// Used for debugging to print out beam tokens. -struct ostream_beam_view { - llama_context * ctx; - llama_beam_view beam_view; -}; - -static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) { - os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; - for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { - os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); - } - return os << ')'; -} - -// Put here anything you want back in beam_search_callback(). -struct beam_search_callback_data { - llama_context * ctx; - std::vector response; -}; - -// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. -// For example, eob can be flagged due to maximum token length, stop words, etc. -static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { - return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]); -} - -// Function matching type llama_beam_search_callback_fn_t. -// Custom callback example is called each time the beams lengths increase: -// * Show progress by printing ',' following by number of convergent beam tokens if any. -// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. -// This is also called when the stop condition is met. -// Collect tokens into std::vector response which is pointed to by callback_data. -static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { - auto& callback_data = *static_cast(callback_data_ptr); - // Mark beams as EOS as needed. - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - llama_beam_view& beam_view = beams_state.beam_views[i]; - if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) { - beam_view.eob = true; - } - } - printf(","); // Show progress - if (const size_t n = beams_state.common_prefix_length) { - callback_data.response.resize(callback_data.response.size() + n); - assert(0u < beams_state.n_beams); - const llama_token * tokens = beams_state.beam_views[0].tokens; - std::copy(tokens, tokens + n, callback_data.response.end() - n); - printf("%zu", n); - } - fflush(stdout); -#if 1 // DEBUG: print current beams for this iteration - std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n"; - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - std::cout << "beams["< 3 ) - { - params.prompt = argv[3]; - } - - if ( params.prompt.empty() ) - { - params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n"; - } - - //--------------------------------- - // Init LLM : - //--------------------------------- - - llama_backend_init(); - llama_numa_init(params.numa); - - llama_model * model; - llama_context * ctx; - - std::tie(model, ctx) = llama_init_from_gpt_params( params ); - - if ( model == NULL ) - { - fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); - return 1; - } - - //--------------------------------- - // Tokenize the prompt : - //--------------------------------- - - std::vector tokens_list = llama_tokenize(ctx, params.prompt, true); - - const size_t max_context_size = llama_n_ctx( ctx ); - const size_t max_tokens_list_size = max_context_size - 4 ; - - if (tokens_list.size() > max_tokens_list_size) - { - fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" , - __func__ , tokens_list.size() , max_tokens_list_size ); - return 1; - } - - fprintf( stderr, "\n\n" ); - - // Print the tokens from the prompt : - - for( auto id : tokens_list ) - { - std::cout << llama_token_to_piece(ctx, id); - } - std::cout << std::flush; - - int n_past = 0; - - if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0))) - { - fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); - return 1; - } - n_past += tokens_list.size(); - - beam_search_callback_data callback_data{ctx, {}}; - size_t const beam_width = static_cast(params.n_beams); - int const n_predict = 256; - llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict); - - std::cout << "\n\n"; - for (llama_token const token_id : callback_data.response) { - std::cout << llama_token_to_piece(ctx,token_id); - } - std::cout << std::endl; - - llama_free( ctx ); - llama_free_model( model ); - - llama_backend_free(); - - return 0; -} diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index b282b0b41..05a21267a 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -141,10 +141,11 @@ static std::string get_gpu_info() { } // command line params -enum output_formats {CSV, JSON, MARKDOWN, SQL}; +enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL}; static const char * output_format_str(output_formats format) { switch (format) { + case NONE: return "none"; case CSV: return "csv"; case JSON: return "json"; case MARKDOWN: return "md"; @@ -153,6 +154,23 @@ static const char * output_format_str(output_formats format) { } } +static bool output_format_from_str(const std::string & s, output_formats & format) { + if (s == "none") { + format = NONE; + } else if (s == "csv") { + format = CSV; + } else if (s == "json") { + format = JSON; + } else if (s == "md") { + format = MARKDOWN; + } else if (s == "sql") { + format = SQL; + } else { + return false; + } + return true; +} + static const char * split_mode_str(llama_split_mode mode) { switch (mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; @@ -191,31 +209,33 @@ struct cmd_params { int reps; bool verbose; output_formats output_format; + output_formats output_format_stderr; }; static const cmd_params cmd_params_defaults = { - /* model */ {"models/7B/ggml-model-q4_0.gguf"}, - /* n_prompt */ {512}, - /* n_gen */ {128}, - /* n_pg */ {}, - /* n_batch */ {2048}, - /* n_ubatch */ {512}, - /* type_k */ {GGML_TYPE_F16}, - /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {cpu_get_num_math()}, - /* n_gpu_layers */ {99}, - /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, - /* main_gpu */ {0}, - /* no_kv_offload */ {false}, - /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, - /* use_mmap */ {true}, - /* embeddings */ {false}, - /* numa */ GGML_NUMA_STRATEGY_DISABLED, - /* reps */ 5, - /* verbose */ false, - /* output_format */ MARKDOWN + /* model */ {"models/7B/ggml-model-q4_0.gguf"}, + /* n_prompt */ {512}, + /* n_gen */ {128}, + /* n_pg */ {}, + /* n_batch */ {2048}, + /* n_ubatch */ {512}, + /* type_k */ {GGML_TYPE_F16}, + /* type_v */ {GGML_TYPE_F16}, + /* n_threads */ {cpu_get_num_math()}, + /* n_gpu_layers */ {99}, + /* rpc_servers */ {""}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, + /* main_gpu */ {0}, + /* no_kv_offload */ {false}, + /* flash_attn */ {false}, + /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, + /* use_mmap */ {true}, + /* embeddings */ {false}, + /* numa */ GGML_NUMA_STRATEGY_DISABLED, + /* reps */ 5, + /* verbose */ false, + /* output_format */ MARKDOWN, + /* output_format_stderr */ NONE, }; static void print_usage(int /* argc */, char ** argv) { @@ -244,6 +264,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); @@ -285,6 +306,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.verbose = cmd_params_defaults.verbose; params.output_format = cmd_params_defaults.output_format; + params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.reps = cmd_params_defaults.reps; for (int i = 1; i < argc; i++) { @@ -494,18 +516,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - if (argv[i] == std::string("csv")) { - params.output_format = CSV; - } else if (argv[i] == std::string("json")) { - params.output_format = JSON; - } else if (argv[i] == std::string("md")) { - params.output_format = MARKDOWN; - } else if (argv[i] == std::string("sql")) { - params.output_format = SQL; - } else { + invalid_param = !output_format_from_str(argv[i], params.output_format); + } else if (arg == "-oe" || arg == "--output-err") { + if (++i >= argc) { invalid_param = true; break; } + invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); } else if (arg == "-v" || arg == "--verbose") { params.verbose = true; } else { @@ -1279,6 +1296,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text (void) user_data; } +static std::unique_ptr create_printer(output_formats format) { + switch (format) { + case NONE: + return nullptr; + case CSV: + return std::unique_ptr(new csv_printer()); + case JSON: + return std::unique_ptr(new json_printer()); + case MARKDOWN: + return std::unique_ptr(new markdown_printer()); + case SQL: + return std::unique_ptr(new sql_printer()); + } + GGML_ASSERT(false); +} + int main(int argc, char ** argv) { // try to set locale for unicode characters in markdown setlocale(LC_CTYPE, ".UTF-8"); @@ -1305,26 +1338,18 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // initialize printer - std::unique_ptr p; - switch (params.output_format) { - case CSV: - p.reset(new csv_printer()); - break; - case JSON: - p.reset(new json_printer()); - break; - case MARKDOWN: - p.reset(new markdown_printer()); - break; - case SQL: - p.reset(new sql_printer()); - break; - default: - assert(false); - exit(1); + std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p_err = create_printer(params.output_format_stderr); + + if (p) { + p->fout = stdout; + p->print_header(params); + } + + if (p_err) { + p_err->fout = stderr; + p_err->print_header(params); } - p->fout = stdout; - p->print_header(params); std::vector params_instances = get_cmd_params_instances(params); @@ -1382,7 +1407,15 @@ int main(int argc, char ** argv) { t.samples_ns.push_back(t_ns); } - p->print_test(t); + if (p) { + p->print_test(t); + fflush(p->fout); + } + + if (p_err) { + p_err->print_test(t); + fflush(p_err->fout); + } llama_print_timings(ctx); @@ -1391,7 +1424,13 @@ int main(int argc, char ** argv) { llama_free_model(lmodel); - p->print_footer(); + if (p) { + p->print_footer(); + } + + if (p_err) { + p_err->print_footer(); + } llama_backend_free(); diff --git a/ggml-alloc.c b/ggml-alloc.c index 0146946eb..73a3c1575 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * // this tensor was allocated without ggml-backend return; } - ggml_backend_view_init(galloc->buffers[buffer_id], tensor); + ggml_backend_view_init(tensor); } } else { if (tensor->data == NULL) { @@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx, if (t->view_src == NULL) { ggml_tallocr_alloc(&tallocr, t); } else if (t->buffer == NULL) { - ggml_backend_view_init(buffer, t); + ggml_backend_view_init(t); } } else { if (t->view_src != NULL && t->buffer == NULL) { // view of a pre-allocated tensor - ggml_backend_view_init(buffer, t); + ggml_backend_view_init(t); } } } diff --git a/ggml-backend.c b/ggml-backend.c index 9e35ce98d..05737ed69 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer; if (dst_buf->iface.cpy_tensor) { - return src->buffer->iface.cpy_tensor(dst_buf, src, dst); + return dst_buf->iface.cpy_tensor(dst_buf, src, dst); } return false; } @@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, // utils -void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { +void ggml_backend_view_init(struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); - tensor->buffer = buffer; + tensor->buffer = tensor->view_src->buffer; tensor->data = (char *)tensor->view_src->data + tensor->view_offs; - ggml_backend_buffer_init_tensor(buffer, tensor); + ggml_backend_buffer_init_tensor(tensor->buffer, tensor); } void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { @@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src); - ggml_backend_view_init(dst->view_src->buffer, dst); + ggml_backend_view_init(dst); } else { ggml_backend_tensor_copy(src, dst); diff --git a/ggml-backend.h b/ggml-backend.h index 744b6a774..c582b0685 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -225,7 +225,7 @@ extern "C" { // Tensor initialization GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); - GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor); #ifdef __cplusplus diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index 49a20df4b..679ce4f28 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer if (remote_ptr != 0) { ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_rpc_buffer_interface, - new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"}, + new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"}, remote_size); return buffer; } else { @@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context { /* .endpoint = */ endpoint, - /* .name = */ "RPC", + /* .name = */ "RPC[" + std::string(endpoint) + "]", }; ggml_backend_t backend = new ggml_backend { diff --git a/ggml.c b/ggml.c index f23467252..67625269f 100644 --- a/ggml.c +++ b/ggml.c @@ -5,6 +5,7 @@ #include "ggml-quants.h" #include "ggml.h" + #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) @@ -28,6 +29,10 @@ #include #endif +#ifdef GGML_USE_OPENMP +#include +#endif + #ifdef GGML_USE_METAL #include #endif @@ -1759,7 +1764,7 @@ struct ggml_compute_state_shared { int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - const int n_threads; + int n_threads; // synchronization primitives atomic_int n_active; // num active threads @@ -2270,6 +2275,11 @@ inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } +#if __FINITE_MATH_ONLY__ +#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix" +#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461" +#endif + #if defined(__ARM_NEON) && defined(__aarch64__) // adapted from arm limited optimized routine @@ -19686,6 +19696,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa return cplan; } +static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) { + enum ggml_status compute_status = GGML_STATUS_SUCCESS; + +#ifdef GGML_USE_OPENMP + if (n_threads > 1) { + #pragma omp parallel num_threads(n_threads) + { + #pragma omp single + { + // update the number of threads from the actual number of threads that we got from OpenMP + n_threads = omp_get_num_threads(); + workers[0].shared->n_threads = n_threads; + workers[0].shared->n_active = n_threads; + } + ggml_graph_compute_thread(&workers[omp_get_thread_num()]); + } + } else { + ggml_graph_compute_thread(&workers[0]); + } +#else + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + UNUSED(rc); + } + } + + // this is a work thread too + ggml_graph_compute_thread(&workers[0]); + + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; j++) { + const int rc = ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); + UNUSED(rc); + } + } +#endif + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); + + for (int j = 0; j < n_threads; j++) { + if (workers[j].ec != GGML_STATUS_SUCCESS) { + compute_status = workers[j].ec; + break; + } + } + return compute_status; +} + enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { { GGML_ASSERT(cplan); @@ -19696,7 +19759,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } } - const int n_threads = cplan->n_threads; + int n_threads = cplan->n_threads; + +#if defined(GGML_USE_OPENMP) + n_threads = MIN(n_threads, omp_get_max_threads()); +#endif struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, @@ -19712,47 +19779,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl /*.current_chunk; =*/ 0, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); - - // create thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; ++j) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .ith = j, - .shared = &state_shared, - .ec = GGML_STATUS_SUCCESS, - }; - - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - } - - workers[0].ith = 0; - workers[0].shared = &state_shared; - workers[0].ec = GGML_STATUS_SUCCESS; - const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); - // this is a work thread too - ggml_graph_compute_thread(&workers[0]); - enum ggml_status compute_status = workers[0].ec; - - // don't leave affinity set on the main thread - clear_numa_thread_affinity(); - - // join or kill thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - if (workers[j].ec != GGML_STATUS_SUCCESS) - compute_status = workers[j].ec; - } + for (int j = 0; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + .ec = GGML_STATUS_SUCCESS, + }; } + enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads); + // performance stats (graph) { int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; diff --git a/llama.cpp b/llama.cpp index eb21989e5..b0a68ae42 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2175,12 +2175,12 @@ struct llama_control_vector { struct llama_vocab { using id = int32_t; using token = std::string; - using ttype = llama_token_type; + using tattr = llama_token_attr; struct token_data { token text; float score; - ttype type; + tattr attr; }; enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; @@ -2401,13 +2401,34 @@ struct llama_context { struct llama_control_vector cvec; }; +static size_t llama_get_device_count(const llama_model & model) { + size_t count = 1; +#if defined(GGML_USE_CUDA) + count = ggml_backend_cuda_get_device_count(); +#elif defined(GGML_USE_SYCL) + count = ggml_backend_sycl_get_device_count(); +#elif defined(GGML_USE_VULKAN) + count = ggml_backend_vk_get_device_count(); +#endif +#if defined(GGML_USE_RPC) + count += model.rpc_servers.size(); +#endif + return count; + GGML_UNUSED(model); +} + static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_RPC - std::string endpoint = model.rpc_servers[gpu]; - buft = ggml_backend_rpc_buffer_type(endpoint.c_str()); -#elif defined(GGML_USE_METAL) +#if defined(GGML_USE_RPC) + int dev_count = (int)llama_get_device_count(model); + int rpc_count = (int)model.rpc_servers.size(); + if (gpu >= dev_count - rpc_count) { + const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str(); + return ggml_backend_rpc_buffer_type(endpoint); + } +#endif +#if defined(GGML_USE_METAL) buft = ggml_backend_metal_buffer_type(); #elif defined(GGML_USE_CUDA) buft = ggml_backend_cuda_buffer_type(gpu); @@ -2455,29 +2476,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo GGML_UNUSED(tensor_split); } -static size_t llama_get_device_count(const llama_model & model) { -#if defined(GGML_USE_RPC) - return model.rpc_servers.size(); -#elif defined(GGML_USE_CUDA) - return ggml_backend_cuda_get_device_count(); -#elif defined(GGML_USE_SYCL) - return ggml_backend_sycl_get_device_count(); -#elif defined(GGML_USE_VULKAN) - return ggml_backend_vk_get_device_count(); -#else - return 1; -#endif - GGML_UNUSED(model); -} - static size_t llama_get_device_memory(const llama_model & model, int device) { #if defined(GGML_USE_RPC) - size_t total; - size_t free; - std::string endpoint = model.rpc_servers[device]; - ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total); - return free; -#elif defined(GGML_USE_CUDA) + int dev_count = (int)llama_get_device_count(model); + int rpc_count = (int)model.rpc_servers.size(); + if (device >= dev_count - rpc_count) { + size_t total; + size_t free; + const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str(); + ggml_backend_rpc_get_device_memory(endpoint, &free, &total); + return free; + } +#endif +#if defined(GGML_USE_CUDA) size_t total; size_t free; ggml_backend_cuda_get_device_memory(device, &free, &total); @@ -4802,7 +4813,20 @@ static void llm_load_vocab( auto & token_data = vocab.id_to_token[i]; token_data.text = std::move(word); token_data.score = scores ? scores[i] : 0.0f; - token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL; + token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; + + if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file + switch(toktypes[i]) { + case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break; + case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break; + case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break; + case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break; + case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break; + case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break; + case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; + default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break; + } + } } GGML_ASSERT_CONTINUE(vocab.id_to_token.size() == vocab.token_to_id.size()); @@ -4893,7 +4917,7 @@ static void llm_load_vocab( // build special tokens cache { for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) { - if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) { + if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) { vocab.cache_special_tokens.push_back(id); } } @@ -4923,6 +4947,59 @@ static void llm_load_vocab( LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0); } + + // Handle per token attributes + //NOTE: Each model customizes per token attributes. + //NOTE: Per token attributes are missing from the GGUF file. + //TODO: Extract attributes from GGUF file. + { + auto _contains_any = [] (const std::string &str, const std::vector &substrs) -> bool { + for (auto substr : substrs) { + if (str.find(substr) < std::string::npos) { + return true; + } + } + return false; + }; + + auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) { + uint32_t current = vocab.id_to_token.at(id).attr; + current = value ? (current | attr) : (current & ~attr); + vocab.id_to_token[id].attr = (llama_token_attr) current; + }; + + auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) { + _set_tokenid_attr(vocab.token_to_id.at(token), attr, value); + }; + + std::string model_name; + std::string tokenizer_pre; + + ml.get_key(LLM_KV_GENERAL_NAME, model_name, false); + ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); + + // model name to lowercase + std::transform(model_name.begin(), model_name.end(), model_name.begin(), + [] (const std::string::value_type x) { + return std::tolower(x); + } + ); + + // set attributes by model/tokenizer name + if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) { + _set_token_attr("", LLAMA_TOKEN_ATTR_LSTRIP, true); + } else if (_contains_any(model_name, {"phi-3", "phi3"})) { + for (auto id : vocab.cache_special_tokens) { + _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true); + } + for (auto token : {""}) { + _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true); + } + for (auto token : {"", "", "<|endoftext|>"}) { + _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false); + } + } + } } static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { @@ -12679,27 +12756,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) { static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL; } static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN; } static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL; } static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE; } static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE); - return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED; + return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED; } static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { @@ -13538,7 +13615,8 @@ struct fragment_buffer_variant { static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list & buffer) { // for each special token for (const llama_vocab::id special_id : vocab.cache_special_tokens) { - const auto & special_token = vocab.id_to_token[special_id].text; + const auto & data = vocab.id_to_token[special_id]; + const auto & special_token = data.text; // for each text fragment std::forward_list::iterator it = buffer.begin(); @@ -13575,13 +13653,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< if (match > raw_text_base_offset) { // left const int64_t left_reminder_offset = raw_text_base_offset + 0; - const int64_t left_reminder_length = match - raw_text_base_offset; - buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length); + int64_t left_reminder_length = match - raw_text_base_offset; + + if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) { + while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) { + left_reminder_length--; + } + } + + if (left_reminder_length > 0) { + buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length); + it++; + } #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str()); #endif - it++; } // special token @@ -13590,16 +13677,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< // right if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) { - const int64_t right_reminder_offset = match + special_token.length(); - const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length()); - buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length); + int64_t right_reminder_offset = match + special_token.length(); + int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length()); + + if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) { + while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) { + right_reminder_offset++; + right_reminder_length--; + } + } + + if (right_reminder_length > 0) { + buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length); + it++; + } #ifdef PRETOKENIZERDEBUG LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str()); #endif - it++; - if (source == 0) { buffer.erase_after(buffer.before_begin()); } else { @@ -13645,9 +13741,7 @@ static std::vector llama_tokenize_internal(const llama_vocab & // tokenizer.encode('', add_special_tokens=True) returns [1] // tokenizer.encode('', add_special_tokens=False) returns [] - static const bool rtrim = true; //TODO: as param bool is_prev_special = false; - bool special_token_rtrim = false; if (add_special && vocab.special_add_bos != 0) { GGML_ASSERT(vocab.special_bos_id != -1); @@ -13657,25 +13751,8 @@ static std::vector llama_tokenize_internal(const llama_vocab & for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { - // without adding this leading whitespace, we do not get the same results as the original tokenizer - - // TODO: It's likely possible to get rid of this string copy entirely - // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer - // and passing 'add space prefix' as bool argument - // auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); - if (special_token_rtrim) { - size_t num_whitespaces = 0; - while (isspace(raw_text[num_whitespaces])) { - num_whitespaces++; - } - if (num_whitespaces == raw_text.size()) { - continue; // skip if all whitespaces - } - raw_text = raw_text.substr(num_whitespaces); - } - if (vocab.add_space_prefix) { if (!output.size() || is_prev_special) { // prefix with space if first token raw_text = " " + raw_text; @@ -13691,11 +13768,6 @@ static std::vector llama_tokenize_internal(const llama_vocab & } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) output.push_back(fragment.token); is_prev_special = true; - // phi-3 special tokens without rtrim, works fine for llama-spm too - special_token_rtrim = rtrim - && fragment.token != vocab.special_bos_id - && fragment.token != vocab.special_unk_id - && fragment.token != vocab.special_eos_id; } } @@ -14954,260 +15026,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } -// -// Beam search -// - -struct llama_beam { - std::vector tokens; - float p; // Cumulative beam probability (renormalized relative to all beams) - bool eob; // Initialize end-of-beam to false. Callback sets this to true. - // Sort beams by probability. In case of ties, prefer beams at eob. - bool operator<(const llama_beam & rhs) const { - return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob); - } - // Shift off first n tokens and discard them. - void shift_tokens(const size_t n) { - if (n) { - std::copy(tokens.begin() + n, tokens.end(), tokens.begin()); - tokens.resize(tokens.size() - n); - } - } - llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; } -}; - -// A struct for calculating logit-related info. -struct llama_logit_info { - const float * const logits; - const int n_vocab; - const float max_l; - const float normalizer; - struct sum_exp { - float max_l; - float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } - }; - llama_logit_info(llama_context * ctx) - : logits(llama_get_logits(ctx)) - , n_vocab(llama_n_vocab(llama_get_model(ctx))) - , max_l(*std::max_element(logits, logits + n_vocab)) - , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l})) - { } - llama_token_data get_token_data(const llama_token token_id) const { - constexpr auto p = std::numeric_limits::quiet_NaN(); // never used - return {token_id, logits[token_id], p}; - } - // Return top k token_data by logit. - std::vector top_k(size_t k) { - std::vector min_heap; // min-heap by logit - const llama_token k_min = std::min(static_cast(k), n_vocab); - min_heap.reserve(k_min); - for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) { - min_heap.push_back(get_token_data(token_id)); - } - auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }; - std::make_heap(min_heap.begin(), min_heap.end(), comp); - for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) { - if (min_heap.front().logit < logits[token_id]) { - std::pop_heap(min_heap.begin(), min_heap.end(), comp); - min_heap.back().id = token_id; - min_heap.back().logit = logits[token_id]; - std::push_heap(min_heap.begin(), min_heap.end(), comp); - } - } - return min_heap; - } - float probability_from_logit(float logit) const { - return normalizer * std::exp(logit - max_l); - } -}; - -struct llama_beam_search_data { - llama_context * ctx; - size_t n_beams; - int n_past; - int n_predict; - std::vector beams; - std::vector next_beams; - - // Re-calculated on each loop iteration - size_t common_prefix_length; - - // Used to communicate to/from callback on beams state. - std::vector beam_views; - - llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict) - : ctx(ctx) - , n_beams(n_beams) - , n_past(n_past) - , n_predict(n_predict) - , beam_views(n_beams) { - beams.reserve(n_beams); - next_beams.reserve(n_beams); - } - - // Collapse beams to a single beam given by index. - void collapse_beams(const size_t beam_idx) { - if (0u < beam_idx) { - std::swap(beams[0], beams[beam_idx]); - } - beams.resize(1); - } - - // Min-heaps are used to efficiently collect the top-k elements (k=n_beams). - // The repetitive patterns below reflect the 2 stages of heaps: - // * Gather elements until the vector is full, then call std::make_heap() on it. - // * If the heap is full and a new element is found that should be included, pop the - // least element to the back(), replace it with the new, then push it into the heap. - void fill_next_beams_by_top_probabilities(llama_beam & beam) { - // Min-heaps use a greater-than comparator. - const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; }; - if (beam.eob) { - // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough. - if (next_beams.size() < n_beams) { - next_beams.push_back(std::move(beam)); - if (next_beams.size() == n_beams) { - std::make_heap(next_beams.begin(), next_beams.end(), comp); - } - } else if (next_beams.front().p < beam.p) { - std::pop_heap(next_beams.begin(), next_beams.end(), comp); - next_beams.back() = std::move(beam); - std::push_heap(next_beams.begin(), next_beams.end(), comp); - } - } else { - // beam is not at end-of-sentence, so branch with next top_k tokens. - if (!beam.tokens.empty()) { - llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0)); - } - llama_logit_info logit_info(ctx); - std::vector next_tokens = logit_info.top_k(n_beams); - - // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode() - // call in loop() will conclusively fill in the kv slot once the beams converge at this position. - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - - size_t i=0; - if (next_beams.size() < n_beams) { - for (; next_beams.size() < n_beams ; ++i) { - llama_beam next_beam = beam; - next_beam.tokens.push_back(next_tokens[i].id); - next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit); - next_beams.push_back(std::move(next_beam)); - } - std::make_heap(next_beams.begin(), next_beams.end(), comp); - } else { - for (; next_beams.front().p == 0.0f ; ++i) { - std::pop_heap(next_beams.begin(), next_beams.end(), comp); - next_beams.back() = beam; - next_beams.back().tokens.push_back(next_tokens[i].id); - next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit); - std::push_heap(next_beams.begin(), next_beams.end(), comp); - } - } - for (; i < n_beams ; ++i) { - const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit); - if (next_beams.front().p < next_p) { - std::pop_heap(next_beams.begin(), next_beams.end(), comp); - next_beams.back() = beam; - next_beams.back().tokens.push_back(next_tokens[i].id); - next_beams.back().p = next_p; - std::push_heap(next_beams.begin(), next_beams.end(), comp); - } - } - } - } - - // Find common_prefix_length based on beams. - // Requires beams is not empty. - size_t find_common_prefix_length() { - size_t common_prefix_length = beams[0].tokens.size(); - for (size_t i = 1 ; i < beams.size() ; ++i) { - common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size()); - for (size_t j = 0 ; j < common_prefix_length ; ++j) { - if (beams[0].tokens[j] != beams[i].tokens[j]) { - common_prefix_length = j; - break; - } - } - } - return common_prefix_length; - } - - // Construct beams_state to send back to caller via the callback function. - // Side effect: set common_prefix_length = find_common_prefix_length(); - llama_beams_state get_beams_state(const bool last_call) { - for (size_t i = 0 ; i < beams.size() ; ++i) { - beam_views[i] = beams[i].view(); - } - common_prefix_length = find_common_prefix_length(); - return {beam_views.data(), beams.size(), common_prefix_length, last_call}; - } - - // Loop: - // * while i < n_predict, AND - // * any of the beams have not yet reached end-of-beam (eob), AND - // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence - // (since all other beam probabilities can only decrease) - void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) { - beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob. - const auto not_eob = [](const llama_beam & beam) { return !beam.eob; }; - for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) && - !beams[top_beam_index()].eob ; ++i) { - callback(callback_data, get_beams_state(false)); // Sets common_prefix_length - update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed. - if (common_prefix_length) { - llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0)); - n_past += common_prefix_length; - } - // Zero-out next_beam probabilities to place them last in following min-heap. - std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; }); - for (llama_beam & beam : beams) { - beam.shift_tokens(common_prefix_length); - fill_next_beams_by_top_probabilities(beam); - } - // next_beams become the beams of next/final iteration. Swap them to re-use memory. - beams.swap(next_beams); - renormalize_beam_probabilities(beams); - } - collapse_beams(top_beam_index()); - callback(callback_data, get_beams_state(true)); - } - - // As beams grow, the cumulative probabilities decrease. - // Renormalize them to avoid floating point underflow. - static void renormalize_beam_probabilities(std::vector & beams) { - const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; }; - const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p); - std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; }); - } - - // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering. - size_t top_beam_index() { - return std::max_element(beams.begin(), beams.end()) - beams.begin(); - } - - // Copy (p,eob) for each beam which may have been changed by the callback. - void update_beams_from_beam_views() { - for (size_t i = 0 ; i < beams.size() ; ++i) { - beams[i].p = beam_views[i].p; - beams[i].eob = beam_views[i].eob; - } - } -}; - -void llama_beam_search(llama_context * ctx, - llama_beam_search_callback_fn_t callback, void * callback_data, - size_t n_beams, int n_past, int n_predict) { - assert(ctx); - const int64_t t_start_sample_us = ggml_time_us(); - - llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict); - - beam_search_data.loop(callback, callback_data); - - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - ctx->n_sample++; -} - // // quantization // @@ -16463,7 +16281,7 @@ struct llama_model * llama_load_model_from_file( return true; }; } - if (params.rpc_servers != nullptr) { + if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') { // split the servers set them into model->rpc_servers std::string servers(params.rpc_servers); size_t pos = 0; @@ -16626,17 +16444,7 @@ struct llama_context * llama_new_context_with_model( if (!hparams.vocab_only) { // initialize backends -#if defined(GGML_USE_RPC) - for (auto & server : model->rpc_servers) { - ggml_backend_t backend = ggml_backend_rpc_init(server.c_str()); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str()); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } -#elif defined(GGML_USE_METAL) +#if defined(GGML_USE_METAL) if (model->n_gpu_layers > 0) { ctx->backend_metal = ggml_backend_metal_init(); if (ctx->backend_metal == nullptr) { @@ -16728,6 +16536,19 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } +#endif +#if defined(GGML_USE_RPC) + if (model->n_gpu_layers > 0) { + for (const auto & endpoint : model->rpc_servers) { + ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str()); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str()); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } + } #endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { @@ -18521,9 +18342,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token) return model->vocab.id_to_token[token].score; } -llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { +llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) { GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE); - return model->vocab.id_to_token[token].type; + return model->vocab.id_to_token[token].attr; } bool llama_token_is_eog(const struct llama_model * model, llama_token token) { diff --git a/llama.h b/llama.h index f78e02e3b..011379a5c 100644 --- a/llama.h +++ b/llama.h @@ -97,7 +97,7 @@ extern "C" { LLAMA_ROPE_TYPE_GLM = 4, }; - enum llama_token_type { + enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_NORMAL = 1, LLAMA_TOKEN_TYPE_UNKNOWN = 2, @@ -107,6 +107,20 @@ extern "C" { LLAMA_TOKEN_TYPE_BYTE = 6, }; + enum llama_token_attr { + LLAMA_TOKEN_ATTR_UNDEFINED = 0, + LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 1, + LLAMA_TOKEN_ATTR_UNUSED = 1 << 2, + LLAMA_TOKEN_ATTR_NORMAL = 1 << 3, + LLAMA_TOKEN_ATTR_CONTROL = 1 << 4, // SPECIAL? + LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 5, + LLAMA_TOKEN_ATTR_BYTE = 1 << 6, + LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 7, + LLAMA_TOKEN_ATTR_LSTRIP = 1 << 8, + LLAMA_TOKEN_ATTR_RSTRIP = 1 << 9, + LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 10, + }; + // model file types enum llama_ftype { LLAMA_FTYPE_ALL_F32 = 0, @@ -823,7 +837,7 @@ extern "C" { LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); + LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token); // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); @@ -1046,49 +1060,9 @@ extern "C" { llama_token token); // - // Beam search + // Model split // - struct llama_beam_view { - const llama_token * tokens; - - size_t n_tokens; - float p; // Cumulative beam probability (renormalized relative to all beams) - bool eob; // Callback should set this to true when a beam is at end-of-beam. - }; - - // Passed to beam_search_callback function. - // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams - // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks. - // These pointers are valid only during the synchronous callback, so should not be saved. - struct llama_beams_state { - struct llama_beam_view * beam_views; - - size_t n_beams; // Number of elements in beam_views[]. - size_t common_prefix_length; // Current max length of prefix tokens shared by all beams. - bool last_call; // True iff this is the last callback invocation. - }; - - // Type of pointer to the beam_search_callback function. - // void* callback_data is any custom data passed to llama_beam_search, that is subsequently - // passed back to beam_search_callback. This avoids having to use global variables in the callback. - typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state); - - /// @details Deterministically returns entire sentence constructed by a beam search. - /// @param ctx Pointer to the llama_context. - /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state. - /// @param callback_data A pointer that is simply passed back to callback. - /// @param n_beams Number of beams to use. - /// @param n_past Number of tokens already evaluated. - /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier. - LLAMA_API void llama_beam_search( - struct llama_context * ctx, - llama_beam_search_callback_fn_t callback, - void * callback_data, - size_t n_beams, - int32_t n_past, - int32_t n_predict); - /// @details Build a split GGUF final path for this chunk. /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" // Returns the split_path length.