Merge commit '0cd6bd3483' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	README.md
#	models/ggml-vocab-phi-3.gguf
#	scripts/compare-commits.sh
#	tests/test-tokenizer-random.py
Concedo 2024-06-05 10:53:52 +08:00
commit e3e21cc44d
12 changed files with 355 additions and 675 deletions


@@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )


@@ -15,7 +15,6 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
-add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)


@@ -1,5 +0,0 @@
set(TARGET beam-search)
add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -1,188 +0,0 @@
#include "common.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%zu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;
//---------------------------------
// Print help :
//---------------------------------
if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
params.model = argv[1];
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
if ( argc > 3 )
{
params.prompt = argv[3];
}
if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
std::cout << llama_token_to_piece(ctx, id);
}
std::cout << std::flush;
int n_past = 0;
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();
beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_piece(ctx,token_id);
}
std::cout << std::endl;
llama_free( ctx );
llama_free_model( model );
llama_backend_free();
return 0;
}


@@ -141,10 +141,11 @@ static std::string get_gpu_info() {
}
// command line params
-enum output_formats {CSV, JSON, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) {
switch (format) {
+case NONE: return "none";
case CSV: return "csv";
case JSON: return "json";
case MARKDOWN: return "md";
@@ -153,6 +154,23 @@ static const char * output_format_str(output_formats format) {
}
}
+static bool output_format_from_str(const std::string & s, output_formats & format) {
+if (s == "none") {
+format = NONE;
+} else if (s == "csv") {
+format = CSV;
+} else if (s == "json") {
+format = JSON;
+} else if (s == "md") {
+format = MARKDOWN;
+} else if (s == "sql") {
+format = SQL;
+} else {
+return false;
+}
+return true;
+}
static const char * split_mode_str(llama_split_mode mode) {
switch (mode) {
case LLAMA_SPLIT_MODE_NONE: return "none";
@@ -191,6 +209,7 @@ struct cmd_params {
int reps;
bool verbose;
output_formats output_format;
+output_formats output_format_stderr;
};
static const cmd_params cmd_params_defaults = {
@@ -215,7 +234,8 @@ static const cmd_params cmd_params_defaults = {
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* verbose */ false,
-/* output_format */ MARKDOWN
+/* output_format */ MARKDOWN,
+/* output_format_stderr */ NONE,
};
static void print_usage(int /* argc */, char ** argv) {
@@ -244,6 +264,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -285,6 +306,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.verbose = cmd_params_defaults.verbose;
params.output_format = cmd_params_defaults.output_format;
+params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
for (int i = 1; i < argc; i++) {
@@ -494,18 +516,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
-if (argv[i] == std::string("csv")) {
-params.output_format = CSV;
-} else if (argv[i] == std::string("json")) {
-params.output_format = JSON;
-} else if (argv[i] == std::string("md")) {
-params.output_format = MARKDOWN;
-} else if (argv[i] == std::string("sql")) {
-params.output_format = SQL;
-} else {
+invalid_param = !output_format_from_str(argv[i], params.output_format);
+} else if (arg == "-oe" || arg == "--output-err") {
+if (++i >= argc) {
invalid_param = true;
break;
}
+invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else {
@@ -1279,6 +1296,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
(void) user_data;
}
+static std::unique_ptr<printer> create_printer(output_formats format) {
+switch (format) {
+case NONE:
+return nullptr;
+case CSV:
+return std::unique_ptr<printer>(new csv_printer());
+case JSON:
+return std::unique_ptr<printer>(new json_printer());
+case MARKDOWN:
+return std::unique_ptr<printer>(new markdown_printer());
+case SQL:
+return std::unique_ptr<printer>(new sql_printer());
+}
+GGML_ASSERT(false);
+}
int main(int argc, char ** argv) {
// try to set locale for unicode characters in markdown
setlocale(LC_CTYPE, ".UTF-8");
@@ -1305,26 +1338,18 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);
// initialize printer
-std::unique_ptr<printer> p;
-switch (params.output_format) {
-case CSV:
-p.reset(new csv_printer());
-break;
-case JSON:
-p.reset(new json_printer());
-break;
-case MARKDOWN:
-p.reset(new markdown_printer());
-break;
-case SQL:
-p.reset(new sql_printer());
-break;
-default:
-assert(false);
-exit(1);
-}
+std::unique_ptr<printer> p = create_printer(params.output_format);
+std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
+if (p) {
p->fout = stdout;
p->print_header(params);
+}
+if (p_err) {
+p_err->fout = stderr;
+p_err->print_header(params);
+}
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
@@ -1382,7 +1407,15 @@ int main(int argc, char ** argv) {
t.samples_ns.push_back(t_ns);
}
+if (p) {
p->print_test(t);
+fflush(p->fout);
+}
+if (p_err) {
+p_err->print_test(t);
+fflush(p_err->fout);
+}
llama_print_timings(ctx);
@@ -1391,7 +1424,13 @@ int main(int argc, char ** argv) {
llama_free_model(lmodel);
+if (p) {
p->print_footer();
+}
+if (p_err) {
+p_err->print_footer();
+}
llama_backend_free();
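
Taken together, the hunks above let llama-bench emit two reports per run, one per stream, with NONE mapping to a null printer so either stream can be disabled. A minimal sketch of the intended combination, reusing only names introduced in this diff (the concrete formats and the redirection example are illustrative, not mandated):

// Sketch: human-readable report on stdout, machine-readable report on stderr.
std::unique_ptr<printer> p     = create_printer(MARKDOWN); // what "-o md" selects
std::unique_ptr<printer> p_err = create_printer(SQL);      // what "-oe sql" selects
if (p)     { p->fout     = stdout; p->print_header(params); }
if (p_err) { p_err->fout = stderr; p_err->print_header(params); }
// per test: p->print_test(t); p_err->print_test(t); ... then print_footer() on both.
// e.g. "llama-bench -o md -oe sql 2> results.sql" keeps the markdown table visible
// in the terminal while the SQL statements are captured via the stderr redirect.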


@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
// this tensor was allocated without ggml-backend
return;
}
-ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
+ggml_backend_view_init(tensor);
}
} else {
if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
if (t->view_src == NULL) {
ggml_tallocr_alloc(&tallocr, t);
} else if (t->buffer == NULL) {
-ggml_backend_view_init(buffer, t);
+ggml_backend_view_init(t);
}
} else {
if (t->view_src != NULL && t->buffer == NULL) {
// view of a pre-allocated tensor
-ggml_backend_view_init(buffer, t);
+ggml_backend_view_init(t);
}
}
}


@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
if (dst_buf->iface.cpy_tensor) {
-return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
+return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
}
return false;
}
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
// utils
-void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+void ggml_backend_view_init(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL);
-tensor->buffer = buffer;
+tensor->buffer = tensor->view_src->buffer;
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
-ggml_backend_buffer_init_tensor(buffer, tensor);
+ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
}
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
-ggml_backend_view_init(dst->view_src->buffer, dst);
+ggml_backend_view_init(dst);
}
else {
ggml_backend_tensor_copy(src, dst);


@@ -225,7 +225,7 @@ extern "C" {
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
#ifdef __cplusplus
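
The header change matches the ggml-backend.c hunk above: the explicit buffer argument is gone and the view's buffer is now derived from tensor->view_src->buffer inside ggml_backend_view_init(). A short sketch of how an out-of-tree call site would migrate (the variable name view_tensor is illustrative):

// Before this commit the caller had to pass the buffer explicitly:
//   ggml_backend_view_init(view_tensor->view_src->buffer, view_tensor);
// After it, the buffer comes from view_src, so the call shrinks to:
ggml_backend_view_init(view_tensor);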


@@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
if (remote_ptr != 0) {
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
ggml_backend_rpc_buffer_interface,
-new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
+new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
remote_size);
return buffer;
} else {
@@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
/* .endpoint = */ endpoint,
-/* .name = */ "RPC",
+/* .name = */ "RPC[" + std::string(endpoint) + "]",
};
ggml_backend_t backend = new ggml_backend {

ggml.c

@@ -5,6 +5,7 @@
#include "ggml-quants.h"
#include "ggml.h"
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
@@ -28,6 +29,10 @@
#include <syscall.h>
#endif
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
#ifdef GGML_USE_METAL
#include <unistd.h>
#endif
@@ -1759,7 +1764,7 @@ struct ggml_compute_state_shared {
int64_t perf_node_start_cycles;
int64_t perf_node_start_time_us;
-const int n_threads;
+int n_threads;
// synchronization primitives
atomic_int n_active; // num active threads
@@ -2270,6 +2275,11 @@ inline static float ggml_silu_f32(float x) {
return x/(1.0f + expf(-x));
}
+#if __FINITE_MATH_ONLY__
+#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
+#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+#endif
#if defined(__ARM_NEON) && defined(__aarch64__)
// adapted from arm limited optimized routine
@@ -19686,6 +19696,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
return cplan;
}
static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
enum ggml_status compute_status = GGML_STATUS_SUCCESS;
#ifdef GGML_USE_OPENMP
if (n_threads > 1) {
#pragma omp parallel num_threads(n_threads)
{
#pragma omp single
{
// update the number of threads from the actual number of threads that we got from OpenMP
n_threads = omp_get_num_threads();
workers[0].shared->n_threads = n_threads;
workers[0].shared->n_active = n_threads;
}
ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
}
} else {
ggml_graph_compute_thread(&workers[0]);
}
#else
// create thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; ++j) {
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
UNUSED(rc);
}
}
// this is a work thread too
ggml_graph_compute_thread(&workers[0]);
// join or kill thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; j++) {
const int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0);
UNUSED(rc);
}
}
#endif
// don't leave affinity set on the main thread
clear_numa_thread_affinity();
for (int j = 0; j < n_threads; j++) {
if (workers[j].ec != GGML_STATUS_SUCCESS) {
compute_status = workers[j].ec;
break;
}
}
return compute_status;
}
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
{
GGML_ASSERT(cplan);
@@ -19696,7 +19759,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
}
}
-const int n_threads = cplan->n_threads;
+int n_threads = cplan->n_threads;
+#if defined(GGML_USE_OPENMP)
+n_threads = MIN(n_threads, omp_get_max_threads());
+#endif
struct ggml_compute_state_shared state_shared = {
/*.cgraph =*/ cgraph,
@@ -19712,46 +19779,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
/*.current_chunk; =*/ 0,
};
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+const int64_t perf_start_cycles = ggml_perf_cycles();
+const int64_t perf_start_time_us = ggml_perf_time_us();
-// create thread pool
-if (n_threads > 1) {
-for (int j = 1; j < n_threads; ++j) {
+for (int j = 0; j < n_threads; ++j) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.ith = j,
.shared = &state_shared,
.ec = GGML_STATUS_SUCCESS,
};
-const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-GGML_ASSERT(rc == 0);
-UNUSED(rc);
-}
}
-workers[0].ith = 0;
+enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
-workers[0].shared = &state_shared;
-workers[0].ec = GGML_STATUS_SUCCESS;
-const int64_t perf_start_cycles = ggml_perf_cycles();
-const int64_t perf_start_time_us = ggml_perf_time_us();
-// this is a work thread too
-ggml_graph_compute_thread(&workers[0]);
-enum ggml_status compute_status = workers[0].ec;
-// don't leave affinity set on the main thread
-clear_numa_thread_affinity();
-// join or kill thread pool
-if (n_threads > 1) {
-for (int j = 1; j < n_threads; j++) {
-const int rc = ggml_thread_join(workers[j].thrd, NULL);
-GGML_ASSERT(rc == 0);
-if (workers[j].ec != GGML_STATUS_SUCCESS)
-compute_status = workers[j].ec;
-}
-}
// performance stats (graph)
{

llama.cpp

@@ -2175,12 +2175,12 @@ struct llama_control_vector {
struct llama_vocab {
using id = int32_t;
using token = std::string;
-using ttype = llama_token_type;
+using tattr = llama_token_attr;
struct token_data {
token text;
float score;
-ttype type;
+tattr attr;
};
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
@@ -2401,13 +2401,34 @@ struct llama_context {
struct llama_control_vector cvec;
};
+static size_t llama_get_device_count(const llama_model & model) {
+size_t count = 1;
+#if defined(GGML_USE_CUDA)
+count = ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+count = ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+count = ggml_backend_vk_get_device_count();
+#endif
+#if defined(GGML_USE_RPC)
+count += model.rpc_servers.size();
+#endif
+return count;
+GGML_UNUSED(model);
+}
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
-#ifdef GGML_USE_RPC
-std::string endpoint = model.rpc_servers[gpu];
-buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_RPC)
+int dev_count = (int)llama_get_device_count(model);
+int rpc_count = (int)model.rpc_servers.size();
+if (gpu >= dev_count - rpc_count) {
+const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+return ggml_backend_rpc_buffer_type(endpoint);
+}
+#endif
+#if defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2455,29 +2476,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
GGML_UNUSED(tensor_split);
}
-static size_t llama_get_device_count(const llama_model & model) {
-#if defined(GGML_USE_RPC)
-return model.rpc_servers.size();
-#elif defined(GGML_USE_CUDA)
-return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-return ggml_backend_vk_get_device_count();
-#else
-return 1;
-#endif
-GGML_UNUSED(model);
-}
static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
+int dev_count = (int)llama_get_device_count(model);
+int rpc_count = (int)model.rpc_servers.size();
+if (device >= dev_count - rpc_count) {
size_t total;
size_t free;
-std::string endpoint = model.rpc_servers[device];
-ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
return free;
-#elif defined(GGML_USE_CUDA)
+}
+#endif
+#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
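
Both hunks above rely on the same device layout: llama_get_device_count() now counts the local backend devices first and appends the RPC servers at the end, so a global device index is mapped back into model.rpc_servers with identical arithmetic. A small illustrative sketch of that mapping (the helper name and the example counts are assumptions, not part of the diff):

// Assume 2 local CUDA devices and 2 RPC servers: dev_count = 4, rpc_count = 2.
//   device 0,1 -> local GPUs
//   device 2,3 -> rpc_servers[0], rpc_servers[1]
static int rpc_server_index(int device, int dev_count, int rpc_count) {
    // only meaningful when device >= dev_count - rpc_count (i.e. an RPC device)
    return device - (dev_count - rpc_count); // same as device - dev_count + rpc_count
}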
@@ -4802,7 +4813,20 @@ static void llm_load_vocab(
auto & token_data = vocab.id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores ? scores[i] : 0.0f;
-token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
+token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
+if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
+switch(toktypes[i]) {
+case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
+case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
+case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
+case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
+case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
+case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+}
+}
}
GGML_ASSERT_CONTINUE(vocab.id_to_token.size() == vocab.token_to_id.size());
@@ -4893,7 +4917,7 @@ static void llm_load_vocab(
// build special tokens cache
{
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
-if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
+if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
vocab.cache_special_tokens.push_back(id);
}
}
@@ -4923,6 +4947,59 @@ static void llm_load_vocab(
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
}
// Handle per token attributes
//NOTE: Each model customizes per token attributes.
//NOTE: Per token attributes are missing from the GGUF file.
//TODO: Extract attributes from GGUF file.
{
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
for (auto substr : substrs) {
if (str.find(substr) < std::string::npos) {
return true;
}
}
return false;
};
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
uint32_t current = vocab.id_to_token.at(id).attr;
current = value ? (current | attr) : (current & ~attr);
vocab.id_to_token[id].attr = (llama_token_attr) current;
};
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
};
std::string model_name;
std::string tokenizer_pre;
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
// model name to lowercase
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
[] (const std::string::value_type x) {
return std::tolower(x);
}
);
// set attributes by model/tokenizer name
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
for (auto id : vocab.cache_special_tokens) {
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (auto token : {"</s>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
}
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
}
}
}
}
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@@ -12679,27 +12756,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
+return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
}
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
+return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
}
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
+return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
}
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
+return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
}
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
}
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
@@ -13538,7 +13615,8 @@ struct fragment_buffer_variant {
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
// for each special token
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
-const auto & special_token = vocab.id_to_token[special_id].text;
+const auto & data = vocab.id_to_token[special_id];
+const auto & special_token = data.text;
// for each text fragment
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
@@ -13575,13 +13653,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
if (match > raw_text_base_offset) {
// left
const int64_t left_reminder_offset = raw_text_base_offset + 0;
-const int64_t left_reminder_length = match - raw_text_base_offset;
+int64_t left_reminder_length = match - raw_text_base_offset;
+if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+left_reminder_length--;
+}
+}
+if (left_reminder_length > 0) {
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+it++;
+}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
#endif
-it++;
}
// special token
@@ -13590,16 +13677,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
// right
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
-const int64_t right_reminder_offset = match + special_token.length();
-const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+int64_t right_reminder_offset = match + special_token.length();
+int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+right_reminder_offset++;
+right_reminder_length--;
+}
+}
+if (right_reminder_length > 0) {
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+it++;
+}
#ifdef PRETOKENIZERDEBUG
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
#endif
-it++;
if (source == 0) {
buffer.erase_after(buffer.before_begin());
} else {
@@ -13645,9 +13741,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
// tokenizer.encode('', add_special_tokens=True) returns [1]
// tokenizer.encode('', add_special_tokens=False) returns []
-static const bool rtrim = true; //TODO: as param
bool is_prev_special = false;
-bool special_token_rtrim = false;
if (add_special && vocab.special_add_bos != 0) {
GGML_ASSERT(vocab.special_bos_id != -1);
@@ -13657,25 +13751,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-// without adding this leading whitespace, we do not get the same results as the original tokenizer
-// TODO: It's likely possible to get rid of this string copy entirely
-// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
-// and passing 'add space prefix' as bool argument
-//
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-if (special_token_rtrim) {
-size_t num_whitespaces = 0;
-while (isspace(raw_text[num_whitespaces])) {
-num_whitespaces++;
-}
-if (num_whitespaces == raw_text.size()) {
-continue; // skip if all whitespaces
-}
-raw_text = raw_text.substr(num_whitespaces);
-}
if (vocab.add_space_prefix) {
if (!output.size() || is_prev_special) { // prefix with space if first token
raw_text = " " + raw_text;
@@ -13691,11 +13768,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
is_prev_special = true;
-// phi-3 special tokens without rtrim, works fine for llama-spm too
-special_token_rtrim = rtrim
-&& fragment.token != vocab.special_bos_id
-&& fragment.token != vocab.special_unk_id
-&& fragment.token != vocab.special_eos_id;
}
}
@@ -14954,260 +15026,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
//
// Beam search
//
struct llama_beam {
std::vector<llama_token> tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
// Sort beams by probability. In case of ties, prefer beams at eob.
bool operator<(const llama_beam & rhs) const {
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
}
// Shift off first n tokens and discard them.
void shift_tokens(const size_t n) {
if (n) {
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
tokens.resize(tokens.size() - n);
}
}
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
};
// A struct for calculating logit-related info.
struct llama_logit_info {
const float * const logits;
const int n_vocab;
const float max_l;
const float normalizer;
struct sum_exp {
float max_l;
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
};
llama_logit_info(llama_context * ctx)
: logits(llama_get_logits(ctx))
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
, max_l(*std::max_element(logits, logits + n_vocab))
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
{ }
llama_token_data get_token_data(const llama_token token_id) const {
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
return {token_id, logits[token_id], p};
}
// Return top k token_data by logit.
std::vector<llama_token_data> top_k(size_t k) {
std::vector<llama_token_data> min_heap; // min-heap by logit
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
min_heap.reserve(k_min);
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
min_heap.push_back(get_token_data(token_id));
}
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
std::make_heap(min_heap.begin(), min_heap.end(), comp);
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
if (min_heap.front().logit < logits[token_id]) {
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
min_heap.back().id = token_id;
min_heap.back().logit = logits[token_id];
std::push_heap(min_heap.begin(), min_heap.end(), comp);
}
}
return min_heap;
}
float probability_from_logit(float logit) const {
return normalizer * std::exp(logit - max_l);
}
};
struct llama_beam_search_data {
llama_context * ctx;
size_t n_beams;
int n_past;
int n_predict;
std::vector<llama_beam> beams;
std::vector<llama_beam> next_beams;
// Re-calculated on each loop iteration
size_t common_prefix_length;
// Used to communicate to/from callback on beams state.
std::vector<llama_beam_view> beam_views;
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
: ctx(ctx)
, n_beams(n_beams)
, n_past(n_past)
, n_predict(n_predict)
, beam_views(n_beams) {
beams.reserve(n_beams);
next_beams.reserve(n_beams);
}
// Collapse beams to a single beam given by index.
void collapse_beams(const size_t beam_idx) {
if (0u < beam_idx) {
std::swap(beams[0], beams[beam_idx]);
}
beams.resize(1);
}
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
// The repetitive patterns below reflect the 2 stages of heaps:
// * Gather elements until the vector is full, then call std::make_heap() on it.
// * If the heap is full and a new element is found that should be included, pop the
// least element to the back(), replace it with the new, then push it into the heap.
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
// Min-heaps use a greater-than comparator.
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
if (beam.eob) {
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
if (next_beams.size() < n_beams) {
next_beams.push_back(std::move(beam));
if (next_beams.size() == n_beams) {
std::make_heap(next_beams.begin(), next_beams.end(), comp);
}
} else if (next_beams.front().p < beam.p) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = std::move(beam);
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
} else {
// beam is not at end-of-sentence, so branch with next top_k tokens.
if (!beam.tokens.empty()) {
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
}
llama_logit_info logit_info(ctx);
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
size_t i=0;
if (next_beams.size() < n_beams) {
for (; next_beams.size() < n_beams ; ++i) {
llama_beam next_beam = beam;
next_beam.tokens.push_back(next_tokens[i].id);
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
next_beams.push_back(std::move(next_beam));
}
std::make_heap(next_beams.begin(), next_beams.end(), comp);
} else {
for (; next_beams.front().p == 0.0f ; ++i) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = beam;
next_beams.back().tokens.push_back(next_tokens[i].id);
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
}
for (; i < n_beams ; ++i) {
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
if (next_beams.front().p < next_p) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = beam;
next_beams.back().tokens.push_back(next_tokens[i].id);
next_beams.back().p = next_p;
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
}
}
}
// Find common_prefix_length based on beams.
// Requires beams is not empty.
size_t find_common_prefix_length() {
size_t common_prefix_length = beams[0].tokens.size();
for (size_t i = 1 ; i < beams.size() ; ++i) {
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
if (beams[0].tokens[j] != beams[i].tokens[j]) {
common_prefix_length = j;
break;
}
}
}
return common_prefix_length;
}
// Construct beams_state to send back to caller via the callback function.
// Side effect: set common_prefix_length = find_common_prefix_length();
llama_beams_state get_beams_state(const bool last_call) {
for (size_t i = 0 ; i < beams.size() ; ++i) {
beam_views[i] = beams[i].view();
}
common_prefix_length = find_common_prefix_length();
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
}
// Loop:
// * while i < n_predict, AND
// * any of the beams have not yet reached end-of-beam (eob), AND
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
// (since all other beam probabilities can only decrease)
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
!beams[top_beam_index()].eob ; ++i) {
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
if (common_prefix_length) {
llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
n_past += common_prefix_length;
}
// Zero-out next_beam probabilities to place them last in following min-heap.
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
for (llama_beam & beam : beams) {
beam.shift_tokens(common_prefix_length);
fill_next_beams_by_top_probabilities(beam);
}
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
beams.swap(next_beams);
renormalize_beam_probabilities(beams);
}
collapse_beams(top_beam_index());
callback(callback_data, get_beams_state(true));
}
// As beams grow, the cumulative probabilities decrease.
// Renormalize them to avoid floating point underflow.
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
}
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
size_t top_beam_index() {
return std::max_element(beams.begin(), beams.end()) - beams.begin();
}
// Copy (p,eob) for each beam which may have been changed by the callback.
void update_beams_from_beam_views() {
for (size_t i = 0 ; i < beams.size() ; ++i) {
beams[i].p = beam_views[i].p;
beams[i].eob = beam_views[i].eob;
}
}
};
void llama_beam_search(llama_context * ctx,
llama_beam_search_callback_fn_t callback, void * callback_data,
size_t n_beams, int n_past, int n_predict) {
assert(ctx);
const int64_t t_start_sample_us = ggml_time_us();
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
beam_search_data.loop(callback, callback_data);
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
ctx->n_sample++;
}
//
// quantization
//
@@ -16463,7 +16281,7 @@ struct llama_model * llama_load_model_from_file(
return true;
};
}
-if (params.rpc_servers != nullptr) {
+if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
// split the servers set them into model->rpc_servers
std::string servers(params.rpc_servers);
size_t pos = 0;
@@ -16626,17 +16444,7 @@ struct llama_context * llama_new_context_with_model(
if (!hparams.vocab_only) {
// initialize backends
-#if defined(GGML_USE_RPC)
-for (auto & server : model->rpc_servers) {
-ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
-if (backend == nullptr) {
-LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
-llama_free(ctx);
-return nullptr;
-}
-ctx->backends.push_back(backend);
-}
-#elif defined(GGML_USE_METAL)
+#if defined(GGML_USE_METAL)
if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init();
if (ctx->backend_metal == nullptr) {
@@ -16728,6 +16536,19 @@ struct llama_context * llama_new_context_with_model(
}
ctx->backends.push_back(backend);
}
+#endif
+#if defined(GGML_USE_RPC)
+if (model->n_gpu_layers > 0) {
+for (const auto & endpoint : model->rpc_servers) {
+ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+if (backend == nullptr) {
+LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+llama_free(ctx);
+return nullptr;
+}
+ctx->backends.push_back(backend);
+}
+}
#endif
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {
@@ -18521,9 +18342,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
return model->vocab.id_to_token[token].score;
}
-llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
-return model->vocab.id_to_token[token].type;
+return model->vocab.id_to_token[token].attr;
}
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {

llama.h

@@ -97,7 +97,7 @@ extern "C" {
LLAMA_ROPE_TYPE_GLM = 4,
};
-enum llama_token_type {
+enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
LLAMA_TOKEN_TYPE_NORMAL = 1,
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -107,6 +107,20 @@ extern "C" {
LLAMA_TOKEN_TYPE_BYTE = 6,
};
+enum llama_token_attr {
+LLAMA_TOKEN_ATTR_UNDEFINED = 0,
+LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 1,
+LLAMA_TOKEN_ATTR_UNUSED = 1 << 2,
+LLAMA_TOKEN_ATTR_NORMAL = 1 << 3,
+LLAMA_TOKEN_ATTR_CONTROL = 1 << 4, // SPECIAL?
+LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 5,
+LLAMA_TOKEN_ATTR_BYTE = 1 << 6,
+LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 7,
+LLAMA_TOKEN_ATTR_LSTRIP = 1 << 8,
+LLAMA_TOKEN_ATTR_RSTRIP = 1 << 9,
+LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 10,
+};
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
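
Unlike the old llama_token_type, llama_token_attr values are bit flags, so a single token can carry several attributes at once (the Phi-3 handling in llm_load_vocab() above, for example, sets RSTRIP on the cached special tokens). A minimal usage sketch, assuming a loaded model and a valid token id:

// Query the attributes through the renamed accessor and test individual flags.
llama_token_attr attr = llama_token_get_attr(model, token);
const bool is_control   = (attr & LLAMA_TOKEN_ATTR_CONTROL) != 0;
const bool strips_right = (attr & LLAMA_TOKEN_ATTR_RSTRIP)  != 0; // trailing whitespace is stripped by the tokenizer
if (is_control && strips_right) {
    // e.g. one of the Phi-3 special tokens flagged in llm_load_vocab() above
}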
@@ -823,7 +837,7 @@ extern "C" {
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
-LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
+LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
@@ -1046,49 +1060,9 @@ extern "C" {
llama_token token);
//
-// Beam search
+// Model split
//
struct llama_beam_view {
const llama_token * tokens;
size_t n_tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Callback should set this to true when a beam is at end-of-beam.
};
// Passed to beam_search_callback function.
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
// These pointers are valid only during the synchronous callback, so should not be saved.
struct llama_beams_state {
struct llama_beam_view * beam_views;
size_t n_beams; // Number of elements in beam_views[].
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
bool last_call; // True iff this is the last callback invocation.
};
// Type of pointer to the beam_search_callback function.
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
/// @details Deterministically returns entire sentence constructed by a beam search.
/// @param ctx Pointer to the llama_context.
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
/// @param callback_data A pointer that is simply passed back to callback.
/// @param n_beams Number of beams to use.
/// @param n_past Number of tokens already evaluated.
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
LLAMA_API void llama_beam_search(
struct llama_context * ctx,
llama_beam_search_callback_fn_t callback,
void * callback_data,
size_t n_beams,
int32_t n_past,
int32_t n_predict);
/// @details Build a split GGUF final path for this chunk.
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
// Returns the split_path length.