mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge commit '0cd6bd3483
' into concedo_experimental
# Conflicts: # .github/workflows/build.yml # .gitignore # CMakeLists.txt # Makefile # README.md # models/ggml-vocab-phi-3.gguf # scripts/compare-commits.sh # tests/test-tokenizer-random.py
This commit is contained in:
commit
e3e21cc44d
12 changed files with 355 additions and 675 deletions
|
@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
|
||||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||||
|
|
||||||
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
|
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
||||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
|
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
|
||||||
|
|
||||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||||
|
|
|
@ -15,7 +15,6 @@ else()
|
||||||
add_subdirectory(baby-llama)
|
add_subdirectory(baby-llama)
|
||||||
add_subdirectory(batched)
|
add_subdirectory(batched)
|
||||||
add_subdirectory(batched-bench)
|
add_subdirectory(batched-bench)
|
||||||
add_subdirectory(beam-search)
|
|
||||||
add_subdirectory(benchmark)
|
add_subdirectory(benchmark)
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
add_subdirectory(embedding)
|
add_subdirectory(embedding)
|
||||||
|
|
|
@ -1,5 +0,0 @@
|
||||||
set(TARGET beam-search)
|
|
||||||
add_executable(${TARGET} beam-search.cpp)
|
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
|
||||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
|
|
@ -1,188 +0,0 @@
|
||||||
#include "common.h"
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <cinttypes>
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstring>
|
|
||||||
#include <ctime>
|
|
||||||
#include <fstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
|
||||||
#include <signal.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#elif defined (_WIN32)
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
|
||||||
#ifndef NOMINMAX
|
|
||||||
# define NOMINMAX
|
|
||||||
#endif
|
|
||||||
#include <windows.h>
|
|
||||||
#include <signal.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Used for debugging to print out beam tokens.
|
|
||||||
struct ostream_beam_view {
|
|
||||||
llama_context * ctx;
|
|
||||||
llama_beam_view beam_view;
|
|
||||||
};
|
|
||||||
|
|
||||||
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
|
|
||||||
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
|
||||||
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
|
||||||
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
|
||||||
}
|
|
||||||
return os << ')';
|
|
||||||
}
|
|
||||||
|
|
||||||
// Put here anything you want back in beam_search_callback().
|
|
||||||
struct beam_search_callback_data {
|
|
||||||
llama_context * ctx;
|
|
||||||
std::vector<llama_token> response;
|
|
||||||
};
|
|
||||||
|
|
||||||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
|
||||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
|
||||||
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
|
||||||
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Function matching type llama_beam_search_callback_fn_t.
|
|
||||||
// Custom callback example is called each time the beams lengths increase:
|
|
||||||
// * Show progress by printing ',' following by number of convergent beam tokens if any.
|
|
||||||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
|
||||||
// This is also called when the stop condition is met.
|
|
||||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
|
||||||
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
|
||||||
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
|
||||||
// Mark beams as EOS as needed.
|
|
||||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
|
||||||
llama_beam_view& beam_view = beams_state.beam_views[i];
|
|
||||||
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
|
|
||||||
beam_view.eob = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
printf(","); // Show progress
|
|
||||||
if (const size_t n = beams_state.common_prefix_length) {
|
|
||||||
callback_data.response.resize(callback_data.response.size() + n);
|
|
||||||
assert(0u < beams_state.n_beams);
|
|
||||||
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
|
||||||
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
|
||||||
printf("%zu", n);
|
|
||||||
}
|
|
||||||
fflush(stdout);
|
|
||||||
#if 1 // DEBUG: print current beams for this iteration
|
|
||||||
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
|
|
||||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
|
||||||
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char ** argv)
|
|
||||||
{
|
|
||||||
gpt_params params;
|
|
||||||
//params.n_gpu_layers = 200;
|
|
||||||
|
|
||||||
//---------------------------------
|
|
||||||
// Print help :
|
|
||||||
//---------------------------------
|
|
||||||
|
|
||||||
if ( argc < 2 || argv[1][0] == '-' )
|
|
||||||
{
|
|
||||||
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
|
|
||||||
return 1 ;
|
|
||||||
}
|
|
||||||
|
|
||||||
//---------------------------------
|
|
||||||
// Load parameters :
|
|
||||||
//---------------------------------
|
|
||||||
|
|
||||||
params.model = argv[1];
|
|
||||||
|
|
||||||
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
|
|
||||||
|
|
||||||
if ( argc > 3 )
|
|
||||||
{
|
|
||||||
params.prompt = argv[3];
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( params.prompt.empty() )
|
|
||||||
{
|
|
||||||
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
//---------------------------------
|
|
||||||
// Init LLM :
|
|
||||||
//---------------------------------
|
|
||||||
|
|
||||||
llama_backend_init();
|
|
||||||
llama_numa_init(params.numa);
|
|
||||||
|
|
||||||
llama_model * model;
|
|
||||||
llama_context * ctx;
|
|
||||||
|
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
|
||||||
|
|
||||||
if ( model == NULL )
|
|
||||||
{
|
|
||||||
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
//---------------------------------
|
|
||||||
// Tokenize the prompt :
|
|
||||||
//---------------------------------
|
|
||||||
|
|
||||||
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
|
|
||||||
|
|
||||||
const size_t max_context_size = llama_n_ctx( ctx );
|
|
||||||
const size_t max_tokens_list_size = max_context_size - 4 ;
|
|
||||||
|
|
||||||
if (tokens_list.size() > max_tokens_list_size)
|
|
||||||
{
|
|
||||||
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
|
|
||||||
__func__ , tokens_list.size() , max_tokens_list_size );
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf( stderr, "\n\n" );
|
|
||||||
|
|
||||||
// Print the tokens from the prompt :
|
|
||||||
|
|
||||||
for( auto id : tokens_list )
|
|
||||||
{
|
|
||||||
std::cout << llama_token_to_piece(ctx, id);
|
|
||||||
}
|
|
||||||
std::cout << std::flush;
|
|
||||||
|
|
||||||
int n_past = 0;
|
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
|
|
||||||
{
|
|
||||||
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
n_past += tokens_list.size();
|
|
||||||
|
|
||||||
beam_search_callback_data callback_data{ctx, {}};
|
|
||||||
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
|
||||||
int const n_predict = 256;
|
|
||||||
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
|
|
||||||
|
|
||||||
std::cout << "\n\n";
|
|
||||||
for (llama_token const token_id : callback_data.response) {
|
|
||||||
std::cout << llama_token_to_piece(ctx,token_id);
|
|
||||||
}
|
|
||||||
std::cout << std::endl;
|
|
||||||
|
|
||||||
llama_free( ctx );
|
|
||||||
llama_free_model( model );
|
|
||||||
|
|
||||||
llama_backend_free();
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -141,10 +141,11 @@ static std::string get_gpu_info() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// command line params
|
// command line params
|
||||||
enum output_formats {CSV, JSON, MARKDOWN, SQL};
|
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
|
||||||
|
|
||||||
static const char * output_format_str(output_formats format) {
|
static const char * output_format_str(output_formats format) {
|
||||||
switch (format) {
|
switch (format) {
|
||||||
|
case NONE: return "none";
|
||||||
case CSV: return "csv";
|
case CSV: return "csv";
|
||||||
case JSON: return "json";
|
case JSON: return "json";
|
||||||
case MARKDOWN: return "md";
|
case MARKDOWN: return "md";
|
||||||
|
@ -153,6 +154,23 @@ static const char * output_format_str(output_formats format) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool output_format_from_str(const std::string & s, output_formats & format) {
|
||||||
|
if (s == "none") {
|
||||||
|
format = NONE;
|
||||||
|
} else if (s == "csv") {
|
||||||
|
format = CSV;
|
||||||
|
} else if (s == "json") {
|
||||||
|
format = JSON;
|
||||||
|
} else if (s == "md") {
|
||||||
|
format = MARKDOWN;
|
||||||
|
} else if (s == "sql") {
|
||||||
|
format = SQL;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
static const char * split_mode_str(llama_split_mode mode) {
|
static const char * split_mode_str(llama_split_mode mode) {
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case LLAMA_SPLIT_MODE_NONE: return "none";
|
case LLAMA_SPLIT_MODE_NONE: return "none";
|
||||||
|
@ -191,6 +209,7 @@ struct cmd_params {
|
||||||
int reps;
|
int reps;
|
||||||
bool verbose;
|
bool verbose;
|
||||||
output_formats output_format;
|
output_formats output_format;
|
||||||
|
output_formats output_format_stderr;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const cmd_params cmd_params_defaults = {
|
static const cmd_params cmd_params_defaults = {
|
||||||
|
@ -215,7 +234,8 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||||
/* reps */ 5,
|
/* reps */ 5,
|
||||||
/* verbose */ false,
|
/* verbose */ false,
|
||||||
/* output_format */ MARKDOWN
|
/* output_format */ MARKDOWN,
|
||||||
|
/* output_format_stderr */ NONE,
|
||||||
};
|
};
|
||||||
|
|
||||||
static void print_usage(int /* argc */, char ** argv) {
|
static void print_usage(int /* argc */, char ** argv) {
|
||||||
|
@ -244,6 +264,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||||
|
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
||||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||||
|
@ -285,6 +306,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
|
|
||||||
params.verbose = cmd_params_defaults.verbose;
|
params.verbose = cmd_params_defaults.verbose;
|
||||||
params.output_format = cmd_params_defaults.output_format;
|
params.output_format = cmd_params_defaults.output_format;
|
||||||
|
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
||||||
params.reps = cmd_params_defaults.reps;
|
params.reps = cmd_params_defaults.reps;
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
|
@ -494,18 +516,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (argv[i] == std::string("csv")) {
|
invalid_param = !output_format_from_str(argv[i], params.output_format);
|
||||||
params.output_format = CSV;
|
} else if (arg == "-oe" || arg == "--output-err") {
|
||||||
} else if (argv[i] == std::string("json")) {
|
if (++i >= argc) {
|
||||||
params.output_format = JSON;
|
|
||||||
} else if (argv[i] == std::string("md")) {
|
|
||||||
params.output_format = MARKDOWN;
|
|
||||||
} else if (argv[i] == std::string("sql")) {
|
|
||||||
params.output_format = SQL;
|
|
||||||
} else {
|
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
|
||||||
} else if (arg == "-v" || arg == "--verbose") {
|
} else if (arg == "-v" || arg == "--verbose") {
|
||||||
params.verbose = true;
|
params.verbose = true;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1279,6 +1296,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
|
||||||
(void) user_data;
|
(void) user_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::unique_ptr<printer> create_printer(output_formats format) {
|
||||||
|
switch (format) {
|
||||||
|
case NONE:
|
||||||
|
return nullptr;
|
||||||
|
case CSV:
|
||||||
|
return std::unique_ptr<printer>(new csv_printer());
|
||||||
|
case JSON:
|
||||||
|
return std::unique_ptr<printer>(new json_printer());
|
||||||
|
case MARKDOWN:
|
||||||
|
return std::unique_ptr<printer>(new markdown_printer());
|
||||||
|
case SQL:
|
||||||
|
return std::unique_ptr<printer>(new sql_printer());
|
||||||
|
}
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
// try to set locale for unicode characters in markdown
|
// try to set locale for unicode characters in markdown
|
||||||
setlocale(LC_CTYPE, ".UTF-8");
|
setlocale(LC_CTYPE, ".UTF-8");
|
||||||
|
@ -1305,26 +1338,18 @@ int main(int argc, char ** argv) {
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
// initialize printer
|
// initialize printer
|
||||||
std::unique_ptr<printer> p;
|
std::unique_ptr<printer> p = create_printer(params.output_format);
|
||||||
switch (params.output_format) {
|
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
||||||
case CSV:
|
|
||||||
p.reset(new csv_printer());
|
if (p) {
|
||||||
break;
|
|
||||||
case JSON:
|
|
||||||
p.reset(new json_printer());
|
|
||||||
break;
|
|
||||||
case MARKDOWN:
|
|
||||||
p.reset(new markdown_printer());
|
|
||||||
break;
|
|
||||||
case SQL:
|
|
||||||
p.reset(new sql_printer());
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
assert(false);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
p->fout = stdout;
|
p->fout = stdout;
|
||||||
p->print_header(params);
|
p->print_header(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (p_err) {
|
||||||
|
p_err->fout = stderr;
|
||||||
|
p_err->print_header(params);
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
|
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
|
||||||
|
|
||||||
|
@ -1382,7 +1407,15 @@ int main(int argc, char ** argv) {
|
||||||
t.samples_ns.push_back(t_ns);
|
t.samples_ns.push_back(t_ns);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (p) {
|
||||||
p->print_test(t);
|
p->print_test(t);
|
||||||
|
fflush(p->fout);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (p_err) {
|
||||||
|
p_err->print_test(t);
|
||||||
|
fflush(p_err->fout);
|
||||||
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
llama_print_timings(ctx);
|
||||||
|
|
||||||
|
@ -1391,7 +1424,13 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
llama_free_model(lmodel);
|
llama_free_model(lmodel);
|
||||||
|
|
||||||
|
if (p) {
|
||||||
p->print_footer();
|
p->print_footer();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (p_err) {
|
||||||
|
p_err->print_footer();
|
||||||
|
}
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
|
|
|
@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
||||||
// this tensor was allocated without ggml-backend
|
// this tensor was allocated without ggml-backend
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
|
ggml_backend_view_init(tensor);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (tensor->data == NULL) {
|
if (tensor->data == NULL) {
|
||||||
|
@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
||||||
if (t->view_src == NULL) {
|
if (t->view_src == NULL) {
|
||||||
ggml_tallocr_alloc(&tallocr, t);
|
ggml_tallocr_alloc(&tallocr, t);
|
||||||
} else if (t->buffer == NULL) {
|
} else if (t->buffer == NULL) {
|
||||||
ggml_backend_view_init(buffer, t);
|
ggml_backend_view_init(t);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (t->view_src != NULL && t->buffer == NULL) {
|
if (t->view_src != NULL && t->buffer == NULL) {
|
||||||
// view of a pre-allocated tensor
|
// view of a pre-allocated tensor
|
||||||
ggml_backend_view_init(buffer, t);
|
ggml_backend_view_init(t);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
|
||||||
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||||
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
||||||
if (dst_buf->iface.cpy_tensor) {
|
if (dst_buf->iface.cpy_tensor) {
|
||||||
return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
|
return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
|
||||||
|
|
||||||
// utils
|
// utils
|
||||||
|
|
||||||
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
void ggml_backend_view_init(struct ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(tensor->buffer == NULL);
|
GGML_ASSERT(tensor->buffer == NULL);
|
||||||
GGML_ASSERT(tensor->view_src != NULL);
|
GGML_ASSERT(tensor->view_src != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->data != NULL);
|
GGML_ASSERT(tensor->view_src->data != NULL);
|
||||||
|
|
||||||
tensor->buffer = buffer;
|
tensor->buffer = tensor->view_src->buffer;
|
||||||
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
||||||
ggml_backend_buffer_init_tensor(buffer, tensor);
|
ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
|
||||||
|
@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
||||||
struct ggml_tensor * dst = node_copies[id];
|
struct ggml_tensor * dst = node_copies[id];
|
||||||
if (dst->view_src != NULL) {
|
if (dst->view_src != NULL) {
|
||||||
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
|
||||||
ggml_backend_view_init(dst->view_src->buffer, dst);
|
ggml_backend_view_init(dst);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ggml_backend_tensor_copy(src, dst);
|
ggml_backend_tensor_copy(src, dst);
|
||||||
|
|
|
@ -225,7 +225,7 @@ extern "C" {
|
||||||
|
|
||||||
// Tensor initialization
|
// Tensor initialization
|
||||||
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
|
||||||
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
|
||||||
if (remote_ptr != 0) {
|
if (remote_ptr != 0) {
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
|
||||||
ggml_backend_rpc_buffer_interface,
|
ggml_backend_rpc_buffer_interface,
|
||||||
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"},
|
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
|
||||||
remote_size);
|
remote_size);
|
||||||
return buffer;
|
return buffer;
|
||||||
} else {
|
} else {
|
||||||
|
@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
|
||||||
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||||
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
||||||
/* .endpoint = */ endpoint,
|
/* .endpoint = */ endpoint,
|
||||||
/* .name = */ "RPC",
|
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_t backend = new ggml_backend {
|
ggml_backend_t backend = new ggml_backend {
|
||||||
|
|
106
ggml.c
106
ggml.c
|
@ -5,6 +5,7 @@
|
||||||
#include "ggml-quants.h"
|
#include "ggml-quants.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||||
|
@ -28,6 +29,10 @@
|
||||||
#include <syscall.h>
|
#include <syscall.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_OPENMP
|
||||||
|
#include <omp.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -1759,7 +1764,7 @@ struct ggml_compute_state_shared {
|
||||||
int64_t perf_node_start_cycles;
|
int64_t perf_node_start_cycles;
|
||||||
int64_t perf_node_start_time_us;
|
int64_t perf_node_start_time_us;
|
||||||
|
|
||||||
const int n_threads;
|
int n_threads;
|
||||||
|
|
||||||
// synchronization primitives
|
// synchronization primitives
|
||||||
atomic_int n_active; // num active threads
|
atomic_int n_active; // num active threads
|
||||||
|
@ -2270,6 +2275,11 @@ inline static float ggml_silu_f32(float x) {
|
||||||
return x/(1.0f + expf(-x));
|
return x/(1.0f + expf(-x));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if __FINITE_MATH_ONLY__
|
||||||
|
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
|
||||||
|
#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__ARM_NEON) && defined(__aarch64__)
|
#if defined(__ARM_NEON) && defined(__aarch64__)
|
||||||
|
|
||||||
// adapted from arm limited optimized routine
|
// adapted from arm limited optimized routine
|
||||||
|
@ -19686,6 +19696,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||||
return cplan;
|
return cplan;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
|
||||||
|
enum ggml_status compute_status = GGML_STATUS_SUCCESS;
|
||||||
|
|
||||||
|
#ifdef GGML_USE_OPENMP
|
||||||
|
if (n_threads > 1) {
|
||||||
|
#pragma omp parallel num_threads(n_threads)
|
||||||
|
{
|
||||||
|
#pragma omp single
|
||||||
|
{
|
||||||
|
// update the number of threads from the actual number of threads that we got from OpenMP
|
||||||
|
n_threads = omp_get_num_threads();
|
||||||
|
workers[0].shared->n_threads = n_threads;
|
||||||
|
workers[0].shared->n_active = n_threads;
|
||||||
|
}
|
||||||
|
ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
ggml_graph_compute_thread(&workers[0]);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// create thread pool
|
||||||
|
if (n_threads > 1) {
|
||||||
|
for (int j = 1; j < n_threads; ++j) {
|
||||||
|
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
||||||
|
GGML_ASSERT(rc == 0);
|
||||||
|
UNUSED(rc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// this is a work thread too
|
||||||
|
ggml_graph_compute_thread(&workers[0]);
|
||||||
|
|
||||||
|
// join or kill thread pool
|
||||||
|
if (n_threads > 1) {
|
||||||
|
for (int j = 1; j < n_threads; j++) {
|
||||||
|
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
||||||
|
GGML_ASSERT(rc == 0);
|
||||||
|
UNUSED(rc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
// don't leave affinity set on the main thread
|
||||||
|
clear_numa_thread_affinity();
|
||||||
|
|
||||||
|
for (int j = 0; j < n_threads; j++) {
|
||||||
|
if (workers[j].ec != GGML_STATUS_SUCCESS) {
|
||||||
|
compute_status = workers[j].ec;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return compute_status;
|
||||||
|
}
|
||||||
|
|
||||||
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
||||||
{
|
{
|
||||||
GGML_ASSERT(cplan);
|
GGML_ASSERT(cplan);
|
||||||
|
@ -19696,7 +19759,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_threads = cplan->n_threads;
|
int n_threads = cplan->n_threads;
|
||||||
|
|
||||||
|
#if defined(GGML_USE_OPENMP)
|
||||||
|
n_threads = MIN(n_threads, omp_get_max_threads());
|
||||||
|
#endif
|
||||||
|
|
||||||
struct ggml_compute_state_shared state_shared = {
|
struct ggml_compute_state_shared state_shared = {
|
||||||
/*.cgraph =*/ cgraph,
|
/*.cgraph =*/ cgraph,
|
||||||
|
@ -19712,46 +19779,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||||
/*.current_chunk; =*/ 0,
|
/*.current_chunk; =*/ 0,
|
||||||
};
|
};
|
||||||
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
||||||
|
const int64_t perf_start_cycles = ggml_perf_cycles();
|
||||||
|
const int64_t perf_start_time_us = ggml_perf_time_us();
|
||||||
|
|
||||||
// create thread pool
|
for (int j = 0; j < n_threads; ++j) {
|
||||||
if (n_threads > 1) {
|
|
||||||
for (int j = 1; j < n_threads; ++j) {
|
|
||||||
workers[j] = (struct ggml_compute_state) {
|
workers[j] = (struct ggml_compute_state) {
|
||||||
.thrd = 0,
|
.thrd = 0,
|
||||||
.ith = j,
|
.ith = j,
|
||||||
.shared = &state_shared,
|
.shared = &state_shared,
|
||||||
.ec = GGML_STATUS_SUCCESS,
|
.ec = GGML_STATUS_SUCCESS,
|
||||||
};
|
};
|
||||||
|
|
||||||
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
|
||||||
GGML_ASSERT(rc == 0);
|
|
||||||
UNUSED(rc);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
workers[0].ith = 0;
|
enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
|
||||||
workers[0].shared = &state_shared;
|
|
||||||
workers[0].ec = GGML_STATUS_SUCCESS;
|
|
||||||
|
|
||||||
const int64_t perf_start_cycles = ggml_perf_cycles();
|
|
||||||
const int64_t perf_start_time_us = ggml_perf_time_us();
|
|
||||||
|
|
||||||
// this is a work thread too
|
|
||||||
ggml_graph_compute_thread(&workers[0]);
|
|
||||||
enum ggml_status compute_status = workers[0].ec;
|
|
||||||
|
|
||||||
// don't leave affinity set on the main thread
|
|
||||||
clear_numa_thread_affinity();
|
|
||||||
|
|
||||||
// join or kill thread pool
|
|
||||||
if (n_threads > 1) {
|
|
||||||
for (int j = 1; j < n_threads; j++) {
|
|
||||||
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
|
||||||
GGML_ASSERT(rc == 0);
|
|
||||||
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
|
||||||
compute_status = workers[j].ec;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// performance stats (graph)
|
// performance stats (graph)
|
||||||
{
|
{
|
||||||
|
|
481
llama.cpp
481
llama.cpp
|
@ -2175,12 +2175,12 @@ struct llama_control_vector {
|
||||||
struct llama_vocab {
|
struct llama_vocab {
|
||||||
using id = int32_t;
|
using id = int32_t;
|
||||||
using token = std::string;
|
using token = std::string;
|
||||||
using ttype = llama_token_type;
|
using tattr = llama_token_attr;
|
||||||
|
|
||||||
struct token_data {
|
struct token_data {
|
||||||
token text;
|
token text;
|
||||||
float score;
|
float score;
|
||||||
ttype type;
|
tattr attr;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
||||||
|
@ -2401,13 +2401,34 @@ struct llama_context {
|
||||||
struct llama_control_vector cvec;
|
struct llama_control_vector cvec;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static size_t llama_get_device_count(const llama_model & model) {
|
||||||
|
size_t count = 1;
|
||||||
|
#if defined(GGML_USE_CUDA)
|
||||||
|
count = ggml_backend_cuda_get_device_count();
|
||||||
|
#elif defined(GGML_USE_SYCL)
|
||||||
|
count = ggml_backend_sycl_get_device_count();
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
count = ggml_backend_vk_get_device_count();
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_RPC)
|
||||||
|
count += model.rpc_servers.size();
|
||||||
|
#endif
|
||||||
|
return count;
|
||||||
|
GGML_UNUSED(model);
|
||||||
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
||||||
ggml_backend_buffer_type_t buft = nullptr;
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
|
|
||||||
#ifdef GGML_USE_RPC
|
#if defined(GGML_USE_RPC)
|
||||||
std::string endpoint = model.rpc_servers[gpu];
|
int dev_count = (int)llama_get_device_count(model);
|
||||||
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
|
int rpc_count = (int)model.rpc_servers.size();
|
||||||
#elif defined(GGML_USE_METAL)
|
if (gpu >= dev_count - rpc_count) {
|
||||||
|
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
|
||||||
|
return ggml_backend_rpc_buffer_type(endpoint);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_METAL)
|
||||||
buft = ggml_backend_metal_buffer_type();
|
buft = ggml_backend_metal_buffer_type();
|
||||||
#elif defined(GGML_USE_CUDA)
|
#elif defined(GGML_USE_CUDA)
|
||||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||||
|
@ -2455,29 +2476,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
||||||
GGML_UNUSED(tensor_split);
|
GGML_UNUSED(tensor_split);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t llama_get_device_count(const llama_model & model) {
|
|
||||||
#if defined(GGML_USE_RPC)
|
|
||||||
return model.rpc_servers.size();
|
|
||||||
#elif defined(GGML_USE_CUDA)
|
|
||||||
return ggml_backend_cuda_get_device_count();
|
|
||||||
#elif defined(GGML_USE_SYCL)
|
|
||||||
return ggml_backend_sycl_get_device_count();
|
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
return ggml_backend_vk_get_device_count();
|
|
||||||
#else
|
|
||||||
return 1;
|
|
||||||
#endif
|
|
||||||
GGML_UNUSED(model);
|
|
||||||
}
|
|
||||||
|
|
||||||
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||||
#if defined(GGML_USE_RPC)
|
#if defined(GGML_USE_RPC)
|
||||||
|
int dev_count = (int)llama_get_device_count(model);
|
||||||
|
int rpc_count = (int)model.rpc_servers.size();
|
||||||
|
if (device >= dev_count - rpc_count) {
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
std::string endpoint = model.rpc_servers[device];
|
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
|
||||||
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
|
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
||||||
return free;
|
return free;
|
||||||
#elif defined(GGML_USE_CUDA)
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_CUDA)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
||||||
|
@ -4802,7 +4813,20 @@ static void llm_load_vocab(
|
||||||
auto & token_data = vocab.id_to_token[i];
|
auto & token_data = vocab.id_to_token[i];
|
||||||
token_data.text = std::move(word);
|
token_data.text = std::move(word);
|
||||||
token_data.score = scores ? scores[i] : 0.0f;
|
token_data.score = scores ? scores[i] : 0.0f;
|
||||||
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
|
token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
|
||||||
|
|
||||||
|
if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
|
||||||
|
switch(toktypes[i]) {
|
||||||
|
case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
|
||||||
|
case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
|
||||||
|
case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
|
||||||
|
case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
|
||||||
|
case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
|
||||||
|
case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
|
||||||
|
case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
||||||
|
default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
GGML_ASSERT_CONTINUE(vocab.id_to_token.size() == vocab.token_to_id.size());
|
GGML_ASSERT_CONTINUE(vocab.id_to_token.size() == vocab.token_to_id.size());
|
||||||
|
|
||||||
|
@ -4893,7 +4917,7 @@ static void llm_load_vocab(
|
||||||
// build special tokens cache
|
// build special tokens cache
|
||||||
{
|
{
|
||||||
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
|
||||||
if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
|
if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
|
||||||
vocab.cache_special_tokens.push_back(id);
|
vocab.cache_special_tokens.push_back(id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -4923,6 +4947,59 @@ static void llm_load_vocab(
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Handle per token attributes
|
||||||
|
//NOTE: Each model customizes per token attributes.
|
||||||
|
//NOTE: Per token attributes are missing from the GGUF file.
|
||||||
|
//TODO: Extract attributes from GGUF file.
|
||||||
|
{
|
||||||
|
auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
|
||||||
|
for (auto substr : substrs) {
|
||||||
|
if (str.find(substr) < std::string::npos) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
|
||||||
|
uint32_t current = vocab.id_to_token.at(id).attr;
|
||||||
|
current = value ? (current | attr) : (current & ~attr);
|
||||||
|
vocab.id_to_token[id].attr = (llama_token_attr) current;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
|
||||||
|
_set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string model_name;
|
||||||
|
std::string tokenizer_pre;
|
||||||
|
|
||||||
|
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
|
||||||
|
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
||||||
|
|
||||||
|
// model name to lowercase
|
||||||
|
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
|
||||||
|
[] (const std::string::value_type x) {
|
||||||
|
return std::tolower(x);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
// set attributes by model/tokenizer name
|
||||||
|
if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
|
||||||
|
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
|
||||||
|
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
|
||||||
|
for (auto id : vocab.cache_special_tokens) {
|
||||||
|
_set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
||||||
|
}
|
||||||
|
for (auto token : {"</s>"}) {
|
||||||
|
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
|
||||||
|
}
|
||||||
|
for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
|
||||||
|
_set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||||
|
@ -12679,27 +12756,27 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
||||||
|
|
||||||
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
||||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
|
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
||||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
|
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
||||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
|
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
||||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
|
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
|
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
|
||||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
|
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
||||||
|
@ -13538,7 +13615,8 @@ struct fragment_buffer_variant {
|
||||||
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
|
||||||
// for each special token
|
// for each special token
|
||||||
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
||||||
const auto & special_token = vocab.id_to_token[special_id].text;
|
const auto & data = vocab.id_to_token[special_id];
|
||||||
|
const auto & special_token = data.text;
|
||||||
|
|
||||||
// for each text fragment
|
// for each text fragment
|
||||||
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
|
||||||
|
@ -13575,13 +13653,22 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
||||||
if (match > raw_text_base_offset) {
|
if (match > raw_text_base_offset) {
|
||||||
// left
|
// left
|
||||||
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
||||||
const int64_t left_reminder_length = match - raw_text_base_offset;
|
int64_t left_reminder_length = match - raw_text_base_offset;
|
||||||
|
|
||||||
|
if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
|
||||||
|
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
|
||||||
|
left_reminder_length--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (left_reminder_length > 0) {
|
||||||
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
||||||
|
it++;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef PRETOKENIZERDEBUG
|
#ifdef PRETOKENIZERDEBUG
|
||||||
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
||||||
#endif
|
#endif
|
||||||
it++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// special token
|
// special token
|
||||||
|
@ -13590,16 +13677,25 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
||||||
|
|
||||||
// right
|
// right
|
||||||
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
||||||
const int64_t right_reminder_offset = match + special_token.length();
|
int64_t right_reminder_offset = match + special_token.length();
|
||||||
const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
||||||
|
|
||||||
|
if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
|
||||||
|
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
|
||||||
|
right_reminder_offset++;
|
||||||
|
right_reminder_length--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (right_reminder_length > 0) {
|
||||||
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
||||||
|
it++;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef PRETOKENIZERDEBUG
|
#ifdef PRETOKENIZERDEBUG
|
||||||
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
it++;
|
|
||||||
|
|
||||||
if (source == 0) {
|
if (source == 0) {
|
||||||
buffer.erase_after(buffer.before_begin());
|
buffer.erase_after(buffer.before_begin());
|
||||||
} else {
|
} else {
|
||||||
|
@ -13645,9 +13741,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||||
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
// tokenizer.encode('', add_special_tokens=True) returns [1]
|
||||||
// tokenizer.encode('', add_special_tokens=False) returns []
|
// tokenizer.encode('', add_special_tokens=False) returns []
|
||||||
|
|
||||||
static const bool rtrim = true; //TODO: as param
|
|
||||||
bool is_prev_special = false;
|
bool is_prev_special = false;
|
||||||
bool special_token_rtrim = false;
|
|
||||||
|
|
||||||
if (add_special && vocab.special_add_bos != 0) {
|
if (add_special && vocab.special_add_bos != 0) {
|
||||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||||
|
@ -13657,25 +13751,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||||
|
|
||||||
for (const auto & fragment : fragment_buffer) {
|
for (const auto & fragment : fragment_buffer) {
|
||||||
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
||||||
// without adding this leading whitespace, we do not get the same results as the original tokenizer
|
|
||||||
|
|
||||||
// TODO: It's likely possible to get rid of this string copy entirely
|
|
||||||
// by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
|
|
||||||
// and passing 'add space prefix' as bool argument
|
|
||||||
//
|
|
||||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||||
|
|
||||||
if (special_token_rtrim) {
|
|
||||||
size_t num_whitespaces = 0;
|
|
||||||
while (isspace(raw_text[num_whitespaces])) {
|
|
||||||
num_whitespaces++;
|
|
||||||
}
|
|
||||||
if (num_whitespaces == raw_text.size()) {
|
|
||||||
continue; // skip if all whitespaces
|
|
||||||
}
|
|
||||||
raw_text = raw_text.substr(num_whitespaces);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (vocab.add_space_prefix) {
|
if (vocab.add_space_prefix) {
|
||||||
if (!output.size() || is_prev_special) { // prefix with space if first token
|
if (!output.size() || is_prev_special) { // prefix with space if first token
|
||||||
raw_text = " " + raw_text;
|
raw_text = " " + raw_text;
|
||||||
|
@ -13691,11 +13768,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||||
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
|
||||||
output.push_back(fragment.token);
|
output.push_back(fragment.token);
|
||||||
is_prev_special = true;
|
is_prev_special = true;
|
||||||
// phi-3 special tokens without rtrim, works fine for llama-spm too
|
|
||||||
special_token_rtrim = rtrim
|
|
||||||
&& fragment.token != vocab.special_bos_id
|
|
||||||
&& fragment.token != vocab.special_unk_id
|
|
||||||
&& fragment.token != vocab.special_eos_id;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14954,260 +15026,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
||||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// Beam search
|
|
||||||
//
|
|
||||||
|
|
||||||
struct llama_beam {
|
|
||||||
std::vector<llama_token> tokens;
|
|
||||||
float p; // Cumulative beam probability (renormalized relative to all beams)
|
|
||||||
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
|
||||||
// Sort beams by probability. In case of ties, prefer beams at eob.
|
|
||||||
bool operator<(const llama_beam & rhs) const {
|
|
||||||
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
|
||||||
}
|
|
||||||
// Shift off first n tokens and discard them.
|
|
||||||
void shift_tokens(const size_t n) {
|
|
||||||
if (n) {
|
|
||||||
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
|
||||||
tokens.resize(tokens.size() - n);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
|
||||||
};
|
|
||||||
|
|
||||||
// A struct for calculating logit-related info.
|
|
||||||
struct llama_logit_info {
|
|
||||||
const float * const logits;
|
|
||||||
const int n_vocab;
|
|
||||||
const float max_l;
|
|
||||||
const float normalizer;
|
|
||||||
struct sum_exp {
|
|
||||||
float max_l;
|
|
||||||
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
|
||||||
};
|
|
||||||
llama_logit_info(llama_context * ctx)
|
|
||||||
: logits(llama_get_logits(ctx))
|
|
||||||
, n_vocab(llama_n_vocab(llama_get_model(ctx)))
|
|
||||||
, max_l(*std::max_element(logits, logits + n_vocab))
|
|
||||||
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
|
||||||
{ }
|
|
||||||
llama_token_data get_token_data(const llama_token token_id) const {
|
|
||||||
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
|
||||||
return {token_id, logits[token_id], p};
|
|
||||||
}
|
|
||||||
// Return top k token_data by logit.
|
|
||||||
std::vector<llama_token_data> top_k(size_t k) {
|
|
||||||
std::vector<llama_token_data> min_heap; // min-heap by logit
|
|
||||||
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
|
||||||
min_heap.reserve(k_min);
|
|
||||||
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
|
||||||
min_heap.push_back(get_token_data(token_id));
|
|
||||||
}
|
|
||||||
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
|
||||||
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
|
||||||
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
|
||||||
if (min_heap.front().logit < logits[token_id]) {
|
|
||||||
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
|
||||||
min_heap.back().id = token_id;
|
|
||||||
min_heap.back().logit = logits[token_id];
|
|
||||||
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return min_heap;
|
|
||||||
}
|
|
||||||
float probability_from_logit(float logit) const {
|
|
||||||
return normalizer * std::exp(logit - max_l);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_beam_search_data {
|
|
||||||
llama_context * ctx;
|
|
||||||
size_t n_beams;
|
|
||||||
int n_past;
|
|
||||||
int n_predict;
|
|
||||||
std::vector<llama_beam> beams;
|
|
||||||
std::vector<llama_beam> next_beams;
|
|
||||||
|
|
||||||
// Re-calculated on each loop iteration
|
|
||||||
size_t common_prefix_length;
|
|
||||||
|
|
||||||
// Used to communicate to/from callback on beams state.
|
|
||||||
std::vector<llama_beam_view> beam_views;
|
|
||||||
|
|
||||||
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict)
|
|
||||||
: ctx(ctx)
|
|
||||||
, n_beams(n_beams)
|
|
||||||
, n_past(n_past)
|
|
||||||
, n_predict(n_predict)
|
|
||||||
, beam_views(n_beams) {
|
|
||||||
beams.reserve(n_beams);
|
|
||||||
next_beams.reserve(n_beams);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collapse beams to a single beam given by index.
|
|
||||||
void collapse_beams(const size_t beam_idx) {
|
|
||||||
if (0u < beam_idx) {
|
|
||||||
std::swap(beams[0], beams[beam_idx]);
|
|
||||||
}
|
|
||||||
beams.resize(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
|
||||||
// The repetitive patterns below reflect the 2 stages of heaps:
|
|
||||||
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
|
||||||
// * If the heap is full and a new element is found that should be included, pop the
|
|
||||||
// least element to the back(), replace it with the new, then push it into the heap.
|
|
||||||
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
|
||||||
// Min-heaps use a greater-than comparator.
|
|
||||||
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
|
||||||
if (beam.eob) {
|
|
||||||
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
|
||||||
if (next_beams.size() < n_beams) {
|
|
||||||
next_beams.push_back(std::move(beam));
|
|
||||||
if (next_beams.size() == n_beams) {
|
|
||||||
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
||||||
}
|
|
||||||
} else if (next_beams.front().p < beam.p) {
|
|
||||||
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
||||||
next_beams.back() = std::move(beam);
|
|
||||||
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
|
||||||
if (!beam.tokens.empty()) {
|
|
||||||
llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0));
|
|
||||||
}
|
|
||||||
llama_logit_info logit_info(ctx);
|
|
||||||
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
|
||||||
|
|
||||||
// Clear the kv slot so that other beams may try different tokens at this position. The llama_decode()
|
|
||||||
// call in loop() will conclusively fill in the kv slot once the beams converge at this position.
|
|
||||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
||||||
|
|
||||||
size_t i=0;
|
|
||||||
if (next_beams.size() < n_beams) {
|
|
||||||
for (; next_beams.size() < n_beams ; ++i) {
|
|
||||||
llama_beam next_beam = beam;
|
|
||||||
next_beam.tokens.push_back(next_tokens[i].id);
|
|
||||||
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
||||||
next_beams.push_back(std::move(next_beam));
|
|
||||||
}
|
|
||||||
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
|
||||||
} else {
|
|
||||||
for (; next_beams.front().p == 0.0f ; ++i) {
|
|
||||||
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
||||||
next_beams.back() = beam;
|
|
||||||
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
||||||
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
|
||||||
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (; i < n_beams ; ++i) {
|
|
||||||
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
|
||||||
if (next_beams.front().p < next_p) {
|
|
||||||
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
|
||||||
next_beams.back() = beam;
|
|
||||||
next_beams.back().tokens.push_back(next_tokens[i].id);
|
|
||||||
next_beams.back().p = next_p;
|
|
||||||
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find common_prefix_length based on beams.
|
|
||||||
// Requires beams is not empty.
|
|
||||||
size_t find_common_prefix_length() {
|
|
||||||
size_t common_prefix_length = beams[0].tokens.size();
|
|
||||||
for (size_t i = 1 ; i < beams.size() ; ++i) {
|
|
||||||
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
|
|
||||||
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
|
|
||||||
if (beams[0].tokens[j] != beams[i].tokens[j]) {
|
|
||||||
common_prefix_length = j;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return common_prefix_length;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Construct beams_state to send back to caller via the callback function.
|
|
||||||
// Side effect: set common_prefix_length = find_common_prefix_length();
|
|
||||||
llama_beams_state get_beams_state(const bool last_call) {
|
|
||||||
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
||||||
beam_views[i] = beams[i].view();
|
|
||||||
}
|
|
||||||
common_prefix_length = find_common_prefix_length();
|
|
||||||
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loop:
|
|
||||||
// * while i < n_predict, AND
|
|
||||||
// * any of the beams have not yet reached end-of-beam (eob), AND
|
|
||||||
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
|
|
||||||
// (since all other beam probabilities can only decrease)
|
|
||||||
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
|
|
||||||
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
|
|
||||||
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
|
|
||||||
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
|
|
||||||
!beams[top_beam_index()].eob ; ++i) {
|
|
||||||
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
|
||||||
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
|
||||||
if (common_prefix_length) {
|
|
||||||
llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0));
|
|
||||||
n_past += common_prefix_length;
|
|
||||||
}
|
|
||||||
// Zero-out next_beam probabilities to place them last in following min-heap.
|
|
||||||
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
|
|
||||||
for (llama_beam & beam : beams) {
|
|
||||||
beam.shift_tokens(common_prefix_length);
|
|
||||||
fill_next_beams_by_top_probabilities(beam);
|
|
||||||
}
|
|
||||||
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
|
|
||||||
beams.swap(next_beams);
|
|
||||||
renormalize_beam_probabilities(beams);
|
|
||||||
}
|
|
||||||
collapse_beams(top_beam_index());
|
|
||||||
callback(callback_data, get_beams_state(true));
|
|
||||||
}
|
|
||||||
|
|
||||||
// As beams grow, the cumulative probabilities decrease.
|
|
||||||
// Renormalize them to avoid floating point underflow.
|
|
||||||
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
|
|
||||||
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
|
|
||||||
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
|
|
||||||
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
|
|
||||||
size_t top_beam_index() {
|
|
||||||
return std::max_element(beams.begin(), beams.end()) - beams.begin();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copy (p,eob) for each beam which may have been changed by the callback.
|
|
||||||
void update_beams_from_beam_views() {
|
|
||||||
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
|
||||||
beams[i].p = beam_views[i].p;
|
|
||||||
beams[i].eob = beam_views[i].eob;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
void llama_beam_search(llama_context * ctx,
|
|
||||||
llama_beam_search_callback_fn_t callback, void * callback_data,
|
|
||||||
size_t n_beams, int n_past, int n_predict) {
|
|
||||||
assert(ctx);
|
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
|
||||||
|
|
||||||
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict);
|
|
||||||
|
|
||||||
beam_search_data.loop(callback, callback_data);
|
|
||||||
|
|
||||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
||||||
ctx->n_sample++;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// quantization
|
// quantization
|
||||||
//
|
//
|
||||||
|
@ -16463,7 +16281,7 @@ struct llama_model * llama_load_model_from_file(
|
||||||
return true;
|
return true;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
if (params.rpc_servers != nullptr) {
|
if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
|
||||||
// split the servers set them into model->rpc_servers
|
// split the servers set them into model->rpc_servers
|
||||||
std::string servers(params.rpc_servers);
|
std::string servers(params.rpc_servers);
|
||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
|
@ -16626,17 +16444,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
|
|
||||||
if (!hparams.vocab_only) {
|
if (!hparams.vocab_only) {
|
||||||
// initialize backends
|
// initialize backends
|
||||||
#if defined(GGML_USE_RPC)
|
#if defined(GGML_USE_METAL)
|
||||||
for (auto & server : model->rpc_servers) {
|
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
|
||||||
if (backend == nullptr) {
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
ctx->backends.push_back(backend);
|
|
||||||
}
|
|
||||||
#elif defined(GGML_USE_METAL)
|
|
||||||
if (model->n_gpu_layers > 0) {
|
if (model->n_gpu_layers > 0) {
|
||||||
ctx->backend_metal = ggml_backend_metal_init();
|
ctx->backend_metal = ggml_backend_metal_init();
|
||||||
if (ctx->backend_metal == nullptr) {
|
if (ctx->backend_metal == nullptr) {
|
||||||
|
@ -16728,6 +16536,19 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
ctx->backends.push_back(backend);
|
ctx->backends.push_back(backend);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(GGML_USE_RPC)
|
||||||
|
if (model->n_gpu_layers > 0) {
|
||||||
|
for (const auto & endpoint : model->rpc_servers) {
|
||||||
|
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
|
||||||
|
if (backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
|
||||||
|
llama_free(ctx);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
ctx->backends.push_back(backend);
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
if (ctx->backend_cpu == nullptr) {
|
if (ctx->backend_cpu == nullptr) {
|
||||||
|
@ -18521,9 +18342,9 @@ float llama_token_get_score(const struct llama_model * model, llama_token token)
|
||||||
return model->vocab.id_to_token[token].score;
|
return model->vocab.id_to_token[token].score;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
|
llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
|
||||||
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||||
return model->vocab.id_to_token[token].type;
|
return model->vocab.id_to_token[token].attr;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
||||||
|
|
60
llama.h
60
llama.h
|
@ -97,7 +97,7 @@ extern "C" {
|
||||||
LLAMA_ROPE_TYPE_GLM = 4,
|
LLAMA_ROPE_TYPE_GLM = 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llama_token_type {
|
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
|
||||||
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
||||||
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
||||||
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
|
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
|
||||||
|
@ -107,6 +107,20 @@ extern "C" {
|
||||||
LLAMA_TOKEN_TYPE_BYTE = 6,
|
LLAMA_TOKEN_TYPE_BYTE = 6,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum llama_token_attr {
|
||||||
|
LLAMA_TOKEN_ATTR_UNDEFINED = 0,
|
||||||
|
LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 1,
|
||||||
|
LLAMA_TOKEN_ATTR_UNUSED = 1 << 2,
|
||||||
|
LLAMA_TOKEN_ATTR_NORMAL = 1 << 3,
|
||||||
|
LLAMA_TOKEN_ATTR_CONTROL = 1 << 4, // SPECIAL?
|
||||||
|
LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 5,
|
||||||
|
LLAMA_TOKEN_ATTR_BYTE = 1 << 6,
|
||||||
|
LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 7,
|
||||||
|
LLAMA_TOKEN_ATTR_LSTRIP = 1 << 8,
|
||||||
|
LLAMA_TOKEN_ATTR_RSTRIP = 1 << 9,
|
||||||
|
LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 10,
|
||||||
|
};
|
||||||
|
|
||||||
// model file types
|
// model file types
|
||||||
enum llama_ftype {
|
enum llama_ftype {
|
||||||
LLAMA_FTYPE_ALL_F32 = 0,
|
LLAMA_FTYPE_ALL_F32 = 0,
|
||||||
|
@ -823,7 +837,7 @@ extern "C" {
|
||||||
|
|
||||||
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
|
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
|
||||||
|
|
||||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
|
LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
|
||||||
|
|
||||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||||
|
@ -1046,49 +1060,9 @@ extern "C" {
|
||||||
llama_token token);
|
llama_token token);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Beam search
|
// Model split
|
||||||
//
|
//
|
||||||
|
|
||||||
struct llama_beam_view {
|
|
||||||
const llama_token * tokens;
|
|
||||||
|
|
||||||
size_t n_tokens;
|
|
||||||
float p; // Cumulative beam probability (renormalized relative to all beams)
|
|
||||||
bool eob; // Callback should set this to true when a beam is at end-of-beam.
|
|
||||||
};
|
|
||||||
|
|
||||||
// Passed to beam_search_callback function.
|
|
||||||
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
|
|
||||||
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
|
|
||||||
// These pointers are valid only during the synchronous callback, so should not be saved.
|
|
||||||
struct llama_beams_state {
|
|
||||||
struct llama_beam_view * beam_views;
|
|
||||||
|
|
||||||
size_t n_beams; // Number of elements in beam_views[].
|
|
||||||
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
|
|
||||||
bool last_call; // True iff this is the last callback invocation.
|
|
||||||
};
|
|
||||||
|
|
||||||
// Type of pointer to the beam_search_callback function.
|
|
||||||
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
|
|
||||||
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
|
|
||||||
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
|
|
||||||
|
|
||||||
/// @details Deterministically returns entire sentence constructed by a beam search.
|
|
||||||
/// @param ctx Pointer to the llama_context.
|
|
||||||
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
|
|
||||||
/// @param callback_data A pointer that is simply passed back to callback.
|
|
||||||
/// @param n_beams Number of beams to use.
|
|
||||||
/// @param n_past Number of tokens already evaluated.
|
|
||||||
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
|
|
||||||
LLAMA_API void llama_beam_search(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
llama_beam_search_callback_fn_t callback,
|
|
||||||
void * callback_data,
|
|
||||||
size_t n_beams,
|
|
||||||
int32_t n_past,
|
|
||||||
int32_t n_predict);
|
|
||||||
|
|
||||||
/// @details Build a split GGUF final path for this chunk.
|
/// @details Build a split GGUF final path for this chunk.
|
||||||
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
|
/// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
|
||||||
// Returns the split_path length.
|
// Returns the split_path length.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue