mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-10 17:14:36 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # README.md # docs/build.md # examples/infill/infill.cpp # examples/main/README.md # examples/server/README.md # flake.lock # scripts/sync-ggml.last # src/llama.cpp # tests/test-json-schema-to-grammar.cpp # tests/test-sampling.cpp
This commit is contained in:
commit
a9dbcdd3ec
34 changed files with 1701 additions and 928 deletions
284
common/arg.cpp
284
common/arg.cpp
|
@ -120,32 +120,6 @@ std::string common_arg::to_string() {
|
||||||
// utils
|
// utils
|
||||||
//
|
//
|
||||||
|
|
||||||
#ifdef __GNUC__
|
|
||||||
#ifdef __MINGW32__
|
|
||||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
|
||||||
#else
|
|
||||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
|
||||||
static std::string format(const char * fmt, ...) {
|
|
||||||
va_list ap;
|
|
||||||
va_list ap2;
|
|
||||||
va_start(ap, fmt);
|
|
||||||
va_copy(ap2, ap);
|
|
||||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
|
||||||
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
|
||||||
std::vector<char> buf(size + 1);
|
|
||||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
|
||||||
GGML_ASSERT(size2 == size);
|
|
||||||
va_end(ap2);
|
|
||||||
va_end(ap);
|
|
||||||
return std::string(buf.data(), size);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void common_params_handle_model_default(common_params & params) {
|
static void common_params_handle_model_default(common_params & params) {
|
||||||
if (!params.hf_repo.empty()) {
|
if (!params.hf_repo.empty()) {
|
||||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||||
|
@ -200,7 +174,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} catch (std::exception & e) {
|
} catch (std::exception & e) {
|
||||||
throw std::invalid_argument(format(
|
throw std::invalid_argument(string_format(
|
||||||
"error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
|
"error while handling environment variable \"%s\": %s\n\n", opt.env, e.what()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -221,7 +195,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||||
}
|
}
|
||||||
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
||||||
throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str()));
|
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
||||||
}
|
}
|
||||||
auto opt = *arg_to_options[arg];
|
auto opt = *arg_to_options[arg];
|
||||||
if (opt.has_value_from_env()) {
|
if (opt.has_value_from_env()) {
|
||||||
|
@ -253,7 +227,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} catch (std::exception & e) {
|
} catch (std::exception & e) {
|
||||||
throw std::invalid_argument(format(
|
throw std::invalid_argument(string_format(
|
||||||
"error while handling argument \"%s\": %s\n\n"
|
"error while handling argument \"%s\": %s\n\n"
|
||||||
"usage:\n%s\n\nto show complete usage, run with -h",
|
"usage:\n%s\n\nto show complete usage, run with -h",
|
||||||
arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
|
arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
|
||||||
|
@ -392,28 +366,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--verbose-prompt"},
|
{"--verbose-prompt"},
|
||||||
format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.verbose_prompt = true;
|
params.verbose_prompt = true;
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--no-display-prompt"},
|
{"--no-display-prompt"},
|
||||||
format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
|
string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.display_prompt = false;
|
params.display_prompt = false;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-co", "--color"},
|
{"-co", "--color"},
|
||||||
format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
|
string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.use_color = true;
|
params.use_color = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-t", "--threads"}, "N",
|
{"-t", "--threads"}, "N",
|
||||||
format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.cpuparams.n_threads = value;
|
params.cpuparams.n_threads = value;
|
||||||
if (params.cpuparams.n_threads <= 0) {
|
if (params.cpuparams.n_threads <= 0) {
|
||||||
|
@ -473,14 +447,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--cpu-strict"}, "<0|1>",
|
{"--cpu-strict"}, "<0|1>",
|
||||||
format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
|
string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.cpuparams.strict_cpu = std::stoul(value);
|
params.cpuparams.strict_cpu = std::stoul(value);
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--prio"}, "N",
|
{"--prio"}, "N",
|
||||||
format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
|
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
|
||||||
[](common_params & params, int prio) {
|
[](common_params & params, int prio) {
|
||||||
if (prio < 0 || prio > 3) {
|
if (prio < 0 || prio > 3) {
|
||||||
throw std::invalid_argument("invalid value");
|
throw std::invalid_argument("invalid value");
|
||||||
|
@ -490,7 +464,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--poll"}, "<0...100>",
|
{"--poll"}, "<0...100>",
|
||||||
format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
|
string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.cpuparams.poll = std::stoul(value);
|
params.cpuparams.poll = std::stoul(value);
|
||||||
}
|
}
|
||||||
|
@ -524,7 +498,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--prio-batch"}, "N",
|
{"--prio-batch"}, "N",
|
||||||
format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
|
string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority),
|
||||||
[](common_params & params, int prio) {
|
[](common_params & params, int prio) {
|
||||||
if (prio < 0 || prio > 3) {
|
if (prio < 0 || prio > 3) {
|
||||||
throw std::invalid_argument("invalid value");
|
throw std::invalid_argument("invalid value");
|
||||||
|
@ -568,7 +542,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--prio-draft"}, "N",
|
{"--prio-draft"}, "N",
|
||||||
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
|
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
|
||||||
[](common_params & params, int prio) {
|
[](common_params & params, int prio) {
|
||||||
if (prio < 0 || prio > 3) {
|
if (prio < 0 || prio > 3) {
|
||||||
throw std::invalid_argument("invalid value");
|
throw std::invalid_argument("invalid value");
|
||||||
|
@ -612,7 +586,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--prio-batch-draft"}, "N",
|
{"--prio-batch-draft"}, "N",
|
||||||
format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
|
string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
|
||||||
[](common_params & params, int prio) {
|
[](common_params & params, int prio) {
|
||||||
if (prio < 0 || prio > 3) {
|
if (prio < 0 || prio > 3) {
|
||||||
throw std::invalid_argument("invalid value");
|
throw std::invalid_argument("invalid value");
|
||||||
|
@ -629,14 +603,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--draft"}, "N",
|
{"--draft"}, "N",
|
||||||
format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
|
string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_draft = value;
|
params.n_draft = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ps", "--p-split"}, "N",
|
{"-ps", "--p-split"}, "N",
|
||||||
format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.p_split = std::stof(value);
|
params.p_split = std::stof(value);
|
||||||
}
|
}
|
||||||
|
@ -657,56 +631,56 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
|
).set_examples({LLAMA_EXAMPLE_LOOKUP}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-c", "--ctx-size"}, "N",
|
{"-c", "--ctx-size"}, "N",
|
||||||
format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
|
string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_ctx = value;
|
params.n_ctx = value;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_CTX_SIZE"));
|
).set_env("LLAMA_ARG_CTX_SIZE"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-n", "--predict", "--n-predict"}, "N",
|
{"-n", "--predict", "--n-predict"}, "N",
|
||||||
format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
|
string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_predict = value;
|
params.n_predict = value;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_N_PREDICT"));
|
).set_env("LLAMA_ARG_N_PREDICT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-b", "--batch-size"}, "N",
|
{"-b", "--batch-size"}, "N",
|
||||||
format("logical maximum batch size (default: %d)", params.n_batch),
|
string_format("logical maximum batch size (default: %d)", params.n_batch),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_batch = value;
|
params.n_batch = value;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_BATCH"));
|
).set_env("LLAMA_ARG_BATCH"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ub", "--ubatch-size"}, "N",
|
{"-ub", "--ubatch-size"}, "N",
|
||||||
format("physical maximum batch size (default: %d)", params.n_ubatch),
|
string_format("physical maximum batch size (default: %d)", params.n_ubatch),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_ubatch = value;
|
params.n_ubatch = value;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_UBATCH"));
|
).set_env("LLAMA_ARG_UBATCH"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--keep"}, "N",
|
{"--keep"}, "N",
|
||||||
format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
|
string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_keep = value;
|
params.n_keep = value;
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--no-context-shift"},
|
{"--no-context-shift"},
|
||||||
format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.ctx_shift = false;
|
params.ctx_shift = false;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--chunks"}, "N",
|
{"--chunks"}, "N",
|
||||||
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_chunks = value;
|
params.n_chunks = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-fa", "--flash-attn"},
|
{"-fa", "--flash-attn"},
|
||||||
format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
|
string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.flash_attn = true;
|
params.flash_attn = true;
|
||||||
}
|
}
|
||||||
|
@ -722,7 +696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--no-perf"},
|
{"--no-perf"},
|
||||||
format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.no_perf = true;
|
params.no_perf = true;
|
||||||
params.sparams.no_perf = true;
|
params.sparams.no_perf = true;
|
||||||
|
@ -734,7 +708,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
std::ifstream file(value);
|
std::ifstream file(value);
|
||||||
if (!file) {
|
if (!file) {
|
||||||
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||||
}
|
}
|
||||||
// store the external file name in params
|
// store the external file name in params
|
||||||
params.prompt_file = value;
|
params.prompt_file = value;
|
||||||
|
@ -750,7 +724,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
std::ifstream file(value);
|
std::ifstream file(value);
|
||||||
if (!file) {
|
if (!file) {
|
||||||
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||||
}
|
}
|
||||||
params.in_files.push_back(value);
|
params.in_files.push_back(value);
|
||||||
}
|
}
|
||||||
|
@ -761,7 +735,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
std::ifstream file(value, std::ios::binary);
|
std::ifstream file(value, std::ios::binary);
|
||||||
if (!file) {
|
if (!file) {
|
||||||
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||||
}
|
}
|
||||||
// store the external file name in params
|
// store the external file name in params
|
||||||
params.prompt_file = value;
|
params.prompt_file = value;
|
||||||
|
@ -773,7 +747,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-e", "--escape"},
|
{"-e", "--escape"},
|
||||||
format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.escape = true;
|
params.escape = true;
|
||||||
}
|
}
|
||||||
|
@ -787,7 +761,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ptc", "--print-token-count"}, "N",
|
{"-ptc", "--print-token-count"}, "N",
|
||||||
format("print token count every N tokens (default: %d)", params.n_print),
|
string_format("print token count every N tokens (default: %d)", params.n_print),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_print = value;
|
params.n_print = value;
|
||||||
}
|
}
|
||||||
|
@ -822,14 +796,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-sp", "--special"},
|
{"-sp", "--special"},
|
||||||
format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.special = true;
|
params.special = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-cnv", "--conversation"},
|
{"-cnv", "--conversation"},
|
||||||
format(
|
string_format(
|
||||||
"run in conversation mode:\n"
|
"run in conversation mode:\n"
|
||||||
"- does not print special tokens and suffix/prefix\n"
|
"- does not print special tokens and suffix/prefix\n"
|
||||||
"- interactive mode is also enabled\n"
|
"- interactive mode is also enabled\n"
|
||||||
|
@ -842,14 +816,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-i", "--interactive"},
|
{"-i", "--interactive"},
|
||||||
format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.interactive = true;
|
params.interactive = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-if", "--interactive-first"},
|
{"-if", "--interactive-first"},
|
||||||
format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
|
string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.interactive_first = true;
|
params.interactive_first = true;
|
||||||
}
|
}
|
||||||
|
@ -894,7 +868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--spm-infill"},
|
{"--spm-infill"},
|
||||||
format(
|
string_format(
|
||||||
"use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
|
"use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)",
|
||||||
params.spm_infill ? "enabled" : "disabled"
|
params.spm_infill ? "enabled" : "disabled"
|
||||||
),
|
),
|
||||||
|
@ -904,7 +878,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--samplers"}, "SAMPLERS",
|
{"--samplers"}, "SAMPLERS",
|
||||||
format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
const auto sampler_names = string_split(value, ';');
|
const auto sampler_names = string_split(value, ';');
|
||||||
params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
|
||||||
|
@ -912,14 +886,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-s", "--seed"}, "SEED",
|
{"-s", "--seed"}, "SEED",
|
||||||
format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
|
string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.seed = std::stoul(value);
|
params.sparams.seed = std::stoul(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--sampling-seq"}, "SEQUENCE",
|
{"--sampling-seq"}, "SEQUENCE",
|
||||||
format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.samplers = common_sampler_types_from_chars(value);
|
params.sparams.samplers = common_sampler_types_from_chars(value);
|
||||||
}
|
}
|
||||||
|
@ -933,14 +907,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--penalize-nl"},
|
{"--penalize-nl"},
|
||||||
format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
|
string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.sparams.penalize_nl = true;
|
params.sparams.penalize_nl = true;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--temp"}, "N",
|
{"--temp"}, "N",
|
||||||
format("temperature (default: %.1f)", (double)params.sparams.temp),
|
string_format("temperature (default: %.1f)", (double)params.sparams.temp),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.temp = std::stof(value);
|
params.sparams.temp = std::stof(value);
|
||||||
params.sparams.temp = std::max(params.sparams.temp, 0.0f);
|
params.sparams.temp = std::max(params.sparams.temp, 0.0f);
|
||||||
|
@ -948,42 +922,56 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--top-k"}, "N",
|
{"--top-k"}, "N",
|
||||||
format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
|
string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.top_k = value;
|
params.sparams.top_k = value;
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--top-p"}, "N",
|
{"--top-p"}, "N",
|
||||||
format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
|
string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.top_p = std::stof(value);
|
params.sparams.top_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--min-p"}, "N",
|
{"--min-p"}, "N",
|
||||||
format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
|
string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.min_p = std::stof(value);
|
params.sparams.min_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--tfs"}, "N",
|
{"--tfs"}, "N",
|
||||||
format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
|
string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.tfs_z = std::stof(value);
|
params.sparams.tfs_z = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--xtc-probability"}, "N",
|
||||||
|
string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
params.sparams.xtc_probability = std::stof(value);
|
||||||
|
}
|
||||||
|
).set_sparam());
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--xtc-threshold"}, "N",
|
||||||
|
string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
params.sparams.xtc_threshold = std::stof(value);
|
||||||
|
}
|
||||||
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--typical"}, "N",
|
{"--typical"}, "N",
|
||||||
format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
|
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.typ_p = std::stof(value);
|
params.sparams.typ_p = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--repeat-last-n"}, "N",
|
{"--repeat-last-n"}, "N",
|
||||||
format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
|
string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.penalty_last_n = value;
|
params.sparams.penalty_last_n = value;
|
||||||
params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
|
params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
|
||||||
|
@ -991,42 +979,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--repeat-penalty"}, "N",
|
{"--repeat-penalty"}, "N",
|
||||||
format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
|
string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.penalty_repeat = std::stof(value);
|
params.sparams.penalty_repeat = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--presence-penalty"}, "N",
|
{"--presence-penalty"}, "N",
|
||||||
format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
|
string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.penalty_present = std::stof(value);
|
params.sparams.penalty_present = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--frequency-penalty"}, "N",
|
{"--frequency-penalty"}, "N",
|
||||||
format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
|
string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.penalty_freq = std::stof(value);
|
params.sparams.penalty_freq = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dynatemp-range"}, "N",
|
{"--dynatemp-range"}, "N",
|
||||||
format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
|
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.dynatemp_range = std::stof(value);
|
params.sparams.dynatemp_range = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--dynatemp-exp"}, "N",
|
{"--dynatemp-exp"}, "N",
|
||||||
format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
|
string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.dynatemp_exponent = std::stof(value);
|
params.sparams.dynatemp_exponent = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mirostat"}, "N",
|
{"--mirostat"}, "N",
|
||||||
format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
|
string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
|
||||||
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
|
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.sparams.mirostat = value;
|
params.sparams.mirostat = value;
|
||||||
|
@ -1034,14 +1022,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mirostat-lr"}, "N",
|
{"--mirostat-lr"}, "N",
|
||||||
format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
|
string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.mirostat_eta = std::stof(value);
|
params.sparams.mirostat_eta = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mirostat-ent"}, "N",
|
{"--mirostat-ent"}, "N",
|
||||||
format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
|
string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.mirostat_tau = std::stof(value);
|
params.sparams.mirostat_tau = std::stof(value);
|
||||||
}
|
}
|
||||||
|
@ -1070,7 +1058,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--grammar"}, "GRAMMAR",
|
{"--grammar"}, "GRAMMAR",
|
||||||
format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
|
string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.sparams.grammar = value;
|
params.sparams.grammar = value;
|
||||||
}
|
}
|
||||||
|
@ -1081,7 +1069,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
std::ifstream file(value);
|
std::ifstream file(value);
|
||||||
if (!file) {
|
if (!file) {
|
||||||
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||||
}
|
}
|
||||||
std::copy(
|
std::copy(
|
||||||
std::istreambuf_iterator<char>(file),
|
std::istreambuf_iterator<char>(file),
|
||||||
|
@ -1151,53 +1139,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
|
).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--yarn-orig-ctx"}, "N",
|
{"--yarn-orig-ctx"}, "N",
|
||||||
format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
|
string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.yarn_orig_ctx = value;
|
params.yarn_orig_ctx = value;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
|
).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--yarn-ext-factor"}, "N",
|
{"--yarn-ext-factor"}, "N",
|
||||||
format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
|
string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.yarn_ext_factor = std::stof(value);
|
params.yarn_ext_factor = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
|
).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--yarn-attn-factor"}, "N",
|
{"--yarn-attn-factor"}, "N",
|
||||||
format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
|
string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.yarn_attn_factor = std::stof(value);
|
params.yarn_attn_factor = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
|
).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--yarn-beta-slow"}, "N",
|
{"--yarn-beta-slow"}, "N",
|
||||||
format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
|
string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.yarn_beta_slow = std::stof(value);
|
params.yarn_beta_slow = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
|
).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--yarn-beta-fast"}, "N",
|
{"--yarn-beta-fast"}, "N",
|
||||||
format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
|
string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.yarn_beta_fast = std::stof(value);
|
params.yarn_beta_fast = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_YARN_BETA_FAST"));
|
).set_env("LLAMA_ARG_YARN_BETA_FAST"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-gan", "--grp-attn-n"}, "N",
|
{"-gan", "--grp-attn-n"}, "N",
|
||||||
format("group-attention factor (default: %d)", params.grp_attn_n),
|
string_format("group-attention factor (default: %d)", params.grp_attn_n),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.grp_attn_n = value;
|
params.grp_attn_n = value;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_GRP_ATTN_N"));
|
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-gaw", "--grp-attn-w"}, "N",
|
{"-gaw", "--grp-attn-w"}, "N",
|
||||||
format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
|
string_format("group-attention width (default: %d)", params.grp_attn_w),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.grp_attn_w = value;
|
params.grp_attn_w = value;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_GRP_ATTN_W"));
|
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-dkvc", "--dump-kv-cache"},
|
{"-dkvc", "--dump-kv-cache"},
|
||||||
"verbose print of the KV cache",
|
"verbose print of the KV cache",
|
||||||
|
@ -1214,7 +1202,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
|
).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ctk", "--cache-type-k"}, "TYPE",
|
{"-ctk", "--cache-type-k"}, "TYPE",
|
||||||
format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
|
string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
// TODO: get the type right here
|
// TODO: get the type right here
|
||||||
params.cache_type_k = value;
|
params.cache_type_k = value;
|
||||||
|
@ -1222,7 +1210,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_env("LLAMA_ARG_CACHE_TYPE_K"));
|
).set_env("LLAMA_ARG_CACHE_TYPE_K"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ctv", "--cache-type-v"}, "TYPE",
|
{"-ctv", "--cache-type-v"}, "TYPE",
|
||||||
format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
|
string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
// TODO: get the type right here
|
// TODO: get the type right here
|
||||||
params.cache_type_v = value;
|
params.cache_type_v = value;
|
||||||
|
@ -1230,7 +1218,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
|
).set_env("LLAMA_ARG_CACHE_TYPE_V"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--perplexity", "--all-logits"},
|
{"--perplexity", "--all-logits"},
|
||||||
format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
}
|
}
|
||||||
|
@ -1244,7 +1232,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--hellaswag-tasks"}, "N",
|
{"--hellaswag-tasks"}, "N",
|
||||||
format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
|
string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.hellaswag_tasks = value;
|
params.hellaswag_tasks = value;
|
||||||
}
|
}
|
||||||
|
@ -1258,7 +1246,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--winogrande-tasks"}, "N",
|
{"--winogrande-tasks"}, "N",
|
||||||
format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
|
string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.winogrande_tasks = value;
|
params.winogrande_tasks = value;
|
||||||
}
|
}
|
||||||
|
@ -1272,7 +1260,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--multiple-choice-tasks"}, "N",
|
{"--multiple-choice-tasks"}, "N",
|
||||||
format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
|
string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.multiple_choice_tasks = value;
|
params.multiple_choice_tasks = value;
|
||||||
}
|
}
|
||||||
|
@ -1293,42 +1281,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--ppl-stride"}, "N",
|
{"--ppl-stride"}, "N",
|
||||||
format("stride for perplexity calculation (default: %d)", params.ppl_stride),
|
string_format("stride for perplexity calculation (default: %d)", params.ppl_stride),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.ppl_stride = value;
|
params.ppl_stride = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--ppl-output-type"}, "<0|1>",
|
{"--ppl-output-type"}, "<0|1>",
|
||||||
format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
|
string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.ppl_output_type = value;
|
params.ppl_output_type = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-dt", "--defrag-thold"}, "N",
|
{"-dt", "--defrag-thold"}, "N",
|
||||||
format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
|
string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.defrag_thold = std::stof(value);
|
params.defrag_thold = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
).set_env("LLAMA_ARG_DEFRAG_THOLD"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-np", "--parallel"}, "N",
|
{"-np", "--parallel"}, "N",
|
||||||
format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_parallel = value;
|
params.n_parallel = value;
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_N_PARALLEL"));
|
).set_env("LLAMA_ARG_N_PARALLEL"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ns", "--sequences"}, "N",
|
{"-ns", "--sequences"}, "N",
|
||||||
format("number of sequences to decode (default: %d)", params.n_sequences),
|
string_format("number of sequences to decode (default: %d)", params.n_sequences),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_sequences = value;
|
params.n_sequences = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
|
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-cb", "--cont-batching"},
|
{"-cb", "--cont-batching"},
|
||||||
format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.cont_batching = true;
|
params.cont_batching = true;
|
||||||
}
|
}
|
||||||
|
@ -1452,7 +1440,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
std::vector<std::string> split_arg{ it, {} };
|
std::vector<std::string> split_arg{ it, {} };
|
||||||
if (split_arg.size() >= llama_max_devices()) {
|
if (split_arg.size() >= llama_max_devices()) {
|
||||||
throw std::invalid_argument(
|
throw std::invalid_argument(
|
||||||
format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
|
string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
||||||
|
@ -1469,7 +1457,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_env("LLAMA_ARG_TENSOR_SPLIT"));
|
).set_env("LLAMA_ARG_TENSOR_SPLIT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-mg", "--main-gpu"}, "INDEX",
|
{"-mg", "--main-gpu"}, "INDEX",
|
||||||
format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
|
string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.main_gpu = value;
|
params.main_gpu = value;
|
||||||
if (!llama_supports_gpu_offload()) {
|
if (!llama_supports_gpu_offload()) {
|
||||||
|
@ -1479,7 +1467,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_env("LLAMA_ARG_MAIN_GPU"));
|
).set_env("LLAMA_ARG_MAIN_GPU"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--check-tensors"},
|
{"--check-tensors"},
|
||||||
format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.check_tensors = true;
|
params.check_tensors = true;
|
||||||
}
|
}
|
||||||
|
@ -1490,7 +1478,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
|
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
|
if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
|
||||||
throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str()));
|
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
|
@ -1544,7 +1532,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
{"-m", "--model"}, "FNAME",
|
{"-m", "--model"}, "FNAME",
|
||||||
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
||||||
? std::string("model path from which to load base model")
|
? std::string("model path from which to load base model")
|
||||||
: format(
|
: string_format(
|
||||||
"model path (default: `models/$filename` with filename from `--hf-file` "
|
"model path (default: `models/$filename` with filename from `--hf-file` "
|
||||||
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
|
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
|
||||||
),
|
),
|
||||||
|
@ -1593,42 +1581,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
std::ifstream file(value, std::ios::binary);
|
std::ifstream file(value, std::ios::binary);
|
||||||
if (!file) {
|
if (!file) {
|
||||||
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||||
}
|
}
|
||||||
params.context_files.push_back(value);
|
params.context_files.push_back(value);
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--chunk-size"}, "N",
|
{"--chunk-size"}, "N",
|
||||||
format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
|
string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.chunk_size = value;
|
params.chunk_size = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--chunk-separator"}, "STRING",
|
{"--chunk-separator"}, "STRING",
|
||||||
format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
|
string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.chunk_separator = value;
|
params.chunk_separator = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--junk"}, "N",
|
{"--junk"}, "N",
|
||||||
format("number of times to repeat the junk text (default: %d)", params.n_junk),
|
string_format("number of times to repeat the junk text (default: %d)", params.n_junk),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_junk = value;
|
params.n_junk = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--pos"}, "N",
|
{"--pos"}, "N",
|
||||||
format("position of the passkey in the junk text (default: %d)", params.i_pos),
|
string_format("position of the passkey in the junk text (default: %d)", params.i_pos),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.i_pos = value;
|
params.i_pos = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
).set_examples({LLAMA_EXAMPLE_PASSKEY}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-o", "--output", "--output-file"}, "FNAME",
|
{"-o", "--output", "--output-file"}, "FNAME",
|
||||||
format("output file (default: '%s')",
|
string_format("output file (default: '%s')",
|
||||||
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
||||||
? params.lora_outfile.c_str()
|
? params.lora_outfile.c_str()
|
||||||
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
|
: ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
|
||||||
|
@ -1642,42 +1630,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-ofreq", "--output-frequency"}, "N",
|
{"-ofreq", "--output-frequency"}, "N",
|
||||||
format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_out_freq = value;
|
params.n_out_freq = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--save-frequency"}, "N",
|
{"--save-frequency"}, "N",
|
||||||
format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
|
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_save_freq = value;
|
params.n_save_freq = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--process-output"},
|
{"--process-output"},
|
||||||
format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
|
string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.process_output = true;
|
params.process_output = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--no-ppl"},
|
{"--no-ppl"},
|
||||||
format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
|
string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.compute_ppl = false;
|
params.compute_ppl = false;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--chunk", "--from-chunk"}, "N",
|
{"--chunk", "--from-chunk"}, "N",
|
||||||
format("start processing the input from chunk N (default: %d)", params.i_chunk),
|
string_format("start processing the input from chunk N (default: %d)", params.i_chunk),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.i_chunk = value;
|
params.i_chunk = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-pps"},
|
{"-pps"},
|
||||||
format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.is_pp_shared = true;
|
params.is_pp_shared = true;
|
||||||
}
|
}
|
||||||
|
@ -1708,7 +1696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--embd-normalize"}, "N",
|
{"--embd-normalize"}, "N",
|
||||||
format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
|
string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.embd_normalize = value;
|
params.embd_normalize = value;
|
||||||
}
|
}
|
||||||
|
@ -1729,35 +1717,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--host"}, "HOST",
|
{"--host"}, "HOST",
|
||||||
format("ip address to listen (default: %s)", params.hostname.c_str()),
|
string_format("ip address to listen (default: %s)", params.hostname.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.hostname = value;
|
params.hostname = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--port"}, "PORT",
|
{"--port"}, "PORT",
|
||||||
format("port to listen (default: %d)", params.port),
|
string_format("port to listen (default: %d)", params.port),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.port = value;
|
params.port = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--path"}, "PATH",
|
{"--path"}, "PATH",
|
||||||
format("path to serve static files from (default: %s)", params.public_path.c_str()),
|
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.public_path = value;
|
params.public_path = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--embedding", "--embeddings"},
|
{"--embedding", "--embeddings"},
|
||||||
format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
|
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--reranking", "--rerank"},
|
{"--reranking", "--rerank"},
|
||||||
format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
|
string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.reranking = true;
|
params.reranking = true;
|
||||||
}
|
}
|
||||||
|
@ -1775,7 +1763,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
std::ifstream key_file(value);
|
std::ifstream key_file(value);
|
||||||
if (!key_file) {
|
if (!key_file) {
|
||||||
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
|
||||||
}
|
}
|
||||||
std::string key;
|
std::string key;
|
||||||
while (std::getline(key_file, key)) {
|
while (std::getline(key_file, key)) {
|
||||||
|
@ -1802,7 +1790,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-to", "--timeout"}, "N",
|
{"-to", "--timeout"}, "N",
|
||||||
format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.timeout_read = value;
|
params.timeout_read = value;
|
||||||
params.timeout_write = value;
|
params.timeout_write = value;
|
||||||
|
@ -1810,45 +1798,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--threads-http"}, "N",
|
{"--threads-http"}, "N",
|
||||||
format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
|
string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_threads_http = value;
|
params.n_threads_http = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-spf", "--system-prompt-file"}, "FNAME",
|
{"--cache-reuse"}, "N",
|
||||||
"set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
|
string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, int value) {
|
||||||
std::ifstream file(value);
|
params.n_cache_reuse = value;
|
||||||
if (!file) {
|
|
||||||
throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
|
|
||||||
}
|
}
|
||||||
std::string system_prompt;
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
|
||||||
std::copy(
|
|
||||||
std::istreambuf_iterator<char>(file),
|
|
||||||
std::istreambuf_iterator<char>(),
|
|
||||||
std::back_inserter(system_prompt)
|
|
||||||
);
|
|
||||||
params.system_prompt = system_prompt;
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--metrics"},
|
{"--metrics"},
|
||||||
format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
|
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.endpoint_metrics = true;
|
params.endpoint_metrics = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--slots"},
|
{"--slots"},
|
||||||
format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.endpoint_slots = true;
|
params.endpoint_slots = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--props"},
|
{"--props"},
|
||||||
format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
|
string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.endpoint_props = true;
|
params.endpoint_props = true;
|
||||||
}
|
}
|
||||||
|
@ -1878,7 +1856,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
"only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
|
"only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template",
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
if (!common_chat_verify_template(value)) {
|
if (!common_chat_verify_template(value)) {
|
||||||
throw std::runtime_error(format(
|
throw std::runtime_error(string_format(
|
||||||
"error: the supplied chat template is not supported: %s\n"
|
"error: the supplied chat template is not supported: %s\n"
|
||||||
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
|
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
|
||||||
value.c_str()
|
value.c_str()
|
||||||
|
@ -1889,14 +1867,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
|
||||||
format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.slot_prompt_similarity = std::stof(value);
|
params.slot_prompt_similarity = std::stof(value);
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--lora-init-without-apply"},
|
{"--lora-init-without-apply"},
|
||||||
format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
|
string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"),
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.lora_init_without_apply = true;
|
params.lora_init_without_apply = true;
|
||||||
}
|
}
|
||||||
|
@ -1921,28 +1899,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
));
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--positive-file"}, "FNAME",
|
{"--positive-file"}, "FNAME",
|
||||||
format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.cvector_positive_file = value;
|
params.cvector_positive_file = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--negative-file"}, "FNAME",
|
{"--negative-file"}, "FNAME",
|
||||||
format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
|
string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()),
|
||||||
[](common_params & params, const std::string & value) {
|
[](common_params & params, const std::string & value) {
|
||||||
params.cvector_negative_file = value;
|
params.cvector_negative_file = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--pca-batch"}, "N",
|
{"--pca-batch"}, "N",
|
||||||
format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
|
string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_pca_batch = value;
|
params.n_pca_batch = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR}));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--pca-iter"}, "N",
|
{"--pca-iter"}, "N",
|
||||||
format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
|
string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.n_pca_iterations = value;
|
params.n_pca_iterations = value;
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
|
#include <climits>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <codecvt>
|
#include <codecvt>
|
||||||
#include <cstdarg>
|
#include <cstdarg>
|
||||||
|
@ -25,10 +26,10 @@
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <thread>
|
|
||||||
|
|
||||||
#if defined(__APPLE__) && defined(__MACH__)
|
#if defined(__APPLE__) && defined(__MACH__)
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
|
@ -402,6 +403,21 @@ std::string common_params_get_system_info(const common_params & params) {
|
||||||
// String utils
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
std::string string_format(const char * fmt, ...) {
|
||||||
|
va_list ap;
|
||||||
|
va_list ap2;
|
||||||
|
va_start(ap, fmt);
|
||||||
|
va_copy(ap2, ap);
|
||||||
|
int size = vsnprintf(NULL, 0, fmt, ap);
|
||||||
|
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
||||||
|
std::vector<char> buf(size + 1);
|
||||||
|
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
||||||
|
GGML_ASSERT(size2 == size);
|
||||||
|
va_end(ap2);
|
||||||
|
va_end(ap);
|
||||||
|
return std::string(buf.data(), size);
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<std::string> string_split(std::string input, char separator) {
|
std::vector<std::string> string_split(std::string input, char separator) {
|
||||||
std::vector<std::string> parts;
|
std::vector<std::string> parts;
|
||||||
size_t separator_pos = input.find(separator);
|
size_t separator_pos = input.find(separator);
|
||||||
|
@ -2090,6 +2106,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons
|
||||||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||||
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
|
||||||
|
fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
|
||||||
|
fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
|
||||||
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
|
||||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||||
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
|
||||||
|
|
|
@ -86,6 +86,8 @@ enum common_sampler_type {
|
||||||
COMMON_SAMPLER_TYPE_TFS_Z = 4,
|
COMMON_SAMPLER_TYPE_TFS_Z = 4,
|
||||||
COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
|
COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||||
COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
|
COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||||
|
COMMON_SAMPLER_TYPE_XTC = 7,
|
||||||
|
COMMON_SAMPLER_TYPE_INFILL = 8,
|
||||||
};
|
};
|
||||||
|
|
||||||
// dimensionality reduction methods, used by cvector-generator
|
// dimensionality reduction methods, used by cvector-generator
|
||||||
|
@ -104,6 +106,8 @@ struct common_sampler_params {
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
float min_p = 0.05f; // 0.0 = disabled
|
float min_p = 0.05f; // 0.0 = disabled
|
||||||
|
float xtc_probability = 0.00f; // 0.0 = disabled
|
||||||
|
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||||
|
@ -120,13 +124,15 @@ struct common_sampler_params {
|
||||||
bool ignore_eos = false;
|
bool ignore_eos = false;
|
||||||
bool no_perf = false; // disable performance metrics
|
bool no_perf = false; // disable performance metrics
|
||||||
|
|
||||||
|
|
||||||
std::vector<enum common_sampler_type> samplers = {
|
std::vector<enum common_sampler_type> samplers = {
|
||||||
COMMON_SAMPLER_TYPE_TOP_K,
|
COMMON_SAMPLER_TYPE_TOP_K,
|
||||||
COMMON_SAMPLER_TYPE_TFS_Z,
|
COMMON_SAMPLER_TYPE_TFS_Z,
|
||||||
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
COMMON_SAMPLER_TYPE_TYPICAL_P,
|
||||||
COMMON_SAMPLER_TYPE_TOP_P,
|
COMMON_SAMPLER_TYPE_TOP_P,
|
||||||
COMMON_SAMPLER_TYPE_MIN_P,
|
COMMON_SAMPLER_TYPE_MIN_P,
|
||||||
COMMON_SAMPLER_TYPE_TEMPERATURE
|
COMMON_SAMPLER_TYPE_XTC,
|
||||||
|
COMMON_SAMPLER_TYPE_TEMPERATURE,
|
||||||
};
|
};
|
||||||
|
|
||||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||||
|
@ -273,12 +279,12 @@ struct common_params {
|
||||||
int32_t port = 8080; // server listens on this network port
|
int32_t port = 8080; // server listens on this network port
|
||||||
int32_t timeout_read = 600; // http read timeout in seconds
|
int32_t timeout_read = 600; // http read timeout in seconds
|
||||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||||
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||||
|
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||||
|
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::string public_path = ""; // NOLINT
|
std::string public_path = ""; // NOLINT
|
||||||
std::string chat_template = ""; // NOLINT
|
std::string chat_template = ""; // NOLINT
|
||||||
std::string system_prompt = ""; // NOLINT
|
|
||||||
bool enable_chat_template = true;
|
bool enable_chat_template = true;
|
||||||
|
|
||||||
std::vector<std::string> api_keys;
|
std::vector<std::string> api_keys;
|
||||||
|
@ -357,6 +363,19 @@ bool set_process_priority(enum ggml_sched_priority prio);
|
||||||
// String utils
|
// String utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
#ifdef __GNUC__
|
||||||
|
#ifdef __MINGW32__
|
||||||
|
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||||
|
#else
|
||||||
|
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
||||||
|
std::string string_format(const char * fmt, ...);
|
||||||
|
|
||||||
std::vector<std::string> string_split(std::string input, char separator);
|
std::vector<std::string> string_split(std::string input, char separator);
|
||||||
|
|
||||||
std::string string_strip(const std::string & str);
|
std::string string_strip(const std::string & str);
|
||||||
|
|
|
@ -611,7 +611,7 @@ private:
|
||||||
}
|
}
|
||||||
return join_seq();
|
return join_seq();
|
||||||
};
|
};
|
||||||
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
|
return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space");
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -130,10 +130,10 @@ std::string common_sampler_params::print() const {
|
||||||
|
|
||||||
snprintf(result, sizeof(result),
|
snprintf(result, sizeof(result),
|
||||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||||
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
||||||
top_k, tfs_z, top_p, min_p, typ_p, temp,
|
top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp,
|
||||||
mirostat, mirostat_eta, mirostat_tau);
|
mirostat, mirostat_eta, mirostat_tau);
|
||||||
|
|
||||||
return std::string(result);
|
return std::string(result);
|
||||||
|
@ -184,6 +184,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||||
case COMMON_SAMPLER_TYPE_MIN_P:
|
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||||
break;
|
break;
|
||||||
|
case COMMON_SAMPLER_TYPE_XTC:
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
||||||
|
break;
|
||||||
case COMMON_SAMPLER_TYPE_TFS_Z:
|
case COMMON_SAMPLER_TYPE_TFS_Z:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||||
break;
|
break;
|
||||||
|
@ -193,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||||
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||||
break;
|
break;
|
||||||
|
case COMMON_SAMPLER_TYPE_INFILL:
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false && "unknown sampler type");
|
GGML_ASSERT(false && "unknown sampler type");
|
||||||
}
|
}
|
||||||
|
@ -372,6 +378,8 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
||||||
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
||||||
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
||||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||||
|
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
||||||
|
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
|
||||||
default : return '?';
|
default : return '?';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -384,6 +392,8 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||||
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||||
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||||
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||||
|
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
||||||
|
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
|
||||||
default : return "";
|
default : return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -396,6 +406,8 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
||||||
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||||
{ "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
|
{ "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
|
||||||
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||||
|
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
|
||||||
|
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
|
||||||
};
|
};
|
||||||
|
|
||||||
// since samplers names are written multiple ways
|
// since samplers names are written multiple ways
|
||||||
|
@ -441,7 +453,9 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
||||||
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||||
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
||||||
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<common_sampler_type> samplers;
|
std::vector<common_sampler_type> samplers;
|
||||||
|
|
|
@ -540,7 +540,7 @@ class SchemaConverter:
|
||||||
return self._add_rule(
|
return self._add_rule(
|
||||||
name,
|
name,
|
||||||
to_rule(transform()) if self._raw_pattern \
|
to_rule(transform()) if self._raw_pattern \
|
||||||
else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
|
else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space")
|
||||||
|
|
||||||
|
|
||||||
def _resolve_ref(self, ref):
|
def _resolve_ref(self, ref):
|
||||||
|
|
|
@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
||||||
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
|
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
|
||||||
if (!image_embed_result) {
|
if (!image_embed_result) {
|
||||||
clip_image_u8_free(img);
|
clip_image_u8_free(img);
|
||||||
LOG_ERR("%s: coulnd't embed the image\n", __func__);
|
LOG_ERR("%s: couldn't embed the image\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -570,7 +570,8 @@ int main(int argc, char ** argv) {
|
||||||
if (!params.ctx_shift){
|
if (!params.ctx_shift){
|
||||||
LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
|
LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
|
||||||
break;
|
break;
|
||||||
} else {
|
}
|
||||||
|
|
||||||
if (params.n_predict == -2) {
|
if (params.n_predict == -2) {
|
||||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||||
break;
|
break;
|
||||||
|
@ -594,7 +595,6 @@ int main(int argc, char ** argv) {
|
||||||
LOG_DBG("clear session path\n");
|
LOG_DBG("clear session path\n");
|
||||||
path_session.clear();
|
path_session.clear();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// context extension via Self-Extend
|
// context extension via Self-Extend
|
||||||
while (n_past >= ga_i + ga_w) {
|
while (n_past >= ga_i + ga_w) {
|
||||||
|
|
|
@ -43,6 +43,8 @@
|
||||||
top_k: 0, // <= 0 to use vocab size
|
top_k: 0, // <= 0 to use vocab size
|
||||||
top_p: 1.0, // 1.0 = disabled
|
top_p: 1.0, // 1.0 = disabled
|
||||||
min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
|
min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4
|
||||||
|
xtc_probability: 0.0, // 0 = disabled;
|
||||||
|
xtc_threshold: 0.1, // > 0.5 disables XTC;
|
||||||
tfs_z: 1.0, // 1.0 = disabled
|
tfs_z: 1.0, // 1.0 = disabled
|
||||||
typical_p: 1.0, // 1.0 = disabled
|
typical_p: 1.0, // 1.0 = disabled
|
||||||
presence_penalty: 0.0, // 0.0 = disabled
|
presence_penalty: 0.0, // 0.0 = disabled
|
||||||
|
@ -836,6 +838,8 @@ return html`
|
||||||
${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
|
${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
|
||||||
${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
|
${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
|
||||||
${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
|
${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
|
||||||
|
${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
|
||||||
|
${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
|
||||||
${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
|
${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
|
||||||
|
@ -1132,6 +1136,8 @@ document.addEventListener('DOMContentLoaded', (event) => {
|
||||||
const snapSettings = {
|
const snapSettings = {
|
||||||
temperature: { snapValue: 1.0, snapRangeMultiplier: 6 },
|
temperature: { snapValue: 1.0, snapRangeMultiplier: 6 },
|
||||||
min_p: { snapValue: 0.05, snapRangeMultiplier: 2 },
|
min_p: { snapValue: 0.05, snapRangeMultiplier: 2 },
|
||||||
|
xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 },
|
||||||
|
xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 },
|
||||||
top_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
|
top_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
|
||||||
tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 },
|
tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 },
|
||||||
typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
|
typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 },
|
||||||
|
|
|
@ -307,6 +307,8 @@
|
||||||
top_k: 40, // <= 0 to use vocab size
|
top_k: 40, // <= 0 to use vocab size
|
||||||
top_p: 0.95, // 1.0 = disabled
|
top_p: 0.95, // 1.0 = disabled
|
||||||
min_p: 0.05, // 0 = disabled
|
min_p: 0.05, // 0 = disabled
|
||||||
|
xtc_probability: 0.0, // 0 = disabled;
|
||||||
|
xtc_threshold: 0.1, // > 0.5 disables XTC;
|
||||||
tfs_z: 1.0, // 1.0 = disabled
|
tfs_z: 1.0, // 1.0 = disabled
|
||||||
typical_p: 1.0, // 1.0 = disabled
|
typical_p: 1.0, // 1.0 = disabled
|
||||||
presence_penalty: 0.0, // 0.0 = disabled
|
presence_penalty: 0.0, // 0.0 = disabled
|
||||||
|
@ -1013,6 +1015,8 @@
|
||||||
${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
|
${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
|
||||||
${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
|
${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
|
||||||
${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
|
${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
|
||||||
|
${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })}
|
||||||
|
${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
|
||||||
</fieldset>
|
</fieldset>
|
||||||
<hr />
|
<hr />
|
||||||
<fieldset class="three">
|
<fieldset class="three">
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -529,7 +529,7 @@ export class SchemaConverter {
|
||||||
return joinSeq();
|
return joinSeq();
|
||||||
};
|
};
|
||||||
|
|
||||||
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
|
return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space")
|
||||||
}
|
}
|
||||||
|
|
||||||
_notStrings(strings) {
|
_notStrings(strings) {
|
||||||
|
|
|
@ -133,10 +133,10 @@ struct slot_params {
|
||||||
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
|
|
||||||
std::vector<std::string> antiprompt;
|
int64_t t_max_prompt_ms = -1; // TODO: implement
|
||||||
|
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||||
|
|
||||||
json input_prefix;
|
std::vector<std::string> antiprompt;
|
||||||
json input_suffix;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct server_slot {
|
struct server_slot {
|
||||||
|
@ -166,8 +166,13 @@ struct server_slot {
|
||||||
|
|
||||||
json prompt; // can be either a string, array of strings or array of token ids
|
json prompt; // can be either a string, array of strings or array of token ids
|
||||||
|
|
||||||
|
json input_prefix;
|
||||||
|
json input_suffix;
|
||||||
|
json input_extra;
|
||||||
|
|
||||||
// when a task is submitted, we first tokenize the prompt and store it here
|
// when a task is submitted, we first tokenize the prompt and store it here
|
||||||
std::vector<llama_token> prompt_tokens;
|
std::vector<llama_token> prompt_tokens;
|
||||||
|
std::vector<llama_token> extra_tokens;
|
||||||
|
|
||||||
std::string generated_text;
|
std::string generated_text;
|
||||||
std::vector<llama_token> cache_tokens;
|
std::vector<llama_token> cache_tokens;
|
||||||
|
@ -176,6 +181,7 @@ struct server_slot {
|
||||||
server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
|
server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
|
||||||
|
|
||||||
bool has_next_token = true;
|
bool has_next_token = true;
|
||||||
|
bool has_new_line = false;
|
||||||
bool truncated = false;
|
bool truncated = false;
|
||||||
bool stopped_eos = false;
|
bool stopped_eos = false;
|
||||||
bool stopped_word = false;
|
bool stopped_word = false;
|
||||||
|
@ -194,12 +200,6 @@ struct server_slot {
|
||||||
|
|
||||||
llama_token sampled;
|
llama_token sampled;
|
||||||
|
|
||||||
int32_t ga_i = 0; // group-attention state
|
|
||||||
int32_t ga_n = 1; // group-attention factor
|
|
||||||
int32_t ga_w = 512; // group-attention width
|
|
||||||
|
|
||||||
int32_t n_past_se = 0; // self-extend
|
|
||||||
|
|
||||||
// stats
|
// stats
|
||||||
size_t n_sent_text = 0; // number of sent text character
|
size_t n_sent_text = 0; // number of sent text character
|
||||||
size_t n_sent_token_probs = 0;
|
size_t n_sent_token_probs = 0;
|
||||||
|
@ -217,6 +217,7 @@ struct server_slot {
|
||||||
|
|
||||||
n_prompt_tokens = 0;
|
n_prompt_tokens = 0;
|
||||||
generated_text = "";
|
generated_text = "";
|
||||||
|
has_new_line = false;
|
||||||
truncated = false;
|
truncated = false;
|
||||||
stopped_eos = false;
|
stopped_eos = false;
|
||||||
stopped_word = false;
|
stopped_word = false;
|
||||||
|
@ -226,8 +227,6 @@ struct server_slot {
|
||||||
n_sent_text = 0;
|
n_sent_text = 0;
|
||||||
n_sent_token_probs = 0;
|
n_sent_token_probs = 0;
|
||||||
cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
|
cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
|
||||||
ga_i = 0;
|
|
||||||
n_past_se = 0;
|
|
||||||
|
|
||||||
generated_token_probs.clear();
|
generated_token_probs.clear();
|
||||||
}
|
}
|
||||||
|
@ -624,12 +623,6 @@ struct server_context {
|
||||||
|
|
||||||
int32_t n_ctx; // total context for all clients / slots
|
int32_t n_ctx; // total context for all clients / slots
|
||||||
|
|
||||||
// system prompt
|
|
||||||
bool system_need_update = false;
|
|
||||||
|
|
||||||
std::string system_prompt;
|
|
||||||
std::vector<llama_token> system_tokens;
|
|
||||||
|
|
||||||
// slots / clients
|
// slots / clients
|
||||||
std::vector<server_slot> slots;
|
std::vector<server_slot> slots;
|
||||||
json default_generation_settings_for_props;
|
json default_generation_settings_for_props;
|
||||||
|
@ -666,7 +659,7 @@ struct server_context {
|
||||||
bool load_model(const common_params & params_) {
|
bool load_model(const common_params & params_) {
|
||||||
params = params_;
|
params = params_;
|
||||||
|
|
||||||
// dedicate one sequence to the system prompt
|
// reserve one extra sequence (seq_id == 0) for extra features
|
||||||
params.n_parallel += 1;
|
params.n_parallel += 1;
|
||||||
|
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
@ -712,22 +705,6 @@ struct server_context {
|
||||||
|
|
||||||
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
||||||
|
|
||||||
const int ga_n = params.grp_attn_n;
|
|
||||||
const int ga_w = params.grp_attn_w;
|
|
||||||
|
|
||||||
if (ga_n != 1) {
|
|
||||||
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
|
|
||||||
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
|
|
||||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
|
|
||||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
|
|
||||||
|
|
||||||
SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w);
|
|
||||||
}
|
|
||||||
|
|
||||||
slot.ga_i = 0;
|
|
||||||
slot.ga_n = ga_n;
|
|
||||||
slot.ga_w = ga_w;
|
|
||||||
|
|
||||||
slot.sparams = params.sparams;
|
slot.sparams = params.sparams;
|
||||||
|
|
||||||
slot.callback_on_release = [this](int) {
|
slot.callback_on_release = [this](int) {
|
||||||
|
@ -754,12 +731,7 @@ struct server_context {
|
||||||
metrics.init();
|
metrics.init();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
|
std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
|
||||||
// TODO: currently, we tokenize using special tokens by default
|
|
||||||
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
|
||||||
// but it's better compared to completely ignoring ChatML and other chat templates
|
|
||||||
const bool TMP_FORCE_SPECIAL = true;
|
|
||||||
|
|
||||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||||
// or the first element of the json_prompt array is a string.
|
// or the first element of the json_prompt array is a string.
|
||||||
std::vector<llama_token> prompt_tokens;
|
std::vector<llama_token> prompt_tokens;
|
||||||
|
@ -772,10 +744,10 @@ struct server_context {
|
||||||
|
|
||||||
std::vector<llama_token> p;
|
std::vector<llama_token> p;
|
||||||
if (first) {
|
if (first) {
|
||||||
p = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
p = common_tokenize(ctx, s, add_special, parse_special);
|
||||||
first = false;
|
first = false;
|
||||||
} else {
|
} else {
|
||||||
p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
p = common_tokenize(ctx, s, false, parse_special);
|
||||||
}
|
}
|
||||||
|
|
||||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||||
|
@ -789,7 +761,7 @@ struct server_context {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
auto s = json_prompt.template get<std::string>();
|
auto s = json_prompt.template get<std::string>();
|
||||||
prompt_tokens = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
|
||||||
}
|
}
|
||||||
|
|
||||||
return prompt_tokens;
|
return prompt_tokens;
|
||||||
|
@ -831,7 +803,7 @@ struct server_context {
|
||||||
int slot_prompt_len = slot_prompt.size();
|
int slot_prompt_len = slot_prompt.size();
|
||||||
|
|
||||||
// length of the Longest Common Prefix between the current slot's prompt and the input prompt
|
// length of the Longest Common Prefix between the current slot's prompt and the input prompt
|
||||||
int lcp_len = common_part(slot_prompt, prompt);
|
int lcp_len = longest_common_prefix(slot_prompt, prompt);
|
||||||
|
|
||||||
// fraction of the common substring length compared to the current slot's prompt length
|
// fraction of the common substring length compared to the current slot's prompt length
|
||||||
similarity = static_cast<float>(lcp_len) / slot_prompt_len;
|
similarity = static_cast<float>(lcp_len) / slot_prompt_len;
|
||||||
|
@ -892,6 +864,8 @@ struct server_context {
|
||||||
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||||
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||||
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
||||||
|
slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability);
|
||||||
|
slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold);
|
||||||
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||||
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
|
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
|
||||||
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||||
|
@ -910,6 +884,8 @@ struct server_context {
|
||||||
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
|
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
|
||||||
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||||
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||||
|
//slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement
|
||||||
|
slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms);
|
||||||
|
|
||||||
// process "json_schema" and "grammar"
|
// process "json_schema" and "grammar"
|
||||||
if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
|
if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
|
||||||
|
@ -928,11 +904,6 @@ struct server_context {
|
||||||
slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (slot.params.cache_prompt && slot.ga_n != 1) {
|
|
||||||
slot.params.cache_prompt = false;
|
|
||||||
SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
|
if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
|
||||||
// Might be better to reject the request with a 400 ?
|
// Might be better to reject the request with a 400 ?
|
||||||
slot.params.n_predict = slot.n_predict;
|
slot.params.n_predict = slot.n_predict;
|
||||||
|
@ -940,11 +911,29 @@ struct server_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
// infill
|
// infill
|
||||||
slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix);
|
slot.input_prefix = json_value(data, "input_prefix", json());
|
||||||
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
|
slot.input_suffix = json_value(data, "input_suffix", json());
|
||||||
|
slot.input_extra = json_value(data, "input_extra", json());
|
||||||
|
|
||||||
|
SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size());
|
||||||
|
for (const auto & chunk : slot.input_extra) {
|
||||||
|
// { "text": string, "filename": string }
|
||||||
|
if (!chunk.contains("text") || !chunk["text"].is_string()) {
|
||||||
|
send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// filename is optional
|
||||||
|
if (chunk.contains("filename") && !chunk["filename"].is_string()) {
|
||||||
|
send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str());
|
||||||
|
}
|
||||||
|
|
||||||
// get prompt
|
// get prompt
|
||||||
if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) {
|
{
|
||||||
const auto & prompt = data.find("prompt");
|
const auto & prompt = data.find("prompt");
|
||||||
if (prompt == data.end()) {
|
if (prompt == data.end()) {
|
||||||
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
|
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
|
||||||
|
@ -1067,51 +1056,6 @@ struct server_context {
|
||||||
clean_kv_cache = false;
|
clean_kv_cache = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void system_prompt_update() {
|
|
||||||
SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str());
|
|
||||||
|
|
||||||
kv_cache_clear();
|
|
||||||
system_tokens.clear();
|
|
||||||
|
|
||||||
if (!system_prompt.empty()) {
|
|
||||||
system_tokens = common_tokenize(ctx, system_prompt, true);
|
|
||||||
|
|
||||||
const int32_t n_batch = llama_n_batch(ctx);
|
|
||||||
const int32_t n_tokens_prompt = system_tokens.size();
|
|
||||||
|
|
||||||
for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
|
|
||||||
const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
|
|
||||||
|
|
||||||
common_batch_clear(batch);
|
|
||||||
|
|
||||||
for (int32_t j = 0; j < n_tokens; ++j) {
|
|
||||||
common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (llama_decode(ctx, batch) != 0) {
|
|
||||||
SRV_ERR("%s", "llama_decode() failed\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// assign the system KV cache to all parallel sequences
|
|
||||||
for (int32_t i = 1; i <= params.n_parallel; ++i) {
|
|
||||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
system_need_update = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool system_prompt_set(const std::string & sys_prompt) {
|
|
||||||
SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
|
|
||||||
|
|
||||||
system_prompt = sys_prompt;
|
|
||||||
// update system_tokens and KV cache as soon as all slots are idle
|
|
||||||
system_need_update = true;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool process_token(completion_token_output & result, server_slot & slot) {
|
bool process_token(completion_token_output & result, server_slot & slot) {
|
||||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||||
const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
|
const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
|
||||||
|
@ -1147,22 +1091,21 @@ struct server_context {
|
||||||
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
||||||
|
|
||||||
const std::string str_test = slot.generated_text.substr(pos);
|
const std::string str_test = slot.generated_text.substr(pos);
|
||||||
bool is_stop_full = false;
|
bool send_text = true;
|
||||||
|
|
||||||
size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
|
size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL);
|
||||||
if (stop_pos != std::string::npos) {
|
if (stop_pos != std::string::npos) {
|
||||||
is_stop_full = true;
|
|
||||||
slot.generated_text.erase(
|
slot.generated_text.erase(
|
||||||
slot.generated_text.begin() + pos + stop_pos,
|
slot.generated_text.begin() + pos + stop_pos,
|
||||||
slot.generated_text.end());
|
slot.generated_text.end());
|
||||||
pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
||||||
} else {
|
} else if (slot.has_next_token) {
|
||||||
is_stop_full = false;
|
|
||||||
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
|
stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL);
|
||||||
|
send_text = stop_pos == std::string::npos;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check if there is any token to predict
|
// check if there is any token to predict
|
||||||
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) {
|
if (send_text) {
|
||||||
// no send the stop word in the response
|
// no send the stop word in the response
|
||||||
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
|
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
|
||||||
slot.n_sent_text += result.text_to_send.size();
|
slot.n_sent_text += result.text_to_send.size();
|
||||||
|
@ -1187,13 +1130,28 @@ struct server_context {
|
||||||
SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
|
SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if we have already seen a new line, we stop after a certain time limit
|
||||||
|
if (slot.has_new_line && slot.params.t_max_predict_ms > 0 &&
|
||||||
|
(ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
|
||||||
|
slot.stopped_limit = true;
|
||||||
|
slot.has_next_token = false;
|
||||||
|
|
||||||
|
SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if there is a new line in the generated text
|
||||||
|
if (result.text_to_send.find('\n') != std::string::npos) {
|
||||||
|
slot.has_new_line = true;
|
||||||
|
}
|
||||||
|
|
||||||
// if context shift is disabled, we stop when it reaches the context limit
|
// if context shift is disabled, we stop when it reaches the context limit
|
||||||
if (slot.n_decoded >= slot.n_ctx) {
|
if (slot.n_past >= slot.n_ctx) {
|
||||||
slot.truncated = true;
|
slot.truncated = true;
|
||||||
slot.stopped_limit = true;
|
slot.stopped_limit = true;
|
||||||
slot.has_next_token = false;
|
slot.has_next_token = false;
|
||||||
|
|
||||||
SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
|
SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
|
||||||
|
slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_token_is_eog(model, result.tok)) {
|
if (llama_token_is_eog(model, result.tok)) {
|
||||||
|
@ -1205,18 +1163,18 @@ struct server_context {
|
||||||
|
|
||||||
const auto n_ctx_train = llama_n_ctx_train(model);
|
const auto n_ctx_train = llama_n_ctx_train(model);
|
||||||
|
|
||||||
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
|
if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
|
||||||
slot.truncated = true;
|
slot.truncated = true;
|
||||||
slot.stopped_limit = true;
|
slot.stopped_limit = true;
|
||||||
slot.has_next_token = false; // stop prediction
|
slot.has_next_token = false; // stop prediction
|
||||||
|
|
||||||
SLT_WRN(slot,
|
SLT_WRN(slot,
|
||||||
"n_predict (%d) is not set and self-context extend is disabled. "
|
"n_predict (%d) is set for infinite generation. "
|
||||||
"Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
|
"Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
|
||||||
slot.params.n_predict, n_ctx_train);
|
slot.params.n_predict, n_ctx_train);
|
||||||
}
|
}
|
||||||
|
|
||||||
SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
|
SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
|
||||||
|
|
||||||
return slot.has_next_token; // continue
|
return slot.has_next_token; // continue
|
||||||
}
|
}
|
||||||
|
@ -1240,6 +1198,8 @@ struct server_context {
|
||||||
{"top_k", slot.sparams.top_k},
|
{"top_k", slot.sparams.top_k},
|
||||||
{"top_p", slot.sparams.top_p},
|
{"top_p", slot.sparams.top_p},
|
||||||
{"min_p", slot.sparams.min_p},
|
{"min_p", slot.sparams.min_p},
|
||||||
|
{"xtc_probability", slot.sparams.xtc_probability},
|
||||||
|
{"xtc_threshold", slot.sparams.xtc_threshold},
|
||||||
{"tfs_z", slot.sparams.tfs_z},
|
{"tfs_z", slot.sparams.tfs_z},
|
||||||
{"typical_p", slot.sparams.typ_p},
|
{"typical_p", slot.sparams.typ_p},
|
||||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||||
|
@ -1335,6 +1295,7 @@ struct server_context {
|
||||||
{"tokens_evaluated", slot.n_prompt_tokens},
|
{"tokens_evaluated", slot.n_prompt_tokens},
|
||||||
{"generation_settings", get_formated_generation(slot)},
|
{"generation_settings", get_formated_generation(slot)},
|
||||||
{"prompt", slot.prompt},
|
{"prompt", slot.prompt},
|
||||||
|
{"has_new_line", slot.has_new_line},
|
||||||
{"truncated", slot.truncated},
|
{"truncated", slot.truncated},
|
||||||
{"stopped_eos", slot.stopped_eos},
|
{"stopped_eos", slot.stopped_eos},
|
||||||
{"stopped_word", slot.stopped_word},
|
{"stopped_word", slot.stopped_word},
|
||||||
|
@ -1484,9 +1445,8 @@ struct server_context {
|
||||||
if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
|
if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
|
||||||
data["index"] = 0;
|
data["index"] = 0;
|
||||||
create_task(data, false, nullptr);
|
create_task(data, false, nullptr);
|
||||||
}
|
} else if (prompt.is_array()) {
|
||||||
// otherwise, it's a multiple-prompt task, we break it into smaller tasks
|
// otherwise, it's a multiple-prompt task, we break it into smaller tasks
|
||||||
else if (prompt.is_array()) {
|
|
||||||
std::vector<json> prompts = prompt;
|
std::vector<json> prompts = prompt;
|
||||||
if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
|
if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
|
||||||
// prompts[0] is the question
|
// prompts[0] is the question
|
||||||
|
@ -1511,9 +1471,8 @@ struct server_context {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
// invalid case
|
// invalid case
|
||||||
else {
|
|
||||||
throw std::runtime_error(error_msg);
|
throw std::runtime_error(error_msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1663,6 +1622,7 @@ struct server_context {
|
||||||
slot_data["prompt"] = slot.prompt;
|
slot_data["prompt"] = slot.prompt;
|
||||||
slot_data["next_token"] = {
|
slot_data["next_token"] = {
|
||||||
{"has_next_token", slot.has_next_token},
|
{"has_next_token", slot.has_next_token},
|
||||||
|
{"has_new_line", slot.has_new_line},
|
||||||
{"n_remain", slot.n_remaining},
|
{"n_remain", slot.n_remaining},
|
||||||
{"n_decoded", slot.n_decoded},
|
{"n_decoded", slot.n_decoded},
|
||||||
{"stopped_eos", slot.stopped_eos},
|
{"stopped_eos", slot.stopped_eos},
|
||||||
|
@ -1786,6 +1746,9 @@ struct server_context {
|
||||||
}
|
}
|
||||||
slot->cache_tokens.resize(token_count);
|
slot->cache_tokens.resize(token_count);
|
||||||
|
|
||||||
|
// TODO: maybe detokenize the slot->cache_tokens instead?
|
||||||
|
slot->prompt = string_format("[restored %d tokens from file]", (int) token_count);
|
||||||
|
|
||||||
const int64_t t_end = ggml_time_us();
|
const int64_t t_end = ggml_time_us();
|
||||||
const double t_restore_ms = (t_end - t_start) / 1000.0;
|
const double t_restore_ms = (t_end - t_start) / 1000.0;
|
||||||
|
|
||||||
|
@ -1860,12 +1823,8 @@ struct server_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (all_idle) {
|
if (all_idle) {
|
||||||
if (system_need_update) {
|
|
||||||
system_prompt_update();
|
|
||||||
}
|
|
||||||
|
|
||||||
SRV_INF("%s", "all slots are idle\n");
|
SRV_INF("%s", "all slots are idle\n");
|
||||||
if (system_prompt.empty() && clean_kv_cache) {
|
if (clean_kv_cache) {
|
||||||
kv_cache_clear();
|
kv_cache_clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1886,8 +1845,7 @@ struct server_context {
|
||||||
// apply context-shift if needed
|
// apply context-shift if needed
|
||||||
// TODO: simplify and improve
|
// TODO: simplify and improve
|
||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
if (slot.ga_n == 1) {
|
if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
|
||||||
if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
|
|
||||||
if (!params.ctx_shift) {
|
if (!params.ctx_shift) {
|
||||||
// this check is redundant (for good)
|
// this check is redundant (for good)
|
||||||
// we should never get here, because generation should already stopped in process_token()
|
// we should never get here, because generation should already stopped in process_token()
|
||||||
|
@ -1898,13 +1856,13 @@ struct server_context {
|
||||||
|
|
||||||
// Shift context
|
// Shift context
|
||||||
const int n_keep = slot.params.n_keep + add_bos_token;
|
const int n_keep = slot.params.n_keep + add_bos_token;
|
||||||
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
|
const int n_left = slot.n_past - n_keep;
|
||||||
const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
|
const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);
|
||||||
|
|
||||||
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
|
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
|
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
|
||||||
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
|
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
|
||||||
|
|
||||||
if (slot.params.cache_prompt) {
|
if (slot.params.cache_prompt) {
|
||||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
|
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
|
||||||
|
@ -1919,7 +1877,6 @@ struct server_context {
|
||||||
slot.truncated = true;
|
slot.truncated = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// start populating the batch for this iteration
|
// start populating the batch for this iteration
|
||||||
common_batch_clear(batch);
|
common_batch_clear(batch);
|
||||||
|
@ -1932,11 +1889,7 @@ struct server_context {
|
||||||
|
|
||||||
slot.i_batch = batch.n_tokens;
|
slot.i_batch = batch.n_tokens;
|
||||||
|
|
||||||
const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
|
||||||
|
|
||||||
// TODO: we always have to take into account the "system_tokens"
|
|
||||||
// this is not great and needs to be improved somehow
|
|
||||||
common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
|
|
||||||
|
|
||||||
slot.n_past += 1;
|
slot.n_past += 1;
|
||||||
|
|
||||||
|
@ -1944,8 +1897,8 @@ struct server_context {
|
||||||
slot.cache_tokens.push_back(slot.sampled);
|
slot.cache_tokens.push_back(slot.sampled);
|
||||||
}
|
}
|
||||||
|
|
||||||
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
|
SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
|
||||||
slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
|
slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
|
||||||
}
|
}
|
||||||
|
|
||||||
// process in chunks of params.n_batch
|
// process in chunks of params.n_batch
|
||||||
|
@ -1972,39 +1925,14 @@ struct server_context {
|
||||||
slot.t_start_process_prompt = ggml_time_us();
|
slot.t_start_process_prompt = ggml_time_us();
|
||||||
slot.t_start_generation = 0;
|
slot.t_start_generation = 0;
|
||||||
|
|
||||||
if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) {
|
switch (slot.cmpl_type) {
|
||||||
const bool add_bos = llama_add_bos_token(model);
|
case SERVER_TASK_CMPL_TYPE_NORMAL:
|
||||||
bool suff_rm_leading_spc = true;
|
case SERVER_TASK_CMPL_TYPE_EMBEDDING:
|
||||||
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
{
|
||||||
params.input_suffix.erase(0, 1);
|
prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true);
|
||||||
suff_rm_leading_spc = false;
|
} break;
|
||||||
}
|
case SERVER_TASK_CMPL_TYPE_RERANK:
|
||||||
|
{
|
||||||
auto prefix_tokens = tokenize(slot.params.input_prefix, false);
|
|
||||||
auto suffix_tokens = tokenize(slot.params.input_suffix, false);
|
|
||||||
|
|
||||||
const int space_token = 29871; // TODO: this should not be hardcoded
|
|
||||||
if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
|
|
||||||
suffix_tokens.erase(suffix_tokens.begin());
|
|
||||||
}
|
|
||||||
|
|
||||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
|
|
||||||
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
|
|
||||||
|
|
||||||
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
|
|
||||||
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
|
|
||||||
if (add_bos) {
|
|
||||||
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
|
||||||
}
|
|
||||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
|
||||||
|
|
||||||
const llama_token middle_token = llama_token_middle(model);
|
|
||||||
if (middle_token >= 0) {
|
|
||||||
embd_inp.push_back(middle_token);
|
|
||||||
}
|
|
||||||
|
|
||||||
prompt_tokens = embd_inp;
|
|
||||||
} else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
|
|
||||||
// require slot.prompt to be array of 2 strings
|
// require slot.prompt to be array of 2 strings
|
||||||
if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
|
if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
|
||||||
SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
|
SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");
|
||||||
|
@ -2017,18 +1945,106 @@ struct server_context {
|
||||||
prompt_tokens.clear();
|
prompt_tokens.clear();
|
||||||
prompt_tokens.push_back(llama_token_bos(model));
|
prompt_tokens.push_back(llama_token_bos(model));
|
||||||
{
|
{
|
||||||
const auto part = tokenize(slot.prompt[0], false);
|
const auto part = tokenize(slot.prompt[0], false, false);
|
||||||
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
|
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
|
||||||
}
|
}
|
||||||
prompt_tokens.push_back(llama_token_eos(model));
|
prompt_tokens.push_back(llama_token_eos(model));
|
||||||
prompt_tokens.push_back(llama_token_sep(model));
|
prompt_tokens.push_back(llama_token_sep(model));
|
||||||
{
|
{
|
||||||
const auto part = tokenize(slot.prompt[1], false);
|
const auto part = tokenize(slot.prompt[1], false, false);
|
||||||
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
|
prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
|
||||||
}
|
}
|
||||||
prompt_tokens.push_back(llama_token_eos(model));
|
prompt_tokens.push_back(llama_token_eos(model));
|
||||||
|
} break;
|
||||||
|
case SERVER_TASK_CMPL_TYPE_INFILL:
|
||||||
|
{
|
||||||
|
// TODO: optimize this block by reducing memory allocations and movement
|
||||||
|
|
||||||
|
// use FIM repo-level pattern:
|
||||||
|
// ref: https://arxiv.org/pdf/2409.12186
|
||||||
|
//
|
||||||
|
// [FIM_REP]myproject
|
||||||
|
// [FIM_SEP]filename0
|
||||||
|
// extra chunk 0
|
||||||
|
// [FIM_SEP]filename1
|
||||||
|
// extra chunk 1
|
||||||
|
// ...
|
||||||
|
// [FIM_SEP]filename
|
||||||
|
// [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt
|
||||||
|
//
|
||||||
|
auto tokens_prefix = tokenize(slot.input_prefix, false, false);
|
||||||
|
auto tokens_suffix = tokenize(slot.input_suffix, false, false);
|
||||||
|
auto tokens_prompt = tokenize(slot.prompt, false, false);
|
||||||
|
|
||||||
|
slot.extra_tokens.clear();
|
||||||
|
if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
|
||||||
|
static const auto k_fim_repo = tokenize("myproject\n", false, false);
|
||||||
|
|
||||||
|
slot.extra_tokens.push_back(llama_token_fim_rep(model));
|
||||||
|
slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto & chunk : slot.input_extra) {
|
||||||
|
// { "text": string, "filename": string }
|
||||||
|
const std::string text = chunk.value("text", "");
|
||||||
|
const std::string filename = chunk.value("filename", "tmp");
|
||||||
|
|
||||||
|
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
|
||||||
|
const auto k_fim_file = tokenize(filename + "\n", false, false);
|
||||||
|
|
||||||
|
slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
|
||||||
|
slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
|
||||||
} else {
|
} else {
|
||||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
// chunk separator in binary form to avoid confusing the AI
|
||||||
|
static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
|
||||||
|
static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false);
|
||||||
|
|
||||||
|
slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto chunk_tokens = tokenize(text, false, false);
|
||||||
|
slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
|
||||||
|
// TODO: current filename
|
||||||
|
static const auto k_fim_file = tokenize("filename\n", false, false);
|
||||||
|
|
||||||
|
slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model));
|
||||||
|
slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
// for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?)
|
||||||
|
const int n_suffix_take = std::min<int>(tokens_suffix.size(), (n_batch/4));
|
||||||
|
const int n_prefix_take = std::min<int>(tokens_prefix.size(), 3*(n_batch/4) - 3);
|
||||||
|
|
||||||
|
// fill the rest of the context with extra chunks
|
||||||
|
const int n_extra_take = std::min<int>(std::max<int>(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size());
|
||||||
|
|
||||||
|
tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
|
||||||
|
tokens_suffix.resize(n_suffix_take);
|
||||||
|
|
||||||
|
tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
|
||||||
|
tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end());
|
||||||
|
tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
|
||||||
|
|
||||||
|
auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix;
|
||||||
|
auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix;
|
||||||
|
|
||||||
|
if (llama_add_bos_token(model)) {
|
||||||
|
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
||||||
|
}
|
||||||
|
|
||||||
|
SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size());
|
||||||
|
|
||||||
|
// put the extra context before the FIM prefix
|
||||||
|
embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end());
|
||||||
|
|
||||||
|
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||||
|
embd_inp.push_back(llama_token_fim_mid(model));
|
||||||
|
|
||||||
|
prompt_tokens = std::move(embd_inp);
|
||||||
|
} break;
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
|
@ -2036,6 +2052,19 @@ struct server_context {
|
||||||
|
|
||||||
SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
|
SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);
|
||||||
|
|
||||||
|
// print prompt tokens (for debugging)
|
||||||
|
if (1) {
|
||||||
|
// first 16 tokens (avoid flooding logs)
|
||||||
|
for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
|
||||||
|
SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// all
|
||||||
|
for (int i = 0; i < (int) prompt_tokens.size(); i++) {
|
||||||
|
SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// empty prompt passed -> release the slot and send empty response
|
// empty prompt passed -> release the slot and send empty response
|
||||||
if (prompt_tokens.empty()) {
|
if (prompt_tokens.empty()) {
|
||||||
SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
|
SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");
|
||||||
|
@ -2056,7 +2085,9 @@ struct server_context {
|
||||||
} else {
|
} else {
|
||||||
if (!params.ctx_shift) {
|
if (!params.ctx_shift) {
|
||||||
// if context shift is disabled, we make sure prompt size is smaller than KV size
|
// if context shift is disabled, we make sure prompt size is smaller than KV size
|
||||||
if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
|
// TODO: there should be a separate parameter that control prompt truncation
|
||||||
|
// context shift should be applied only during the generation phase
|
||||||
|
if (slot.n_prompt_tokens >= slot.n_ctx) {
|
||||||
slot.release();
|
slot.release();
|
||||||
send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
|
send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
|
||||||
continue;
|
continue;
|
||||||
|
@ -2067,8 +2098,8 @@ struct server_context {
|
||||||
}
|
}
|
||||||
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
||||||
|
|
||||||
// if input prompt is too big, truncate it (if group attention self-extend is disabled)
|
// if input prompt is too big, truncate it
|
||||||
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
|
if (slot.n_prompt_tokens >= slot.n_ctx) {
|
||||||
const int n_left = slot.n_ctx - slot.params.n_keep;
|
const int n_left = slot.n_ctx - slot.params.n_keep;
|
||||||
|
|
||||||
const int n_block_size = n_left / 2;
|
const int n_block_size = n_left / 2;
|
||||||
|
@ -2095,19 +2126,61 @@ struct server_context {
|
||||||
|
|
||||||
common_sampler_reset(slot.smpl);
|
common_sampler_reset(slot.smpl);
|
||||||
|
|
||||||
if (!slot.params.cache_prompt) {
|
if (slot.params.cache_prompt) {
|
||||||
slot.n_past_se = 0;
|
|
||||||
slot.ga_i = 0;
|
|
||||||
} else {
|
|
||||||
GGML_ASSERT(slot.ga_n == 1);
|
|
||||||
|
|
||||||
// reuse any previously computed tokens that are common with the new prompt
|
// reuse any previously computed tokens that are common with the new prompt
|
||||||
slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
|
slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
|
||||||
|
|
||||||
// push the prompt into the sampling context (do not apply grammar)
|
// push the prompt into the sampling context (do not apply grammar)
|
||||||
for (int i = 0; i < slot.n_past; ++i) {
|
for (int i = 0; i < slot.n_past; ++i) {
|
||||||
common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
|
common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// reuse chunks from the cached prompt by shifting their KV cache in the new position
|
||||||
|
if (params.n_cache_reuse > 0) {
|
||||||
|
size_t head_c = slot.n_past; // cache
|
||||||
|
size_t head_p = slot.n_past; // current prompt
|
||||||
|
|
||||||
|
SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
|
||||||
|
|
||||||
|
while (head_c < slot.cache_tokens.size() &&
|
||||||
|
head_p < prompt_tokens.size()) {
|
||||||
|
|
||||||
|
size_t n_match = 0;
|
||||||
|
while (head_c + n_match < slot.cache_tokens.size() &&
|
||||||
|
head_p + n_match < prompt_tokens.size() &&
|
||||||
|
slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) {
|
||||||
|
|
||||||
|
n_match++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (n_match >= (size_t) params.n_cache_reuse) {
|
||||||
|
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
|
||||||
|
//for (size_t i = head_p; i < head_p + n_match; i++) {
|
||||||
|
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
|
||||||
|
//}
|
||||||
|
|
||||||
|
const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
|
||||||
|
|
||||||
|
llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
|
||||||
|
llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < n_match; i++) {
|
||||||
|
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
|
||||||
|
|
||||||
|
common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false);
|
||||||
|
|
||||||
|
slot.n_past++;
|
||||||
|
}
|
||||||
|
|
||||||
|
head_c += n_match;
|
||||||
|
head_p += n_match;
|
||||||
|
} else {
|
||||||
|
head_c += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2116,9 +2189,6 @@ struct server_context {
|
||||||
SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
|
SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
|
||||||
|
|
||||||
slot.n_past--;
|
slot.n_past--;
|
||||||
if (slot.ga_i > 0) {
|
|
||||||
slot.n_past_se--;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.n_prompt_tokens_processed = 0;
|
slot.n_prompt_tokens_processed = 0;
|
||||||
|
@ -2144,55 +2214,31 @@ struct server_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
// keep only the common part
|
// keep only the common part
|
||||||
int p0 = (int) system_tokens.size() + slot.n_past;
|
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
|
||||||
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
|
|
||||||
// could not partially delete (likely using a non-Transformer model)
|
// could not partially delete (likely using a non-Transformer model)
|
||||||
llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
|
llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
|
||||||
|
|
||||||
p0 = (int) system_tokens.size();
|
// there is no common part left
|
||||||
if (p0 != 0) {
|
|
||||||
// copy over the system prompt when there is one
|
|
||||||
llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// there is no common part left (except for the system prompt)
|
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
slot.n_past_se = 0;
|
|
||||||
slot.ga_i = 0;
|
|
||||||
// TODO: is the system prompt ever in the sampling context?
|
|
||||||
common_sampler_reset(slot.smpl);
|
common_sampler_reset(slot.smpl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
|
||||||
|
|
||||||
// remove the non-common part from the cache
|
// remove the non-common part from the cache
|
||||||
slot.cache_tokens.resize(slot.n_past);
|
slot.cache_tokens.resize(slot.n_past);
|
||||||
|
|
||||||
SLT_INF(slot, "kv cache rm [%d, end)\n", p0);
|
|
||||||
|
|
||||||
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
|
|
||||||
|
|
||||||
int32_t ga_i = slot.ga_i;
|
|
||||||
int32_t ga_n = slot.ga_n;
|
|
||||||
int32_t ga_w = slot.ga_w;
|
|
||||||
|
|
||||||
// add prompt tokens for processing in the current batch
|
// add prompt tokens for processing in the current batch
|
||||||
// TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow
|
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
|
||||||
for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) {
|
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
|
||||||
if (slot.ga_n != 1) {
|
|
||||||
while (slot_npast >= ga_i + ga_w) {
|
|
||||||
const int bd = (ga_w/ga_n)*(ga_n - 1);
|
|
||||||
slot_npast -= bd;
|
|
||||||
ga_i += ga_w/ga_n;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
|
|
||||||
|
|
||||||
if (slot.params.cache_prompt) {
|
if (slot.params.cache_prompt) {
|
||||||
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
|
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.n_prompt_tokens_processed++;
|
slot.n_prompt_tokens_processed++;
|
||||||
slot_npast++;
|
slot.n_past++;
|
||||||
}
|
}
|
||||||
|
|
||||||
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
|
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
|
||||||
|
@ -2233,34 +2279,6 @@ struct server_context {
|
||||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
||||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
||||||
|
|
||||||
for (auto & slot : slots) {
|
|
||||||
if (slot.ga_n != 1) {
|
|
||||||
// context extension via Self-Extend
|
|
||||||
// TODO: simplify and/or abstract this
|
|
||||||
while (slot.n_past_se >= slot.ga_i + slot.ga_w) {
|
|
||||||
const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
|
|
||||||
const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
|
|
||||||
const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
|
|
||||||
|
|
||||||
SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
|
|
||||||
SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
|
|
||||||
SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
|
|
||||||
|
|
||||||
llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
|
|
||||||
llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
|
|
||||||
llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
|
|
||||||
|
|
||||||
slot.n_past_se -= bd;
|
|
||||||
|
|
||||||
slot.ga_i += slot.ga_w / slot.ga_n;
|
|
||||||
|
|
||||||
SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
|
|
||||||
}
|
|
||||||
|
|
||||||
slot.n_past_se += n_tokens;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_batch batch_view = {
|
llama_batch batch_view = {
|
||||||
n_tokens,
|
n_tokens,
|
||||||
batch.token + i,
|
batch.token + i,
|
||||||
|
@ -2415,10 +2433,6 @@ int main(int argc, char ** argv) {
|
||||||
// struct that contains llama context and inference
|
// struct that contains llama context and inference
|
||||||
server_context ctx_server;
|
server_context ctx_server;
|
||||||
|
|
||||||
if (!params.system_prompt.empty()) {
|
|
||||||
ctx_server.system_prompt_set(params.system_prompt);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.model_alias == "unknown") {
|
if (params.model_alias == "unknown") {
|
||||||
params.model_alias = params.model;
|
params.model_alias = params.model;
|
||||||
}
|
}
|
||||||
|
@ -2846,7 +2860,6 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
|
const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
|
||||||
json data = {
|
json data = {
|
||||||
{ "system_prompt", ctx_server.system_prompt },
|
|
||||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||||
{ "total_slots", ctx_server.params.n_parallel },
|
{ "total_slots", ctx_server.params.n_parallel },
|
||||||
{ "chat_template", llama_get_chat_template(ctx_server.model) },
|
{ "chat_template", llama_get_chat_template(ctx_server.model) },
|
||||||
|
@ -2862,10 +2875,8 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
json data = json::parse(req.body);
|
json data = json::parse(req.body);
|
||||||
if (data.contains("system_prompt")) {
|
|
||||||
std::string system_prompt = data.at("system_prompt");
|
// update any props here
|
||||||
ctx_server.system_prompt_set(system_prompt);
|
|
||||||
}
|
|
||||||
|
|
||||||
res_ok(res, {{ "success", true }});
|
res_ok(res, {{ "success", true }});
|
||||||
};
|
};
|
||||||
|
@ -2925,7 +2936,23 @@ int main(int argc, char ** argv) {
|
||||||
return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
|
return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
|
||||||
|
std::string err;
|
||||||
|
if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
|
||||||
|
err += "prefix token is missing. ";
|
||||||
|
}
|
||||||
|
if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
|
||||||
|
err += "suffix token is missing. ";
|
||||||
|
}
|
||||||
|
if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
|
||||||
|
err += "middle token is missing. ";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!err.empty()) {
|
||||||
|
res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
json data = json::parse(req.body);
|
json data = json::parse(req.body);
|
||||||
return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
|
return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
|
||||||
};
|
};
|
||||||
|
@ -3011,7 +3038,8 @@ int main(int argc, char ** argv) {
|
||||||
if (body.count("content") != 0) {
|
if (body.count("content") != 0) {
|
||||||
const bool add_special = json_value(body, "add_special", false);
|
const bool add_special = json_value(body, "add_special", false);
|
||||||
const bool with_pieces = json_value(body, "with_pieces", false);
|
const bool with_pieces = json_value(body, "with_pieces", false);
|
||||||
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
|
|
||||||
|
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);
|
||||||
|
|
||||||
if (with_pieces) {
|
if (with_pieces) {
|
||||||
for (const auto& token : tokens) {
|
for (const auto& token : tokens) {
|
||||||
|
@ -3362,6 +3390,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
ctx_server.queue_tasks.on_new_task(std::bind(
|
ctx_server.queue_tasks.on_new_task(std::bind(
|
||||||
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
|
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
|
||||||
|
|
||||||
ctx_server.queue_tasks.on_update_slots(std::bind(
|
ctx_server.queue_tasks.on_update_slots(std::bind(
|
||||||
&server_context::update_slots, &ctx_server));
|
&server_context::update_slots, &ctx_server));
|
||||||
|
|
||||||
|
|
|
@ -13,6 +13,10 @@ Feature: llama.cpp server
|
||||||
And 32 as batch size
|
And 32 as batch size
|
||||||
And 2 slots
|
And 2 slots
|
||||||
|
|
||||||
|
# the prompt is 301 tokens
|
||||||
|
# the slot context is 256/2 = 128 tokens
|
||||||
|
# the prompt is truncated to keep the last 109 tokens
|
||||||
|
# 64 tokens are generated thanks to shifting the context when it gets full
|
||||||
Scenario: Inference with context shift
|
Scenario: Inference with context shift
|
||||||
And 64 server max tokens to predict
|
And 64 server max tokens to predict
|
||||||
Then the server is starting
|
Then the server is starting
|
||||||
|
|
|
@ -195,14 +195,14 @@ static std::string gen_chatcmplid() {
|
||||||
// other common utils
|
// other common utils
|
||||||
//
|
//
|
||||||
|
|
||||||
static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
|
static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
|
||||||
size_t i;
|
size_t i;
|
||||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||||
|
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t common_part(const std::string & a, const std::string & b) {
|
static size_t longest_common_prefix(const std::string & a, const std::string & b) {
|
||||||
size_t i;
|
size_t i;
|
||||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||||
|
|
||||||
|
@ -360,9 +360,9 @@ static json oaicompat_completion_params_parse(
|
||||||
|
|
||||||
// Handle "logprobs" field
|
// Handle "logprobs" field
|
||||||
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
|
// TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future
|
||||||
if (body.contains("logprobs")) {
|
if (json_value(body, "logprobs", false)) {
|
||||||
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
|
llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
|
||||||
} else if (body.contains("top_logprobs")) {
|
} else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) {
|
||||||
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
|
throw std::runtime_error("top_logprobs requires logprobs to be set to true");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||||
|
|
||||||
|
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
|
|
||||||
//#define GGML_ALLOCATOR_DEBUG
|
//#define GGML_ALLOCATOR_DEBUG
|
||||||
|
|
||||||
//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
|
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||||
#define AT_PRINTF(...)
|
#define AT_PRINTF(...)
|
||||||
|
|
||||||
|
|
||||||
|
@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
|
||||||
size = GGML_PAD(size, talloc->alignment);
|
size = GGML_PAD(size, talloc->alignment);
|
||||||
|
|
||||||
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
|
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
|
||||||
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
||||||
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
|
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
|
||||||
GGML_ABORT("not enough space in the buffer");
|
GGML_ABORT("not enough space in the buffer");
|
||||||
}
|
}
|
||||||
|
@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
||||||
best_fit_block = alloc->n_free_blocks - 1;
|
best_fit_block = alloc->n_free_blocks - 1;
|
||||||
} else {
|
} else {
|
||||||
// this should never happen
|
// this should never happen
|
||||||
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
||||||
__func__, size, max_avail);
|
__func__, size, max_avail);
|
||||||
GGML_ABORT("not enough space in the buffer");
|
GGML_ABORT("not enough space in the buffer");
|
||||||
}
|
}
|
||||||
|
@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
||||||
for (int i = 0; i < 1024; i++) {
|
for (int i = 0; i < 1024; i++) {
|
||||||
if (alloc->allocated_tensors[i].tensor) {
|
if (alloc->allocated_tensors[i].tensor) {
|
||||||
fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
||||||
alloc->allocated_tensors[i].offset,
|
alloc->allocated_tensors[i].offset,
|
||||||
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
|
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
|
||||||
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
|
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
GGML_LOG_DEBUG("\n");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -348,7 +348,6 @@ struct tensor_alloc {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct leaf_alloc {
|
struct leaf_alloc {
|
||||||
int buffer_id;
|
|
||||||
struct tensor_alloc leaf;
|
struct tensor_alloc leaf;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||||
for (int i = 0; i < graph->n_leafs; i++) {
|
for (int i = 0; i < graph->n_leafs; i++) {
|
||||||
struct ggml_tensor * leaf = graph->leafs[i];
|
struct ggml_tensor * leaf = graph->leafs[i];
|
||||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
||||||
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
|
||||||
if (leaf->view_src || leaf->data) {
|
if (leaf->view_src || leaf->data) {
|
||||||
galloc->leaf_allocs[i].leaf.buffer_id = -1;
|
galloc->leaf_allocs[i].leaf.buffer_id = -1;
|
||||||
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
|
||||||
|
@ -768,13 +766,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
||||||
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
||||||
if (new_size > cur_size || galloc->buffers[i] == NULL) {
|
if (new_size > cur_size || galloc->buffers[i] == NULL) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
||||||
if (galloc->buffers[i] == NULL) {
|
if (galloc->buffers[i] == NULL) {
|
||||||
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||||
|
@ -825,14 +823,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t
|
||||||
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
|
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
|
||||||
if (galloc->n_nodes != graph->n_nodes) {
|
if (galloc->n_nodes != graph->n_nodes) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
|
GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (galloc->n_leafs != graph->n_leafs) {
|
if (galloc->n_leafs != graph->n_leafs) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
|
GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -843,7 +841,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
||||||
|
|
||||||
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
|
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
|
GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
|
||||||
#endif
|
#endif
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -855,7 +853,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
||||||
}
|
}
|
||||||
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
|
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
||||||
#endif
|
#endif
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -869,14 +867,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
||||||
if (ggml_gallocr_needs_realloc(galloc, graph)) {
|
if (ggml_gallocr_needs_realloc(galloc, graph)) {
|
||||||
if (galloc->n_buffers == 1) {
|
if (galloc->n_buffers == 1) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
|
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
if (!ggml_gallocr_reserve(galloc, graph)) {
|
if (!ggml_gallocr_reserve(galloc, graph)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
|
GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -940,7 +938,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
||||||
if (buffer == NULL) {
|
if (buffer == NULL) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
||||||
#endif
|
#endif
|
||||||
for (size_t i = 0; i < *n_buffers; i++) {
|
for (size_t i = 0; i < *n_buffers; i++) {
|
||||||
ggml_backend_buffer_free((*buffers)[i]);
|
ggml_backend_buffer_free((*buffers)[i]);
|
||||||
|
@ -990,7 +988,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this_size > max_size) {
|
if (this_size > max_size) {
|
||||||
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
|
GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
|
||||||
__func__, t->name,
|
__func__, t->name,
|
||||||
ggml_backend_buft_name(buft),
|
ggml_backend_buft_name(buft),
|
||||||
this_size, max_size);
|
this_size, max_size);
|
||||||
|
@ -1022,7 +1020,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
||||||
|
|
||||||
if (n_buffers == 0) {
|
if (n_buffers == 0) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
|
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
|
@ -379,7 +379,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
|
||||||
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
||||||
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
|
} else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
|
GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
|
||||||
#endif
|
#endif
|
||||||
size_t nbytes = ggml_nbytes(src);
|
size_t nbytes = ggml_nbytes(src);
|
||||||
void * data = malloc(nbytes);
|
void * data = malloc(nbytes);
|
||||||
|
@ -538,6 +538,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
|
||||||
#include "ggml-metal.h"
|
#include "ggml-metal.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_BLAS
|
#ifdef GGML_USE_BLAS
|
||||||
#include "ggml-blas.h"
|
#include "ggml-blas.h"
|
||||||
#endif
|
#endif
|
||||||
|
@ -557,6 +561,9 @@ struct ggml_backend_registry {
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
register_backend(ggml_backend_metal_reg());
|
register_backend(ggml_backend_metal_reg());
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
register_backend(ggml_backend_vk_reg());
|
||||||
|
#endif
|
||||||
#ifdef GGML_USE_BLAS
|
#ifdef GGML_USE_BLAS
|
||||||
register_backend(ggml_backend_blas_reg());
|
register_backend(ggml_backend_blas_reg());
|
||||||
#endif
|
#endif
|
||||||
|
@ -564,14 +571,14 @@ struct ggml_backend_registry {
|
||||||
register_backend(ggml_backend_rpc_reg());
|
register_backend(ggml_backend_rpc_reg());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// TODO: sycl, vulkan, kompute, cann
|
// TODO: sycl, kompute, cann
|
||||||
|
|
||||||
register_backend(ggml_backend_cpu_reg());
|
register_backend(ggml_backend_cpu_reg());
|
||||||
}
|
}
|
||||||
|
|
||||||
void register_backend(ggml_backend_reg_t reg) {
|
void register_backend(ggml_backend_reg_t reg) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: registered backend %s (%zu devices)\n",
|
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
||||||
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
||||||
#endif
|
#endif
|
||||||
backends.push_back(reg);
|
backends.push_back(reg);
|
||||||
|
@ -582,7 +589,7 @@ struct ggml_backend_registry {
|
||||||
|
|
||||||
void register_device(ggml_backend_dev_t device) {
|
void register_device(ggml_backend_dev_t device) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
||||||
#endif
|
#endif
|
||||||
devices.push_back(device);
|
devices.push_back(device);
|
||||||
}
|
}
|
||||||
|
@ -682,8 +689,6 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||||
|
|
||||||
// backend CPU
|
// backend CPU
|
||||||
|
|
||||||
static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
|
|
||||||
|
|
||||||
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
|
static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||||
return "CPU";
|
return "CPU";
|
||||||
|
|
||||||
|
@ -702,7 +707,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
free(buffer->context);
|
ggml_aligned_free(buffer->context, buffer->size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||||
|
@ -770,14 +775,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
auto alloc_size = size;
|
||||||
void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
|
if (alloc_size == 0) {
|
||||||
|
alloc_size = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void * data = ggml_aligned_malloc(alloc_size);
|
||||||
|
|
||||||
if (data == NULL) {
|
if (data == NULL) {
|
||||||
fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
|
GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
|
return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
|
@ -836,7 +846,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
|
||||||
void * ptr;
|
void * ptr;
|
||||||
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
|
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1459,7 +1469,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
|
||||||
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
|
__func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -1554,13 +1564,13 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
|
||||||
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
|
||||||
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
|
GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
|
||||||
sched->splits[cur_split].n_inputs);
|
sched->splits[cur_split].n_inputs);
|
||||||
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
|
||||||
fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
|
GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
|
||||||
fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
GGML_LOG_DEBUG("\n");
|
||||||
cur_split++;
|
cur_split++;
|
||||||
}
|
}
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
|
@ -1568,7 +1578,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
|
||||||
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
|
||||||
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
struct ggml_tensor * src = node->src[j];
|
struct ggml_tensor * src = node->src[j];
|
||||||
|
@ -1576,10 +1586,10 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
|
||||||
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
|
||||||
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
GGML_LOG_DEBUG("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2093,11 +2103,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
|
||||||
// the re-allocation may cause the split inputs to be moved to a different address
|
// the re-allocation may cause the split inputs to be moved to a different address
|
||||||
ggml_backend_sched_synchronize(sched);
|
ggml_backend_sched_synchronize(sched);
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
|
||||||
#endif
|
#endif
|
||||||
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
|
||||||
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
|
||||||
fprintf(stderr, "%s: failed to allocate graph\n", __func__);
|
GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2491,7 +2501,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
||||||
struct ggml_context * ctx_unallocated = ggml_init(params);
|
struct ggml_context * ctx_unallocated = ggml_init(params);
|
||||||
|
|
||||||
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
|
if (ctx_allocated == NULL || ctx_unallocated == NULL) {
|
||||||
fprintf(stderr, "failed to allocate context for graph copy\n");
|
GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__);
|
||||||
ggml_hash_set_free(&hash_set);
|
ggml_hash_set_free(&hash_set);
|
||||||
free(node_copies);
|
free(node_copies);
|
||||||
free(node_init);
|
free(node_init);
|
||||||
|
@ -2514,7 +2524,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
|
||||||
// allocate nodes
|
// allocate nodes
|
||||||
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
|
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
|
||||||
if (buffer == NULL) {
|
if (buffer == NULL) {
|
||||||
fprintf(stderr, "failed to allocate buffer for graph copy\n");
|
GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__);
|
||||||
ggml_hash_set_free(&hash_set);
|
ggml_hash_set_free(&hash_set);
|
||||||
free(node_copies);
|
free(node_copies);
|
||||||
free(node_init);
|
free(node_init);
|
||||||
|
|
|
@ -297,14 +297,14 @@ ggml_backend_t ggml_backend_blas_init(void) {
|
||||||
/* .context = */ ctx,
|
/* .context = */ ctx,
|
||||||
};
|
};
|
||||||
|
|
||||||
#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
|
#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
|
||||||
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
|
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
|
||||||
fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
|
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
|
#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
|
||||||
fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
|
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return backend;
|
return backend;
|
||||||
|
|
|
@ -1148,6 +1148,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
||||||
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
|
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
|
||||||
ggml_backend_cann_buffer_types[i] = {
|
ggml_backend_cann_buffer_types[i] = {
|
||||||
/* .iface = */ ggml_backend_cann_buffer_type_interface,
|
/* .iface = */ ggml_backend_cann_buffer_type_interface,
|
||||||
|
/* .device = */ nullptr,
|
||||||
/* .context = */
|
/* .context = */
|
||||||
new ggml_backend_cann_buffer_type_context{
|
new ggml_backend_cann_buffer_type_context{
|
||||||
i, "CANN" + std::to_string(i)},
|
i, "CANN" + std::to_string(i)},
|
||||||
|
@ -1868,7 +1869,7 @@ static ggml_backend_event_t ggml_backend_cann_event_new(
|
||||||
ACL_CHECK(aclrtCreateEvent(&event));
|
ACL_CHECK(aclrtCreateEvent(&event));
|
||||||
|
|
||||||
return new ggml_backend_event{
|
return new ggml_backend_event{
|
||||||
/* .backend = */ backend,
|
/* .device = */ nullptr,
|
||||||
/* .context = */ event,
|
/* .context = */ event,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -1895,10 +1896,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
|
||||||
*
|
*
|
||||||
* @param event Pointer to the event structure to be recorded.
|
* @param event Pointer to the event structure to be recorded.
|
||||||
*/
|
*/
|
||||||
static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
|
static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
||||||
ggml_backend_cann_context* cann_ctx =
|
ggml_backend_cann_context* cann_ctx =
|
||||||
(ggml_backend_cann_context*)event->backend->context;
|
(ggml_backend_cann_context*)backend->context;
|
||||||
|
|
||||||
ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
|
ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1916,8 +1916,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
|
||||||
ggml_backend_event_t event) {
|
ggml_backend_event_t event) {
|
||||||
ggml_backend_cann_context* cann_ctx =
|
ggml_backend_cann_context* cann_ctx =
|
||||||
(ggml_backend_cann_context*)backend->context;
|
(ggml_backend_cann_context*)backend->context;
|
||||||
|
if (ggml_backend_is_cann(backend)) {
|
||||||
if (ggml_backend_is_cann(event->backend)) {
|
|
||||||
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
|
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
|
||||||
(aclrtEvent)event->context));
|
(aclrtEvent)event->context));
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
|
||||||
ggml_cuda_set_device(device);
|
ggml_cuda_set_device(device);
|
||||||
CUDA_CHECK(cudaFree(ptr));
|
CUDA_CHECK(cudaFree(ptr));
|
||||||
pool_size -= size;
|
pool_size -= size;
|
||||||
|
@ -980,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
|
||||||
if (err != cudaSuccess) {
|
if (err != cudaSuccess) {
|
||||||
// clear the error
|
// clear the error
|
||||||
cudaGetLastError();
|
cudaGetLastError();
|
||||||
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||||
size / 1024.0 / 1024.0, cudaGetErrorString(err));
|
size / 1024.0 / 1024.0, cudaGetErrorString(err));
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
@ -2410,7 +2410,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
|
||||||
|
|
||||||
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
|
if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
|
GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -2528,7 +2528,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
||||||
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
|
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
|
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2579,14 +2579,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||||
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
||||||
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->op == GGML_OP_MUL_MAT_ID) {
|
if (node->op == GGML_OP_MUL_MAT_ID) {
|
||||||
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
|
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
|
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2595,7 +2595,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||||
// Changes in batch size or context size can cause changes to the grid size of some kernels.
|
// Changes in batch size or context size can cause changes to the grid size of some kernels.
|
||||||
use_cuda_graph = false;
|
use_cuda_graph = false;
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
|
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2607,7 +2607,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||||
if (!ptr) {
|
if (!ptr) {
|
||||||
use_cuda_graph = false;
|
use_cuda_graph = false;
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
|
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||||
|
@ -2631,7 +2631,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||||
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
|
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
|
||||||
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
|
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
|
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2689,7 +2689,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
|
||||||
use_cuda_graph = false;
|
use_cuda_graph = false;
|
||||||
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
|
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
|
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
graph_evaluated_or_captured = true; // CUDA graph has been captured
|
graph_evaluated_or_captured = true; // CUDA graph has been captured
|
||||||
|
@ -2858,7 +2858,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
|
||||||
// clear the error
|
// clear the error
|
||||||
cudaGetLastError();
|
cudaGetLastError();
|
||||||
|
|
||||||
GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
|
GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
|
||||||
size / 1024.0 / 1024.0, cudaGetErrorString(err));
|
size / 1024.0 / 1024.0, cudaGetErrorString(err));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -416,10 +416,11 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||||
|
|
||||||
static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
|
||||||
const half * x = (const half *) vx;
|
const half * x = (const half *) vx;
|
||||||
|
// load 2 halfs into register in a single instruction
|
||||||
|
const half2 x_reg = *((half2 *) &(x[ib + iqs]));
|
||||||
// automatic half -> float type cast if dfloat == float
|
// automatic half -> float type cast if dfloat == float
|
||||||
v.x = x[ib + iqs + 0];
|
v.x = __low2float(x_reg);
|
||||||
v.y = x[ib + iqs + 1];
|
v.y = __high2float(x_reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
|
static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) {
|
||||||
|
@ -476,13 +477,28 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
|
||||||
// matrix multiplication
|
// matrix multiplication
|
||||||
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
||||||
#ifdef GGML_CUDA_F16
|
#ifdef GGML_CUDA_F16
|
||||||
|
if ( y_offset == 1 ) {
|
||||||
|
// load 2 dfloats into register in a single instruction
|
||||||
|
const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
|
||||||
|
tmp += __hmul2(v, y_reg);
|
||||||
|
}
|
||||||
|
else {
|
||||||
tmp += __hmul2(v, {
|
tmp += __hmul2(v, {
|
||||||
y[iybs + iqs + j/qr + 0],
|
y[iybs + iqs + j/qr + 0],
|
||||||
y[iybs + iqs + j/qr + y_offset]
|
y[iybs + iqs + j/qr + y_offset]
|
||||||
});
|
});
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
|
if ( y_offset == 1 ) {
|
||||||
|
// load 2 dfloats into register in a single instruction
|
||||||
|
const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr]));
|
||||||
|
tmp += v.x * y_reg.x;
|
||||||
|
tmp += v.y * y_reg.y;
|
||||||
|
}
|
||||||
|
else {
|
||||||
tmp += v.x * y[iybs + iqs + j/qr + 0];
|
tmp += v.x * y[iybs + iqs + j/qr + 0];
|
||||||
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
|
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
|
||||||
|
}
|
||||||
#endif // GGML_CUDA_F16
|
#endif // GGML_CUDA_F16
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,9 @@ extern "C" {
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
|
// required for mmap as gguf only guarantees 32-byte alignment
|
||||||
|
#define TENSOR_ALIGNMENT 32
|
||||||
|
|
||||||
// static_assert should be a #define, but if it's not,
|
// static_assert should be a #define, but if it's not,
|
||||||
// fall back to the _Static_assert C11 keyword.
|
// fall back to the _Static_assert C11 keyword.
|
||||||
// if C99 - static_assert is noop
|
// if C99 - static_assert is noop
|
||||||
|
@ -196,6 +199,11 @@ struct ggml_cgraph {
|
||||||
|
|
||||||
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||||
|
|
||||||
|
// Memory allocation
|
||||||
|
|
||||||
|
void * ggml_aligned_malloc(size_t size);
|
||||||
|
void ggml_aligned_free(void * ptr, size_t size);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1941,7 +1941,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
if (device->fp16) {
|
if (device->fp16) {
|
||||||
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
device_extensions.push_back("VK_KHR_shader_float16_int8");
|
||||||
}
|
}
|
||||||
device->name = device->properties.deviceName.data();
|
device->name = GGML_VK_NAME + std::to_string(idx);
|
||||||
|
|
||||||
device_create_info = {
|
device_create_info = {
|
||||||
vk::DeviceCreateFlags(),
|
vk::DeviceCreateFlags(),
|
||||||
|
@ -1968,7 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
||||||
|
|
||||||
device->buffer_type = {
|
device->buffer_type = {
|
||||||
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
||||||
/* .device = */ nullptr,
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx),
|
||||||
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
|
/* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -6378,7 +6378,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
||||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
},
|
},
|
||||||
/* .device = */ nullptr,
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0),
|
||||||
/* .context = */ nullptr,
|
/* .context = */ nullptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -6581,9 +6581,135 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
||||||
UNUSED(backend);
|
UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
// TODO: enable async and synchronize
|
||||||
// ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context;
|
static ggml_backend_i ggml_backend_vk_interface = {
|
||||||
|
/* .get_name = */ ggml_backend_vk_name,
|
||||||
|
/* .free = */ ggml_backend_vk_free,
|
||||||
|
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
||||||
|
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
||||||
|
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
||||||
|
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
||||||
|
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
||||||
|
/* .graph_plan_create = */ NULL,
|
||||||
|
/* .graph_plan_free = */ NULL,
|
||||||
|
/* .graph_plan_update = */ NULL,
|
||||||
|
/* .graph_plan_compute = */ NULL,
|
||||||
|
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
||||||
|
/* .supports_op = */ NULL,
|
||||||
|
/* .supports_buft = */ NULL,
|
||||||
|
/* .offload_op = */ NULL,
|
||||||
|
/* .event_record = */ NULL,
|
||||||
|
/* .event_wait = */ NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
static ggml_guid_t ggml_backend_vk_guid() {
|
||||||
|
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
|
||||||
|
return &guid;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
||||||
|
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
||||||
|
|
||||||
|
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
||||||
|
ggml_vk_init(ctx, dev_num);
|
||||||
|
|
||||||
|
ggml_backend_t vk_backend = new ggml_backend {
|
||||||
|
/* .guid = */ ggml_backend_vk_guid(),
|
||||||
|
/* .interface = */ ggml_backend_vk_interface,
|
||||||
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
|
||||||
|
/* .context = */ ctx,
|
||||||
|
};
|
||||||
|
|
||||||
|
return vk_backend;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_is_vk(ggml_backend_t backend) {
|
||||||
|
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
||||||
|
}
|
||||||
|
|
||||||
|
int ggml_backend_vk_get_device_count() {
|
||||||
|
return ggml_vk_get_device_count();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
|
||||||
|
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||||
|
int dev_idx = vk_instance.device_indices[device];
|
||||||
|
ggml_vk_get_device_description(dev_idx, description, description_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
||||||
|
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
||||||
|
|
||||||
|
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
||||||
|
|
||||||
|
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
||||||
|
|
||||||
|
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
|
||||||
|
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
||||||
|
*total = heap.size;
|
||||||
|
*free = heap.size;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////
|
||||||
|
|
||||||
|
struct ggml_backend_vk_device_context {
|
||||||
|
int device;
|
||||||
|
std::string name;
|
||||||
|
std::string description;
|
||||||
|
};
|
||||||
|
|
||||||
|
static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
return ctx->name.c_str();
|
||||||
|
}
|
||||||
|
|
||||||
|
static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) {
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
return ctx->description.c_str();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
|
||||||
|
ggml_backend_vk_get_device_memory(ctx->device, free, total);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
return ggml_backend_vk_buffer_type(ctx->device);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) {
|
||||||
|
UNUSED(dev);
|
||||||
|
return ggml_backend_vk_host_buffer_type();
|
||||||
|
}
|
||||||
|
|
||||||
|
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
|
||||||
|
UNUSED(dev);
|
||||||
|
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||||
|
props->name = ggml_backend_vk_device_get_name(dev);
|
||||||
|
props->description = ggml_backend_vk_device_get_description(dev);
|
||||||
|
props->type = ggml_backend_vk_device_get_type(dev);
|
||||||
|
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||||
|
props->caps = {
|
||||||
|
/* async */ false,
|
||||||
|
/* host_buffer */ true,
|
||||||
|
/* events */ false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||||
|
UNUSED(params);
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
return ggml_backend_vk_init(ctx->device);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
|
@ -6701,97 +6827,101 @@ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tenso
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
UNUSED(backend);
|
UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||||
|
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
|
||||||
|
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
||||||
|
|
||||||
|
return buft_ctx->device->idx == ctx->device;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
||||||
const int min_batch_size = 32;
|
const int min_batch_size = 32;
|
||||||
|
|
||||||
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
||||||
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
||||||
|
|
||||||
UNUSED(backend);
|
UNUSED(dev);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
|
||||||
if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
|
/* .get_name = */ ggml_backend_vk_device_get_name,
|
||||||
return false;
|
/* .get_description = */ ggml_backend_vk_device_get_description,
|
||||||
}
|
/* .get_memory = */ ggml_backend_vk_device_get_memory,
|
||||||
|
/* .get_type = */ ggml_backend_vk_device_get_type,
|
||||||
ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
|
/* .get_props = */ ggml_backend_vk_device_get_props,
|
||||||
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
/* .init_backend = */ ggml_backend_vk_device_init,
|
||||||
|
/* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type,
|
||||||
return buft_ctx->device == ctx->device;
|
/* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
|
||||||
}
|
/* .buffer_from_host_ptr = */ NULL,
|
||||||
|
/* .supports_op = */ ggml_backend_vk_device_supports_op,
|
||||||
// TODO: enable async and synchronize
|
/* .supports_buft = */ ggml_backend_vk_device_supports_buft,
|
||||||
static ggml_backend_i ggml_backend_vk_interface = {
|
/* .offload_op = */ ggml_backend_vk_device_offload_op,
|
||||||
/* .get_name = */ ggml_backend_vk_name,
|
/* .event_new = */ NULL,
|
||||||
/* .free = */ ggml_backend_vk_free,
|
/* .event_free = */ NULL,
|
||||||
/* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
|
/* .event_synchronize = */ NULL,
|
||||||
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
|
|
||||||
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
|
|
||||||
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
|
|
||||||
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
|
|
||||||
/* .graph_plan_create = */ NULL,
|
|
||||||
/* .graph_plan_free = */ NULL,
|
|
||||||
/* .graph_plan_update = */ NULL,
|
|
||||||
/* .graph_plan_compute = */ NULL,
|
|
||||||
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
|
||||||
/* .supports_op = */ ggml_backend_vk_supports_op,
|
|
||||||
/* .supports_buft = */ ggml_backend_vk_supports_buft,
|
|
||||||
/* .offload_op = */ ggml_backend_vk_offload_op,
|
|
||||||
/* .event_record = */ NULL,
|
|
||||||
/* .event_wait = */ NULL,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static ggml_guid_t ggml_backend_vk_guid() {
|
static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
|
||||||
static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b };
|
UNUSED(reg);
|
||||||
return &guid;
|
return GGML_VK_NAME;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
|
static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||||
VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")");
|
UNUSED(reg);
|
||||||
|
return ggml_backend_vk_get_device_count();
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_vk_context * ctx = new ggml_backend_vk_context;
|
static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
|
||||||
ggml_vk_init(ctx, dev_num);
|
static std::vector<ggml_backend_dev_t> devices;
|
||||||
|
|
||||||
ggml_backend_t vk_backend = new ggml_backend {
|
static bool initialized = false;
|
||||||
/* .guid = */ ggml_backend_vk_guid(),
|
|
||||||
/* .interface = */ ggml_backend_vk_interface,
|
{
|
||||||
/* .device = */ nullptr,
|
static std::mutex mutex;
|
||||||
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
|
if (!initialized) {
|
||||||
|
for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
|
||||||
|
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
|
||||||
|
char desc[256];
|
||||||
|
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
|
||||||
|
ctx->device = i;
|
||||||
|
ctx->name = GGML_VK_NAME + std::to_string(i);
|
||||||
|
ctx->description = desc;
|
||||||
|
devices.push_back(new ggml_backend_device {
|
||||||
|
/* .iface = */ ggml_backend_vk_device_i,
|
||||||
|
/* .reg = */ reg,
|
||||||
/* .context = */ ctx,
|
/* .context = */ ctx,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
initialized = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(device < devices.size());
|
||||||
|
return devices[device];
|
||||||
|
}
|
||||||
|
|
||||||
|
static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
|
||||||
|
/* .get_name = */ ggml_backend_vk_reg_get_name,
|
||||||
|
/* .get_device_count = */ ggml_backend_vk_reg_get_device_count,
|
||||||
|
/* .get_device = */ ggml_backend_vk_reg_get_device,
|
||||||
|
/* .get_proc_address = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
return vk_backend;
|
ggml_backend_reg_t ggml_backend_vk_reg() {
|
||||||
}
|
static ggml_backend_reg reg = {
|
||||||
|
/* .iface = */ ggml_backend_vk_reg_i,
|
||||||
|
/* .context = */ nullptr,
|
||||||
|
};
|
||||||
|
|
||||||
bool ggml_backend_is_vk(ggml_backend_t backend) {
|
return &reg;
|
||||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid());
|
|
||||||
}
|
|
||||||
|
|
||||||
int ggml_backend_vk_get_device_count() {
|
|
||||||
return ggml_vk_get_device_count();
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) {
|
|
||||||
ggml_vk_get_device_description(device, description, description_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) {
|
|
||||||
GGML_ASSERT(device < (int) vk_instance.device_indices.size());
|
|
||||||
|
|
||||||
vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]];
|
|
||||||
|
|
||||||
vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties();
|
|
||||||
|
|
||||||
for (const vk::MemoryHeap& heap : memprops.memoryHeaps) {
|
|
||||||
if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
|
|
||||||
*total = heap.size;
|
|
||||||
*free = heap.size;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extension availability
|
// Extension availability
|
||||||
|
|
|
@ -35,10 +35,6 @@
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
|
||||||
#include <unistd.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
#undef GGML_USE_LLAMAFILE
|
#undef GGML_USE_LLAMAFILE
|
||||||
#endif
|
#endif
|
||||||
|
@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <mach/mach.h>
|
||||||
#include <TargetConditionals.h>
|
#include <TargetConditionals.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi
|
||||||
//#define GGML_SOFT_MAX_ACCELERATE
|
//#define GGML_SOFT_MAX_ACCELERATE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
void * ggml_aligned_malloc(size_t size) {
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
return _aligned_malloc(size, TENSOR_ALIGNMENT);
|
||||||
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
|
||||||
#else
|
#else
|
||||||
inline static void * ggml_aligned_malloc(size_t size) {
|
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
|
GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
void * aligned_memory = NULL;
|
void * aligned_memory = NULL;
|
||||||
#ifdef GGML_USE_CPU_HBM
|
#ifdef GGML_USE_CPU_HBM
|
||||||
int result = hbw_posix_memalign(&aligned_memory, 16, size);
|
int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
|
||||||
|
#elif TARGET_OS_OSX
|
||||||
|
kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
|
||||||
|
int result = EFAULT;
|
||||||
|
switch (alloc_status) {
|
||||||
|
case KERN_SUCCESS:
|
||||||
|
result = 0;
|
||||||
|
break;
|
||||||
|
case KERN_INVALID_ADDRESS:
|
||||||
|
result = EINVAL;
|
||||||
|
break;
|
||||||
|
case KERN_NO_SPACE:
|
||||||
|
result = ENOMEM;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
result = EFAULT;
|
||||||
|
break;
|
||||||
|
}
|
||||||
#elif GGML_USE_METAL
|
#elif GGML_USE_METAL
|
||||||
int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
|
const long page_size = sysconf(_SC_PAGESIZE);
|
||||||
|
int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size);
|
||||||
#else
|
#else
|
||||||
int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
|
int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size);
|
||||||
#endif
|
#endif
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
// Handle allocation failure
|
// Handle allocation failure
|
||||||
|
@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return aligned_memory;
|
return aligned_memory;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_aligned_free(void * ptr, size_t size) {
|
||||||
|
GGML_UNUSED(size);
|
||||||
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
|
_aligned_free(ptr);
|
||||||
|
#elif GGML_USE_CPU_HBM
|
||||||
|
if (ptr != NULL) {
|
||||||
|
hbw_free(ptr);
|
||||||
|
}
|
||||||
|
#elif TARGET_OS_OSX
|
||||||
|
if (ptr != NULL) {
|
||||||
|
vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
|
||||||
}
|
}
|
||||||
#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
|
||||||
#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
|
|
||||||
#else
|
#else
|
||||||
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
free(ptr);
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
inline static void * ggml_malloc(size_t size) {
|
inline static void * ggml_malloc(size_t size) {
|
||||||
if (size == 0) {
|
if (size == 0) {
|
||||||
|
@ -3894,7 +3922,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
||||||
|
|
||||||
*ctx = (struct ggml_context) {
|
*ctx = (struct ggml_context) {
|
||||||
/*.mem_size =*/ mem_size,
|
/*.mem_size =*/ mem_size,
|
||||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
|
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
|
||||||
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
||||||
/*.no_alloc =*/ params.no_alloc,
|
/*.no_alloc =*/ params.no_alloc,
|
||||||
/*.no_alloc_save =*/ params.no_alloc,
|
/*.no_alloc_save =*/ params.no_alloc,
|
||||||
|
@ -3934,7 +3962,7 @@ void ggml_free(struct ggml_context * ctx) {
|
||||||
__func__, i, ggml_used_mem(ctx));
|
__func__, i, ggml_used_mem(ctx));
|
||||||
|
|
||||||
if (ctx->mem_buffer_owned) {
|
if (ctx->mem_buffer_owned) {
|
||||||
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
found = true;
|
found = true;
|
||||||
|
@ -19663,9 +19691,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
|
||||||
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
|
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
|
||||||
if (!threadpool) return;
|
if (!threadpool) return;
|
||||||
|
|
||||||
|
const int n_threads = threadpool->n_threads_max;
|
||||||
|
|
||||||
#ifndef GGML_USE_OPENMP
|
#ifndef GGML_USE_OPENMP
|
||||||
struct ggml_compute_state* workers = threadpool->workers;
|
struct ggml_compute_state* workers = threadpool->workers;
|
||||||
const int n_threads = threadpool->n_threads_max;
|
|
||||||
|
|
||||||
ggml_mutex_lock(&threadpool->mutex);
|
ggml_mutex_lock(&threadpool->mutex);
|
||||||
|
|
||||||
|
@ -19685,8 +19714,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
|
||||||
ggml_cond_destroy(&threadpool->cond);
|
ggml_cond_destroy(&threadpool->cond);
|
||||||
#endif // GGML_USE_OPENMP
|
#endif // GGML_USE_OPENMP
|
||||||
|
|
||||||
GGML_ALIGNED_FREE(threadpool->workers);
|
const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads;
|
||||||
GGML_ALIGNED_FREE(threadpool);
|
ggml_aligned_free(threadpool->workers, workers_size);
|
||||||
|
ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool));
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef GGML_USE_OPENMP
|
#ifndef GGML_USE_OPENMP
|
||||||
|
@ -20123,7 +20153,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
|
||||||
struct ggml_cplan * cplan) {
|
struct ggml_cplan * cplan) {
|
||||||
|
|
||||||
struct ggml_threadpool * threadpool =
|
struct ggml_threadpool * threadpool =
|
||||||
GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
|
ggml_aligned_malloc(sizeof(struct ggml_threadpool));
|
||||||
{
|
{
|
||||||
threadpool->cgraph = cgraph;
|
threadpool->cgraph = cgraph;
|
||||||
threadpool->cplan = cplan;
|
threadpool->cplan = cplan;
|
||||||
|
@ -20144,7 +20174,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
|
||||||
|
|
||||||
// Allocate and init workers state
|
// Allocate and init workers state
|
||||||
const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
|
const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads;
|
||||||
struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size);
|
struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size);
|
||||||
|
|
||||||
memset(workers, 0, workers_size);
|
memset(workers, 0, workers_size);
|
||||||
for (int j = 0; j < tpp->n_threads; j++) {
|
for (int j = 0; j < tpp->n_threads; j++) {
|
||||||
|
|
|
@ -152,6 +152,8 @@ class Keys:
|
||||||
MERGES = "tokenizer.ggml.merges"
|
MERGES = "tokenizer.ggml.merges"
|
||||||
BOS_ID = "tokenizer.ggml.bos_token_id"
|
BOS_ID = "tokenizer.ggml.bos_token_id"
|
||||||
EOS_ID = "tokenizer.ggml.eos_token_id"
|
EOS_ID = "tokenizer.ggml.eos_token_id"
|
||||||
|
EOT_ID = "tokenizer.ggml.eot_token_id"
|
||||||
|
EOM_ID = "tokenizer.ggml.eom_token_id"
|
||||||
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
UNK_ID = "tokenizer.ggml.unknown_token_id"
|
||||||
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
SEP_ID = "tokenizer.ggml.seperator_token_id"
|
||||||
PAD_ID = "tokenizer.ggml.padding_token_id"
|
PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||||
|
@ -168,11 +170,16 @@ class Keys:
|
||||||
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
|
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
|
||||||
CHAT_TEMPLATES = "tokenizer.chat_templates"
|
CHAT_TEMPLATES = "tokenizer.chat_templates"
|
||||||
# FIM/Infill special tokens constants
|
# FIM/Infill special tokens constants
|
||||||
|
FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
|
||||||
|
FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
|
||||||
|
FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id"
|
||||||
|
FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
|
||||||
|
FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
|
||||||
|
FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"
|
||||||
|
# deprecated:
|
||||||
PREFIX_ID = "tokenizer.ggml.prefix_token_id"
|
PREFIX_ID = "tokenizer.ggml.prefix_token_id"
|
||||||
SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
|
SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
|
||||||
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
|
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
|
||||||
EOT_ID = "tokenizer.ggml.eot_token_id"
|
|
||||||
EOM_ID = "tokenizer.ggml.eom_token_id"
|
|
||||||
|
|
||||||
class Adapter:
|
class Adapter:
|
||||||
TYPE = "adapter.type"
|
TYPE = "adapter.type"
|
||||||
|
@ -1579,6 +1586,8 @@ KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
|
||||||
KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
|
KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
|
||||||
KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
|
KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
|
||||||
KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
|
KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
|
||||||
|
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
|
||||||
|
KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
|
||||||
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
|
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
|
||||||
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
|
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
|
||||||
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
|
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
|
||||||
|
@ -1586,8 +1595,15 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
|
||||||
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
|
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
|
||||||
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
|
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
|
||||||
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
|
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
|
||||||
KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
|
|
||||||
|
KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
|
||||||
|
KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
|
||||||
|
KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
|
||||||
|
KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
|
||||||
|
KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
|
||||||
|
KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
|
||||||
|
|
||||||
|
# deprecated
|
||||||
|
KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
|
||||||
KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
|
KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
|
||||||
KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
|
KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
|
||||||
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
|
|
||||||
KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
|
|
||||||
|
|
|
@ -843,15 +843,6 @@ class GGUFWriter:
|
||||||
|
|
||||||
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
|
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
|
||||||
|
|
||||||
def add_prefix_token_id(self, id: int) -> None:
|
|
||||||
self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
|
|
||||||
|
|
||||||
def add_suffix_token_id(self, id: int) -> None:
|
|
||||||
self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
|
|
||||||
|
|
||||||
def add_middle_token_id(self, id: int) -> None:
|
|
||||||
self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
|
|
||||||
|
|
||||||
def add_eot_token_id(self, id: int) -> None:
|
def add_eot_token_id(self, id: int) -> None:
|
||||||
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
|
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
|
||||||
|
|
||||||
|
|
|
@ -899,6 +899,7 @@ extern "C" {
|
||||||
// Special tokens
|
// Special tokens
|
||||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||||
|
LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
|
||||||
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
|
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
|
||||||
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
|
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
|
||||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||||
|
@ -907,11 +908,17 @@ extern "C" {
|
||||||
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
|
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
|
||||||
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
|
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
|
||||||
|
|
||||||
// Codellama infill tokens
|
// infill tokens
|
||||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
|
||||||
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
|
DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
|
||||||
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
|
DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
|
||||||
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
|
|
||||||
|
LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
|
||||||
|
LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
|
||||||
|
LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
|
||||||
|
LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
|
||||||
|
LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
|
||||||
|
LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Tokenization
|
// Tokenization
|
||||||
|
@ -948,6 +955,12 @@ extern "C" {
|
||||||
int32_t lstrip,
|
int32_t lstrip,
|
||||||
bool special);
|
bool special);
|
||||||
|
|
||||||
|
// check if token0 is contained as a prefix in token1
|
||||||
|
LLAMA_API bool llama_token_is_prefix(
|
||||||
|
const struct llama_model * model,
|
||||||
|
llama_token token0,
|
||||||
|
llama_token token1);
|
||||||
|
|
||||||
/// @details Convert the provided tokens into text (inverse of llama_tokenize()).
|
/// @details Convert the provided tokens into text (inverse of llama_tokenize()).
|
||||||
/// @param text The char pointer must be large enough to hold the resulting text.
|
/// @param text The char pointer must be large enough to hold the resulting text.
|
||||||
/// @return Returns the number of chars/bytes on success, no more than text_len_max.
|
/// @return Returns the number of chars/bytes on success, no more than text_len_max.
|
||||||
|
@ -1096,6 +1109,9 @@ extern "C" {
|
||||||
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
|
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
|
||||||
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
|
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
|
||||||
|
|
||||||
|
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
|
||||||
|
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
|
||||||
|
|
||||||
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
||||||
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
||||||
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
||||||
|
@ -1140,6 +1156,28 @@ extern "C" {
|
||||||
int32_t n_logit_bias,
|
int32_t n_logit_bias,
|
||||||
const llama_logit_bias * logit_bias);
|
const llama_logit_bias * logit_bias);
|
||||||
|
|
||||||
|
// this sampler is meant to be used for fill-in-the-middle infilling
|
||||||
|
// it's supposed to be used after top_k + top_p sampling
|
||||||
|
//
|
||||||
|
// 1. if 3 times the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
|
||||||
|
// 2. combine probs of tokens that have the same prefix
|
||||||
|
//
|
||||||
|
// example:
|
||||||
|
//
|
||||||
|
// - before:
|
||||||
|
// "hel": 0.5
|
||||||
|
// "hell": 0.2
|
||||||
|
// "hello": 0.1
|
||||||
|
// "dummy": 0.1
|
||||||
|
//
|
||||||
|
// - after:
|
||||||
|
// "hel": 0.8
|
||||||
|
// "dummy": 0.1
|
||||||
|
//
|
||||||
|
// 3. discard non-EOG tokens with low prob
|
||||||
|
// 4. if no non-EOG tokens are left -> pick EOT
|
||||||
|
//
|
||||||
|
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
|
||||||
|
|
||||||
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
|
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
|
||||||
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
|
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
|
||||||
|
|
|
@ -44,7 +44,7 @@ maxhordelen = 400
|
||||||
modelbusy = threading.Lock()
|
modelbusy = threading.Lock()
|
||||||
requestsinqueue = 0
|
requestsinqueue = 0
|
||||||
defaultport = 5001
|
defaultport = 5001
|
||||||
KcppVersion = "1.76"
|
KcppVersion = "1.77"
|
||||||
showdebug = True
|
showdebug = True
|
||||||
guimode = False
|
guimode = False
|
||||||
showsamplerwarning = True
|
showsamplerwarning = True
|
||||||
|
@ -2250,8 +2250,8 @@ def show_gui():
|
||||||
|
|
||||||
import customtkinter as ctk
|
import customtkinter as ctk
|
||||||
nextstate = 0 #0=exit, 1=launch
|
nextstate = 0 #0=exit, 1=launch
|
||||||
original_windowwidth = 550
|
original_windowwidth = 580
|
||||||
original_windowheight = 550
|
original_windowheight = 560
|
||||||
windowwidth = original_windowwidth
|
windowwidth = original_windowwidth
|
||||||
windowheight = original_windowheight
|
windowheight = original_windowheight
|
||||||
ctk.set_appearance_mode("dark")
|
ctk.set_appearance_mode("dark")
|
||||||
|
|
|
@ -1059,6 +1059,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// xtc
|
||||||
|
|
||||||
|
// State of the XTC sampler.
//
// NOTE: llama_sampler_init_xtc() fills this struct with a positional aggregate
//       initializer, so the member order below must not change.
struct llama_sampler_xtc {
    const float    probability; // chance that the sampler acts on a given apply call (<= 0 disables it)
    const float    threshold;   // candidates with p >= threshold are removal candidates (> 0.5 disables it)
    const size_t   min_keep;    // the cut is skipped if fewer than this many candidates would remain

    const uint32_t seed;        // seed as requested by the caller
    uint32_t       seed_cur;    // seed actually fed to rng, as resolved by get_rng_seed(seed)

    std::mt19937   rng;         // generator used for the per-call activation roll
};
|
||||||
|
|
||||||
|
// Name callback of the XTC sampler interface; the sampler argument is not inspected.
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
    const char * const sampler_name = "xtc";

    return sampler_name;
}
|
||||||
|
|
||||||
|
static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||||
|
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
|
||||||
|
|
||||||
|
if (ctx->probability <= 0.0f
|
||||||
|
|| ctx->threshold > 0.5f
|
||||||
|
|| cur_p->size < 2) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
|
||||||
|
float chance = distribution(ctx->rng);
|
||||||
|
if (chance > ctx->probability) return;
|
||||||
|
|
||||||
|
// in case it's not sorted/recalculated yet
|
||||||
|
llama_sampler_softmax_impl(cur_p);
|
||||||
|
|
||||||
|
int pos_last = 0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
|
if (cur_p->data[i].p >= ctx->threshold) {
|
||||||
|
pos_last = i;
|
||||||
|
} else break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) {
|
||||||
|
cur_p->data += pos_last;
|
||||||
|
cur_p->size -= pos_last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
|
||||||
|
const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
|
||||||
|
auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
|
||||||
|
|
||||||
|
// copy the state
|
||||||
|
{
|
||||||
|
auto * result_ctx = (llama_sampler_xtc *) result->ctx;
|
||||||
|
|
||||||
|
result_ctx->rng = ctx->rng;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
|
||||||
|
delete (llama_sampler_xtc *) smpl->ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
|
||||||
|
auto * ctx = (llama_sampler_xtc *) smpl->ctx;
|
||||||
|
ctx->seed_cur = get_rng_seed(ctx->seed);
|
||||||
|
ctx->rng.seed(ctx->seed_cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Callback table that plugs the XTC sampler into the generic llama_sampler interface.
// No accept callback is provided.
static struct llama_sampler_i llama_sampler_xtc_i = {
    /* .name   = */ llama_sampler_xtc_name,
    /* .accept = */ nullptr,
    /* .apply  = */ llama_sample_xtc_apply,
    /* .reset  = */ llama_sampler_xtc_reset,
    /* .clone  = */ llama_sampler_xtc_clone,
    /* .free   = */ llama_sampler_xtc_free,
};
|
||||||
|
|
||||||
|
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
|
||||||
|
auto seed_cur = get_rng_seed(seed);
|
||||||
|
return new llama_sampler {
|
||||||
|
/* .iface = */ &llama_sampler_xtc_i,
|
||||||
|
/* .ctx = */ new llama_sampler_xtc {
|
||||||
|
/* .probability = */ p,
|
||||||
|
/* .threshold = */ t,
|
||||||
|
/* .min_keep = */ min_keep,
|
||||||
|
/* .seed = */ seed,
|
||||||
|
/* .seed_cur = */ seed_cur,
|
||||||
|
/* .rng = */ std::mt19937(seed_cur),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// mirostat
|
// mirostat
|
||||||
|
|
||||||
struct llama_sampler_mirostat {
|
struct llama_sampler_mirostat {
|
||||||
|
@ -1644,6 +1739,207 @@ struct llama_sampler * llama_sampler_init_logit_bias(
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// infill
|
||||||
|
|
||||||
|
//#define GGML_DEBUG_SAMPLER_INFILL
|
||||||
|
|
||||||
|
// State of the infill sampler.
struct llama_sampler_infill {
    // vocab used for the EOG / EOT / prefix queries; stored as a plain pointer to the
    // object passed to llama_sampler_init_infill_impl() and not deleted by the free callback
    const struct llama_vocab * vocab;
};
|
||||||
|
|
||||||
|
// Name callback of the infill sampler interface; the sampler argument is not inspected.
static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
    const char * const sampler_name = "infill";

    return sampler_name;
}
|
||||||
|
|
||||||
|
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||||
|
auto * ctx = (llama_sampler_infill *) smpl->ctx;
|
||||||
|
|
||||||
|
llama_sampler_softmax_impl(cur_p);
|
||||||
|
|
||||||
|
#if defined(GGML_DEBUG_SAMPLER_INFILL)
|
||||||
|
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
|
||||||
|
#else
|
||||||
|
#define LOG_DBG_CUR(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
|
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
|
||||||
|
}
|
||||||
|
|
||||||
|
float p_txt_sum = 0.0f;
|
||||||
|
float p_eog_sum = 0.0f;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
|
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
|
||||||
|
p_eog_sum += cur_p->data[i].p;
|
||||||
|
} else {
|
||||||
|
p_txt_sum += cur_p->data[i].p;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
|
||||||
|
|
||||||
|
LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
|
||||||
|
|
||||||
|
if (3*p_eog_sum*cur_p->size > p_txt_sum) {
|
||||||
|
LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
|
||||||
|
|
||||||
|
// keep just the EOG tokens
|
||||||
|
const auto size_org = cur_p->size;
|
||||||
|
|
||||||
|
cur_p->size = 0;
|
||||||
|
|
||||||
|
float p_sum = 0.0f;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size_org; ++i) {
|
||||||
|
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
|
||||||
|
p_sum += cur_p->data[i].p;
|
||||||
|
|
||||||
|
cur_p->data[cur_p->size++] = cur_p->data[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalize probs
|
||||||
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
|
cur_p->data[i].p /= p_sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t n_combined = 0; GGML_UNUSED(n_combined);
|
||||||
|
|
||||||
|
// combine tokens with common prefix
|
||||||
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
|
for (size_t j = 0; j < cur_p->size; ++j) {
|
||||||
|
if (cur_p->data[i].logit == -INFINITY) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i == j || cur_p->data[j].logit == -INFINITY) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (llama_token_is_prefix_impl(*ctx->vocab, cur_p->data[i].id, cur_p->data[j].id)) {
|
||||||
|
if (cur_p->data[i].p > cur_p->data[j].p) {
|
||||||
|
cur_p->data[i].p += cur_p->data[j].p;
|
||||||
|
cur_p->data[j].logit = -INFINITY;
|
||||||
|
cur_p->data[j].p = 0.0f;
|
||||||
|
} else {
|
||||||
|
cur_p->data[j].p += cur_p->data[i].p;
|
||||||
|
cur_p->data[i].logit = -INFINITY;
|
||||||
|
cur_p->data[i].p = 0.0f;
|
||||||
|
}
|
||||||
|
|
||||||
|
n_combined++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t n_non_eog = 0;
|
||||||
|
|
||||||
|
size_t size_org = cur_p->size;
|
||||||
|
|
||||||
|
float p_sum = 0.0f;
|
||||||
|
float thold = 0.2f;
|
||||||
|
|
||||||
|
cur_p->size = 0;
|
||||||
|
|
||||||
|
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size_org; ++i) {
|
||||||
|
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
|
||||||
|
|
||||||
|
if (cur_p->data[i].p < thold && !is_eog) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_eog) {
|
||||||
|
++n_non_eog;
|
||||||
|
}
|
||||||
|
|
||||||
|
p_sum += cur_p->data[i].p;
|
||||||
|
|
||||||
|
// keep this token
|
||||||
|
cur_p->data[cur_p->size++] = cur_p->data[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
|
||||||
|
|
||||||
|
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
|
||||||
|
if (n_non_eog == 0) {
|
||||||
|
cur_p->size = 1;
|
||||||
|
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
|
||||||
|
cur_p->data[0].logit = 1.0f;
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalize probs
|
||||||
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
|
cur_p->data[i].p /= p_sum;
|
||||||
|
|
||||||
|
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_org = cur_p->size;
|
||||||
|
p_sum = 0.0f;
|
||||||
|
thold = 1.0/(n_non_eog + 1);
|
||||||
|
|
||||||
|
cur_p->size = 0;
|
||||||
|
|
||||||
|
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size_org; ++i) {
|
||||||
|
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
|
||||||
|
|
||||||
|
if (cur_p->data[i].p < thold && !is_eog) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
p_sum += cur_p->data[i].p;
|
||||||
|
|
||||||
|
cur_p->data[cur_p->size++] = cur_p->data[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalize probs
|
||||||
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||||
|
cur_p->data[i].p /= p_sum;
|
||||||
|
|
||||||
|
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef LOG_DBG_CUR
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
|
||||||
|
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
|
||||||
|
return llama_sampler_init_infill_impl(*ctx->vocab);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
|
||||||
|
delete (llama_sampler_infill *) smpl->ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Callback table that plugs the infill sampler into the generic llama_sampler interface.
// No accept or reset callbacks are provided.
static struct llama_sampler_i llama_sampler_infill_i = {
    /* .name   = */ llama_sampler_infill_name,
    /* .accept = */ nullptr,
    /* .apply  = */ llama_sampler_infill_apply,
    /* .reset  = */ nullptr,
    /* .clone  = */ llama_sampler_infill_clone,
    /* .free   = */ llama_sampler_infill_free,
};
|
||||||
|
|
||||||
|
struct llama_sampler * llama_sampler_init_infill_impl(
|
||||||
|
const struct llama_vocab & vocab) {
|
||||||
|
return new llama_sampler {
|
||||||
|
/* .iface = */ &llama_sampler_infill_i,
|
||||||
|
/* .ctx = */ new llama_sampler_infill {
|
||||||
|
/* .vocab = */ &vocab,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// utils
|
// utils
|
||||||
|
|
||||||
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
|
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
|
||||||
|
|
|
@ -4,8 +4,6 @@
|
||||||
|
|
||||||
#include "llama-grammar.h"
|
#include "llama-grammar.h"
|
||||||
|
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
struct llama_vocab;
|
struct llama_vocab;
|
||||||
struct llama_grammar;
|
struct llama_grammar;
|
||||||
|
|
||||||
|
@ -27,3 +25,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||||
const struct llama_vocab & vocab,
|
const struct llama_vocab & vocab,
|
||||||
const char * grammar_str,
|
const char * grammar_str,
|
||||||
const char * grammar_root);
|
const char * grammar_root);
|
||||||
|
|
||||||
|
struct llama_sampler * llama_sampler_init_infill_impl(
|
||||||
|
const struct llama_vocab & vocab);
|
||||||
|
|
|
@ -230,7 +230,7 @@ struct llm_tokenizer_spm_session {
|
||||||
}
|
}
|
||||||
|
|
||||||
// seed the work queue with all possible 2-character tokens.
|
// seed the work queue with all possible 2-character tokens.
|
||||||
for (size_t i = 1; i < symbols.size(); ++i) {
|
for (int i = 1; i < (int) symbols.size(); ++i) {
|
||||||
try_add_bigram(i - 1, i);
|
try_add_bigram(i - 1, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -790,7 +790,7 @@ struct llm_tokenizer_bpe_session {
|
||||||
index++;
|
index++;
|
||||||
symbols.emplace_back(sym);
|
symbols.emplace_back(sym);
|
||||||
}
|
}
|
||||||
for (size_t i = 1; i < symbols.size(); ++i) {
|
for (int i = 1; i < (int) symbols.size(); ++i) {
|
||||||
add_new_bigram(i - 1, i);
|
add_new_bigram(i - 1, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1918,6 +1918,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_eos_id;
|
return vocab.special_eos_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the end-of-turn token id stored in the vocab (special_eot_id).
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
    const llama_token eot_id = vocab.special_eot_id;

    return eot_id;
}
|
||||||
|
|
||||||
|
// Returns the vocab's special_eom_id token id.
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
    const llama_token eom_id = vocab.special_eom_id;

    return eom_id;
}
|
||||||
|
|
||||||
llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_cls_id;
|
return vocab.special_cls_id;
|
||||||
}
|
}
|
||||||
|
@ -1943,23 +1951,39 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_prefix_id;
|
return vocab.special_fim_pre_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_middle_id;
|
return vocab.special_fim_mid_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_suffix_id;
|
return vocab.special_fim_suf_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_eot_id;
|
return vocab.special_fim_pre_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_eom_id;
|
return vocab.special_fim_suf_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the vocab's fill-in-the-middle "mid" token id (special_fim_mid_id).
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
    const llama_token fim_mid_id = vocab.special_fim_mid_id;

    return fim_mid_id;
}
|
||||||
|
|
||||||
|
// Returns the vocab's fill-in-the-middle "pad" token id (special_fim_pad_id).
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
    const llama_token fim_pad_id = vocab.special_fim_pad_id;

    return fim_pad_id;
}
|
||||||
|
|
||||||
|
// Returns the vocab's fill-in-the-middle "rep" token id (special_fim_rep_id).
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
    const llama_token fim_rep_id = vocab.special_fim_rep_id;

    return fim_rep_id;
}
|
||||||
|
|
||||||
|
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
|
||||||
|
return vocab.special_fim_sep_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t llama_tokenize_impl(
|
int32_t llama_tokenize_impl(
|
||||||
|
@ -2094,6 +2118,23 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool llama_token_is_prefix_impl(
|
||||||
|
const struct llama_vocab & vocab,
|
||||||
|
llama_token token0,
|
||||||
|
llama_token token1) {
|
||||||
|
char text_buf_0[128];
|
||||||
|
char text_buf_1[128];
|
||||||
|
|
||||||
|
const int32_t len0 = llama_token_to_piece_impl(vocab, token0, text_buf_0, sizeof(text_buf_0) - 1, 0, false);
|
||||||
|
const int32_t len1 = llama_token_to_piece_impl(vocab, token1, text_buf_1, sizeof(text_buf_1) - 1, 0, false);
|
||||||
|
|
||||||
|
if (len0 <= 0 || len1 <= 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len0 <= len1 && memcmp(text_buf_0, text_buf_1, len0) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
int32_t llama_detokenize_impl(
|
int32_t llama_detokenize_impl(
|
||||||
const struct llama_vocab & vocab,
|
const struct llama_vocab & vocab,
|
||||||
const llama_token * tokens,
|
const llama_token * tokens,
|
||||||
|
|
|
@ -37,8 +37,11 @@ struct llama_vocab {
|
||||||
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
||||||
|
|
||||||
// default LLaMA special tokens
|
// default LLaMA special tokens
|
||||||
|
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
|
||||||
id special_bos_id = 1;
|
id special_bos_id = 1;
|
||||||
id special_eos_id = 2;
|
id special_eos_id = 2;
|
||||||
|
id special_eot_id = LLAMA_TOKEN_NULL;
|
||||||
|
id special_eom_id = LLAMA_TOKEN_NULL;
|
||||||
id special_unk_id = 0;
|
id special_unk_id = 0;
|
||||||
id special_sep_id = LLAMA_TOKEN_NULL;
|
id special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
id special_pad_id = LLAMA_TOKEN_NULL;
|
id special_pad_id = LLAMA_TOKEN_NULL;
|
||||||
|
@ -46,11 +49,14 @@ struct llama_vocab {
|
||||||
id special_mask_id = LLAMA_TOKEN_NULL;
|
id special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
||||||
id linefeed_id = 13;
|
id linefeed_id = 13;
|
||||||
id special_prefix_id = LLAMA_TOKEN_NULL;
|
|
||||||
id special_suffix_id = LLAMA_TOKEN_NULL;
|
// fim tokens
|
||||||
id special_middle_id = LLAMA_TOKEN_NULL;
|
id special_fim_pre_id = LLAMA_TOKEN_NULL;
|
||||||
id special_eot_id = LLAMA_TOKEN_NULL; // TODO: move above after "eos_id", and here add "file separator" token
|
id special_fim_suf_id = LLAMA_TOKEN_NULL;
|
||||||
id special_eom_id = LLAMA_TOKEN_NULL;
|
id special_fim_mid_id = LLAMA_TOKEN_NULL;
|
||||||
|
id special_fim_pad_id = LLAMA_TOKEN_NULL;
|
||||||
|
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
|
||||||
|
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
|
||||||
|
|
||||||
// set of all tokens that cause "end of generation"
|
// set of all tokens that cause "end of generation"
|
||||||
std::set<id> special_eog_ids;
|
std::set<id> special_eog_ids;
|
||||||
|
@ -104,19 +110,26 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
|
||||||
|
|
||||||
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
|
||||||
|
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
|
||||||
|
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
|
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
|
||||||
|
|
||||||
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
|
|
||||||
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
|
|
||||||
|
|
||||||
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
|
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
|
||||||
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
|
|
||||||
llama_token llama_token_eom_impl (const struct llama_vocab & vocab);
|
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
|
||||||
|
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
|
||||||
|
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
|
||||||
|
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
|
||||||
|
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
|
||||||
|
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
|
||||||
|
|
||||||
|
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
|
||||||
|
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
|
||||||
|
|
||||||
int32_t llama_tokenize_impl(
|
int32_t llama_tokenize_impl(
|
||||||
const struct llama_vocab & vocab,
|
const struct llama_vocab & vocab,
|
||||||
|
@ -136,6 +149,12 @@ int32_t llama_token_to_piece_impl(
|
||||||
int32_t lstrip,
|
int32_t lstrip,
|
||||||
bool special);
|
bool special);
|
||||||
|
|
||||||
|
// check if token0 is contained as a prefix in token1
|
||||||
|
bool llama_token_is_prefix_impl(
|
||||||
|
const struct llama_vocab & vocab,
|
||||||
|
llama_token token0,
|
||||||
|
llama_token token1);
|
||||||
|
|
||||||
int32_t llama_detokenize_impl(
|
int32_t llama_detokenize_impl(
|
||||||
const struct llama_vocab & vocab,
|
const struct llama_vocab & vocab,
|
||||||
const llama_token * tokens,
|
const llama_token * tokens,
|
||||||
|
|
428
src/llama.cpp
428
src/llama.cpp
|
@ -15,8 +15,6 @@
|
||||||
# include "ggml-cuda.h"
|
# include "ggml-cuda.h"
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
# include "ggml-opencl.h"
|
# include "ggml-opencl.h"
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
# include "ggml-vulkan.h"
|
|
||||||
#elif defined(GGML_USE_SYCL)
|
#elif defined(GGML_USE_SYCL)
|
||||||
# include "ggml-sycl.h"
|
# include "ggml-sycl.h"
|
||||||
#elif defined(GGML_USE_KOMPUTE)
|
#elif defined(GGML_USE_KOMPUTE)
|
||||||
|
@ -356,6 +354,8 @@ enum llm_kv {
|
||||||
LLM_KV_TOKENIZER_MERGES,
|
LLM_KV_TOKENIZER_MERGES,
|
||||||
LLM_KV_TOKENIZER_BOS_ID,
|
LLM_KV_TOKENIZER_BOS_ID,
|
||||||
LLM_KV_TOKENIZER_EOS_ID,
|
LLM_KV_TOKENIZER_EOS_ID,
|
||||||
|
LLM_KV_TOKENIZER_EOT_ID,
|
||||||
|
LLM_KV_TOKENIZER_EOM_ID,
|
||||||
LLM_KV_TOKENIZER_UNK_ID,
|
LLM_KV_TOKENIZER_UNK_ID,
|
||||||
LLM_KV_TOKENIZER_SEP_ID,
|
LLM_KV_TOKENIZER_SEP_ID,
|
||||||
LLM_KV_TOKENIZER_PAD_ID,
|
LLM_KV_TOKENIZER_PAD_ID,
|
||||||
|
@ -368,14 +368,20 @@ enum llm_kv {
|
||||||
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
|
LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
|
||||||
LLM_KV_TOKENIZER_HF_JSON,
|
LLM_KV_TOKENIZER_HF_JSON,
|
||||||
LLM_KV_TOKENIZER_RWKV,
|
LLM_KV_TOKENIZER_RWKV,
|
||||||
LLM_KV_TOKENIZER_PREFIX_ID,
|
LLM_KV_TOKENIZER_FIM_PRE_ID,
|
||||||
LLM_KV_TOKENIZER_SUFFIX_ID,
|
LLM_KV_TOKENIZER_FIM_SUF_ID,
|
||||||
LLM_KV_TOKENIZER_MIDDLE_ID,
|
LLM_KV_TOKENIZER_FIM_MID_ID,
|
||||||
LLM_KV_TOKENIZER_EOT_ID,
|
LLM_KV_TOKENIZER_FIM_PAD_ID,
|
||||||
LLM_KV_TOKENIZER_EOM_ID,
|
LLM_KV_TOKENIZER_FIM_REP_ID,
|
||||||
|
LLM_KV_TOKENIZER_FIM_SEP_ID,
|
||||||
|
|
||||||
LLM_KV_ADAPTER_TYPE,
|
LLM_KV_ADAPTER_TYPE,
|
||||||
LLM_KV_ADAPTER_LORA_ALPHA,
|
LLM_KV_ADAPTER_LORA_ALPHA,
|
||||||
|
|
||||||
|
// deprecated:
|
||||||
|
LLM_KV_TOKENIZER_PREFIX_ID,
|
||||||
|
LLM_KV_TOKENIZER_SUFFIX_ID,
|
||||||
|
LLM_KV_TOKENIZER_MIDDLE_ID,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
|
@ -464,6 +470,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
{ LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
|
||||||
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
{ LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
|
{ LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
|
||||||
|
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
||||||
|
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
{ LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
{ LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
||||||
|
@ -476,14 +484,20 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||||
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
|
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
|
||||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||||
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
|
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
|
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
|
{ LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" },
|
||||||
{ LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
|
{ LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" },
|
||||||
|
{ LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" },
|
||||||
|
|
||||||
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
|
{ LLM_KV_ADAPTER_TYPE, "adapter.type" },
|
||||||
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
|
{ LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
|
||||||
|
|
||||||
|
// deprecated
|
||||||
|
{ LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
|
||||||
|
{ LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
|
||||||
|
{ LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LLM_KV {
|
struct LLM_KV {
|
||||||
|
@ -3417,8 +3431,6 @@ static int llama_get_device_count(const llama_model & model) {
|
||||||
|
|
||||||
#if defined(GGML_USE_SYCL)
|
#if defined(GGML_USE_SYCL)
|
||||||
count += ggml_backend_sycl_get_device_count();
|
count += ggml_backend_sycl_get_device_count();
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
count += ggml_backend_vk_get_device_count();
|
|
||||||
#elif defined(GGML_USE_CANN)
|
#elif defined(GGML_USE_CANN)
|
||||||
count += ggml_backend_cann_get_device_count();
|
count += ggml_backend_cann_get_device_count();
|
||||||
#endif
|
#endif
|
||||||
|
@ -3450,10 +3462,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CPU_HBM)
|
#elif defined(GGML_USE_CPU_HBM)
|
||||||
buft = ggml_backend_cpu_hbm_buffer_type();
|
buft = ggml_backend_cpu_hbm_buffer_type();
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
if (host_buffer) {
|
|
||||||
buft = ggml_backend_vk_host_buffer_type();
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (buft == nullptr) {
|
if (buft == nullptr) {
|
||||||
|
@ -3472,9 +3480,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
||||||
}
|
}
|
||||||
device -= (int)model.devices.size();
|
device -= (int)model.devices.size();
|
||||||
|
|
||||||
#if defined(GGML_USE_VULKAN)
|
#if defined(GGML_USE_SYCL)
|
||||||
buft = ggml_backend_vk_buffer_type(device);
|
|
||||||
#elif defined(GGML_USE_SYCL)
|
|
||||||
buft = ggml_backend_sycl_buffer_type(device);
|
buft = ggml_backend_sycl_buffer_type(device);
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
buft = ggml_backend_opencl_buffer_type();
|
buft = ggml_backend_opencl_buffer_type();
|
||||||
|
@ -3536,11 +3542,6 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
||||||
return free;
|
return free;
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
size_t total;
|
|
||||||
size_t free;
|
|
||||||
ggml_backend_vk_get_device_memory(device, &free, &total);
|
|
||||||
return free;
|
|
||||||
#elif defined(GGML_USE_CANN)
|
#elif defined(GGML_USE_CANN)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
|
@ -6197,14 +6198,14 @@ static void llm_load_vocab(
|
||||||
vocab.type = LLAMA_VOCAB_TYPE_NONE;
|
vocab.type = LLAMA_VOCAB_TYPE_NONE;
|
||||||
|
|
||||||
// default special tokens
|
// default special tokens
|
||||||
vocab.special_bos_id = -1;
|
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_eos_id = -1;
|
vocab.special_eos_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_unk_id = -1;
|
vocab.special_unk_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_sep_id = -1;
|
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_pad_id = -1;
|
vocab.special_pad_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_cls_id = -1;
|
vocab.special_cls_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_mask_id = -1;
|
vocab.special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.linefeed_id = -1;
|
vocab.linefeed_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
||||||
// read vocab size from metadata
|
// read vocab size from metadata
|
||||||
if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
|
if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
|
||||||
|
@ -6221,16 +6222,16 @@ static void llm_load_vocab(
|
||||||
vocab.special_bos_id = 1;
|
vocab.special_bos_id = 1;
|
||||||
vocab.special_eos_id = 2;
|
vocab.special_eos_id = 2;
|
||||||
vocab.special_unk_id = 0;
|
vocab.special_unk_id = 0;
|
||||||
vocab.special_sep_id = -1;
|
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_pad_id = -1;
|
vocab.special_pad_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_cls_id = -1;
|
vocab.special_cls_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_mask_id = -1;
|
vocab.special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
} else if (tokenizer_model == "bert") {
|
} else if (tokenizer_model == "bert") {
|
||||||
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
vocab.type = LLAMA_VOCAB_TYPE_WPM;
|
||||||
|
|
||||||
// default special tokens
|
// default special tokens
|
||||||
vocab.special_bos_id = -1;
|
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_eos_id = -1;
|
vocab.special_eos_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_unk_id = 100;
|
vocab.special_unk_id = 100;
|
||||||
vocab.special_sep_id = 102;
|
vocab.special_sep_id = 102;
|
||||||
vocab.special_pad_id = 0;
|
vocab.special_pad_id = 0;
|
||||||
|
@ -6275,22 +6276,22 @@ static void llm_load_vocab(
|
||||||
// default special tokens
|
// default special tokens
|
||||||
vocab.special_bos_id = 11;
|
vocab.special_bos_id = 11;
|
||||||
vocab.special_eos_id = 11;
|
vocab.special_eos_id = 11;
|
||||||
vocab.special_unk_id = -1;
|
vocab.special_unk_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_sep_id = -1;
|
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_pad_id = -1;
|
vocab.special_pad_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_cls_id = -1;
|
vocab.special_cls_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_mask_id = -1;
|
vocab.special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
} else if (tokenizer_model == "t5") {
|
} else if (tokenizer_model == "t5") {
|
||||||
vocab.type = LLAMA_VOCAB_TYPE_UGM;
|
vocab.type = LLAMA_VOCAB_TYPE_UGM;
|
||||||
|
|
||||||
// default special tokens
|
// default special tokens
|
||||||
vocab.special_bos_id = -1;
|
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_eos_id = 1;
|
vocab.special_eos_id = 1;
|
||||||
vocab.special_unk_id = 2;
|
vocab.special_unk_id = 2;
|
||||||
vocab.special_sep_id = -1;
|
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_pad_id = 0;
|
vocab.special_pad_id = 0;
|
||||||
vocab.special_cls_id = -1;
|
vocab.special_cls_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_mask_id = -1;
|
vocab.special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
||||||
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
||||||
if (precompiled_charsmap_keyidx != -1) {
|
if (precompiled_charsmap_keyidx != -1) {
|
||||||
|
@ -6313,11 +6314,11 @@ static void llm_load_vocab(
|
||||||
vocab.type = LLAMA_VOCAB_TYPE_RWKV;
|
vocab.type = LLAMA_VOCAB_TYPE_RWKV;
|
||||||
|
|
||||||
// default special tokens
|
// default special tokens
|
||||||
vocab.special_bos_id = -1;
|
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_eos_id = -1;
|
vocab.special_eos_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_unk_id = -1;
|
vocab.special_unk_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_sep_id = -1;
|
vocab.special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
vocab.special_pad_id = -1;
|
vocab.special_pad_id = LLAMA_TOKEN_NULL;
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
||||||
}
|
}
|
||||||
|
@ -6401,7 +6402,7 @@ static void llm_load_vocab(
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "chatglm-bpe") {
|
tokenizer_pre == "chatglm-bpe") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
|
||||||
vocab.special_bos_id = -1;
|
vocab.special_bos_id = LLAMA_TOKEN_NULL;
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "viking") {
|
tokenizer_pre == "viking") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
|
||||||
|
@ -6528,44 +6529,6 @@ static void llm_load_vocab(
|
||||||
|
|
||||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||||
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
|
||||||
// For Fill-In-the-Middle (FIM)/infill models which where converted
|
|
||||||
// prior to support of FIM special tokens in GGUF, the following
|
|
||||||
// will allow those models to continue to work. The general names
|
|
||||||
// of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
|
|
||||||
// CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
|
|
||||||
// new versions of these models have been published.
|
|
||||||
std::string gen_name;
|
|
||||||
ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
|
|
||||||
|
|
||||||
std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
|
|
||||||
[](unsigned char c){ return std::tolower(c); });
|
|
||||||
|
|
||||||
if (gen_name.find("code") != std::string::npos) {
|
|
||||||
if (model.arch == LLM_ARCH_LLAMA
|
|
||||||
&& 32010 < vocab.id_to_token.size()
|
|
||||||
&& vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
|
|
||||||
&& vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
|
|
||||||
&& vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
|
|
||||||
&& vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
|
|
||||||
vocab.special_prefix_id = 32007;
|
|
||||||
vocab.special_suffix_id = 32008;
|
|
||||||
vocab.special_middle_id = 32009;
|
|
||||||
vocab.special_eot_id = 32010;
|
|
||||||
} else if (model.arch == LLM_ARCH_GEMMA
|
|
||||||
&& 107 < vocab.id_to_token.size()
|
|
||||||
&& vocab.id_to_token[67].text == "<|fim_prefix|>"
|
|
||||||
&& vocab.id_to_token[69].text == "<|fim_suffix|>"
|
|
||||||
&& vocab.id_to_token[68].text == "<|fim_middle|>"
|
|
||||||
&& vocab.id_to_token[107].text == "<end_of_turn>") {
|
|
||||||
vocab.special_prefix_id = 67;
|
|
||||||
vocab.special_suffix_id = 69;
|
|
||||||
vocab.special_middle_id = 68;
|
|
||||||
// TODO: this is not EOT, it is "file separator" token, needs fix
|
|
||||||
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
|
|
||||||
//vocab.special_eot_id = 70;
|
|
||||||
vocab.special_eot_id = 107;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
try {
|
try {
|
||||||
vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
|
vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
|
||||||
} catch (const std::exception & e) {
|
} catch (const std::exception & e) {
|
||||||
|
@ -6595,16 +6558,24 @@ static void llm_load_vocab(
|
||||||
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
|
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
|
||||||
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
|
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
|
||||||
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
|
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
|
||||||
|
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
|
||||||
|
{ LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
|
||||||
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
|
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
|
||||||
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
|
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
|
||||||
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
|
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
|
||||||
{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
|
{ LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
|
||||||
{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
|
{ LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
|
||||||
{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
|
{ LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
|
||||||
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
|
{ LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
|
||||||
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
|
{ LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
|
||||||
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
|
{ LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
|
||||||
{ LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
|
{ LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
|
||||||
|
{ LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
|
||||||
|
|
||||||
|
// deprecated
|
||||||
|
{ LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
|
||||||
|
{ LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
|
||||||
|
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
|
||||||
};
|
};
|
||||||
|
|
||||||
for (const auto & it : special_token_types) {
|
for (const auto & it : special_token_types) {
|
||||||
|
@ -6635,46 +6606,140 @@ static void llm_load_vocab(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
|
// auto-detect special tokens by text
|
||||||
//
|
// TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
|
||||||
// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
|
// for now, we apply this workaround to find the tokens based on their text
|
||||||
// for now, we apply this workaround to find the EOT token based on its text
|
|
||||||
if (vocab.special_eot_id == -1) {
|
|
||||||
for (const auto & t : vocab.token_to_id) {
|
for (const auto & t : vocab.token_to_id) {
|
||||||
|
// find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
|
||||||
|
if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
|
||||||
if (false
|
if (false
|
||||||
// TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
|
|
||||||
// need to fix convert script
|
|
||||||
//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
|
|
||||||
|| t.first == "<|eot_id|>"
|
|| t.first == "<|eot_id|>"
|
||||||
|| t.first == "<|im_end|>"
|
|| t.first == "<|im_end|>"
|
||||||
|| t.first == "<|end|>"
|
|| t.first == "<|end|>"
|
||||||
|| t.first == "<end_of_turn>"
|
|| t.first == "<end_of_turn>"
|
||||||
|| t.first == "<|endoftext|>"
|
|| t.first == "<|endoftext|>"
|
||||||
|| t.first == "<EOT>"
|
|| t.first == "<EOT>"
|
||||||
|
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
|
||||||
) {
|
) {
|
||||||
vocab.special_eot_id = t.second;
|
vocab.special_eot_id = t.second;
|
||||||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||||
__func__, t.first.c_str());
|
__func__, t.second, t.first.c_str());
|
||||||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// find EOM token: "<|eom_id|>"
|
// find EOM token: "<|eom_id|>"
|
||||||
//
|
if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
|
||||||
// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
|
if (false
|
||||||
// for now, we apply this workaround to find the EOM token based on its text
|
|| t.first == "<|eom_id|>"
|
||||||
if (vocab.special_eom_id == -1) {
|
) {
|
||||||
const auto & t = vocab.token_to_id.find("<|eom_id|>");
|
vocab.special_eom_id = t.second;
|
||||||
if (t != vocab.token_to_id.end()) {
|
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
vocab.special_eom_id = t->second;
|
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||||
if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
__func__, t.second, t.first.c_str());
|
||||||
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
__func__, t->first.c_str());
|
}
|
||||||
vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
|
||||||
|
if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
|
||||||
|
if (false
|
||||||
|
|| t.first == "<|fim_prefix|>" // Qwen
|
||||||
|
|| t.first == "<fim-prefix>"
|
||||||
|
|| t.first == "<|fim▁begin|>" // DeepSeek
|
||||||
|
|| t.first == "<PRE>"
|
||||||
|
) {
|
||||||
|
vocab.special_fim_pre_id = t.second;
|
||||||
|
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
|
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||||
|
__func__, t.second, t.first.c_str());
|
||||||
|
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
|
||||||
|
if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
|
||||||
|
if (false
|
||||||
|
|| t.first == "<|fim_suffix|>" // Qwen
|
||||||
|
|| t.first == "<fim-suffix>"
|
||||||
|
|| t.first == "<|fim▁hole|>" // DeepSeek
|
||||||
|
|| t.first == "<SUF>"
|
||||||
|
) {
|
||||||
|
vocab.special_fim_suf_id = t.second;
|
||||||
|
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
|
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||||
|
__func__, t.second, t.first.c_str());
|
||||||
|
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
|
||||||
|
if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
|
||||||
|
if (false
|
||||||
|
|| t.first == "<|fim_middle|>" // Qwen
|
||||||
|
|| t.first == "<fim-middle>"
|
||||||
|
|| t.first == "<|fim▁end|>" // DeepSeek
|
||||||
|
|| t.first == "<MID>"
|
||||||
|
) {
|
||||||
|
vocab.special_fim_mid_id = t.second;
|
||||||
|
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
|
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||||
|
__func__, t.second, t.first.c_str());
|
||||||
|
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
|
||||||
|
if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
|
||||||
|
if (false
|
||||||
|
|| t.first == "<|fim_pad|>" // Qwen
|
||||||
|
|| t.first == "<fim-pad>"
|
||||||
|
|| t.first == "<PAD>"
|
||||||
|
) {
|
||||||
|
vocab.special_fim_pad_id = t.second;
|
||||||
|
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
|
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||||
|
__func__, t.second, t.first.c_str());
|
||||||
|
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
|
||||||
|
if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
|
||||||
|
if (false
|
||||||
|
|| t.first == "<|fim_repo|>" // Qwen
|
||||||
|
|| t.first == "<|repo_name|>"
|
||||||
|
|| t.first == "<fim-repo>"
|
||||||
|
|| t.first == "<REPO>"
|
||||||
|
) {
|
||||||
|
vocab.special_fim_rep_id = t.second;
|
||||||
|
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
|
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||||
|
__func__, t.second, t.first.c_str());
|
||||||
|
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// find FIM_SEP token: "<|file_sep|>"
|
||||||
|
if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
|
||||||
|
if (false
|
||||||
|
|| t.first == "<|file_sep|>" // Qwen
|
||||||
|
) {
|
||||||
|
vocab.special_fim_sep_id = t.second;
|
||||||
|
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
|
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||||
|
__func__, t.second, t.first.c_str());
|
||||||
|
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6683,6 +6748,19 @@ static void llm_load_vocab(
|
||||||
// this is currently determined based on the token text, which is obviously not ideal
|
// this is currently determined based on the token text, which is obviously not ideal
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
|
// ref: https://github.com/ggerganov/llama.cpp/issues/9606
|
||||||
vocab.special_eog_ids.clear();
|
vocab.special_eog_ids.clear();
|
||||||
|
|
||||||
|
if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
|
||||||
|
vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
|
||||||
|
vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
|
||||||
|
vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
|
||||||
|
}
|
||||||
|
|
||||||
for (const auto & t : vocab.token_to_id) {
|
for (const auto & t : vocab.token_to_id) {
|
||||||
if (false
|
if (false
|
||||||
|| t.first == "<|eot_id|>"
|
|| t.first == "<|eot_id|>"
|
||||||
|
@ -6695,24 +6773,31 @@ static void llm_load_vocab(
|
||||||
) {
|
) {
|
||||||
vocab.special_eog_ids.insert(t.second);
|
vocab.special_eog_ids.insert(t.second);
|
||||||
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
||||||
LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
|
||||||
__func__, t.first.c_str());
|
__func__, t.second, t.first.c_str());
|
||||||
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// token is control, but not marked as EOG -> print a warning
|
||||||
|
if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
|
||||||
|
LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
|
||||||
|
__func__, t.second, t.first.c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
|
// sanity checks
|
||||||
|
if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
|
||||||
vocab.special_eog_ids.insert(vocab.special_eos_id);
|
vocab.special_eog_ids.insert(vocab.special_eos_id);
|
||||||
LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
|
LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
|
if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
|
||||||
vocab.special_eog_ids.insert(vocab.special_eot_id);
|
vocab.special_eog_ids.insert(vocab.special_eot_id);
|
||||||
LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
|
LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
|
if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
|
||||||
vocab.special_eog_ids.insert(vocab.special_eom_id);
|
vocab.special_eog_ids.insert(vocab.special_eom_id);
|
||||||
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
|
LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
|
||||||
}
|
}
|
||||||
|
@ -6908,6 +6993,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||||
// special tokens
|
// special tokens
|
||||||
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
||||||
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
||||||
|
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
||||||
|
if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
|
||||||
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
||||||
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
||||||
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
||||||
|
@ -6915,11 +7002,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||||
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
|
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
|
||||||
|
|
||||||
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
||||||
if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
|
|
||||||
if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
|
if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
|
||||||
if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
|
if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
|
||||||
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
|
||||||
if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
|
if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
|
||||||
|
if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
|
||||||
|
if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
|
||||||
|
|
||||||
for (const auto & id : vocab.special_eog_ids) {
|
for (const auto & id : vocab.special_eog_ids) {
|
||||||
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
|
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
|
||||||
|
@ -16061,9 +16150,11 @@ struct llm_build_context {
|
||||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||||
|
|
||||||
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
|
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
|
||||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
|
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||||
cb(cur, "result_output", -1);
|
cb(cur, "result_output", -1);
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
return gf;
|
return gf;
|
||||||
|
@ -19059,7 +19150,7 @@ bool llama_supports_mlock(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llama_supports_gpu_offload(void) {
|
bool llama_supports_gpu_offload(void) {
|
||||||
#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_VULKAN) || \
|
#if defined(GGML_USE_CLBLAST) || \
|
||||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||||
return true;
|
return true;
|
||||||
|
@ -19191,10 +19282,15 @@ struct llama_model * llama_load_model_from_file(
|
||||||
|
|
||||||
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
case GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||||
case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
|
case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
|
||||||
|
{
|
||||||
|
size_t free, total; // NOLINT
|
||||||
|
ggml_backend_dev_memory(dev, &free, &total);
|
||||||
|
LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
|
||||||
model->devices.push_back(dev);
|
model->devices.push_back(dev);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int status = llama_model_load(path_model, *model, params);
|
int status = llama_model_load(path_model, *model, params);
|
||||||
GGML_ASSERT(status <= 0);
|
GGML_ASSERT(status <= 0);
|
||||||
|
@ -19387,32 +19483,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
main_gpu -= (int)model->devices.size();
|
main_gpu -= (int)model->devices.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(GGML_USE_VULKAN)
|
#if defined(GGML_USE_SYCL)
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
|
||||||
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
|
|
||||||
ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
|
|
||||||
if (backend == nullptr) {
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
ctx->backends.push_back(backend);
|
|
||||||
} else {
|
|
||||||
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
|
||||||
ggml_backend_t backend = ggml_backend_vk_init(device);
|
|
||||||
if (backend == nullptr) {
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
ctx->backends.push_back(backend);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#elif defined(GGML_USE_SYCL)
|
|
||||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||||
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
|
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
|
||||||
|
@ -21387,6 +21458,10 @@ llama_token llama_token_eos(const struct llama_model * model) {
|
||||||
return llama_token_eos_impl(model->vocab);
|
return llama_token_eos_impl(model->vocab);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_token llama_token_eot(const struct llama_model * model) {
|
||||||
|
return llama_token_eot_impl(model->vocab);
|
||||||
|
}
|
||||||
|
|
||||||
llama_token llama_token_cls(const struct llama_model * model) {
|
llama_token llama_token_cls(const struct llama_model * model) {
|
||||||
return llama_token_cls_impl(model->vocab);
|
return llama_token_cls_impl(model->vocab);
|
||||||
}
|
}
|
||||||
|
@ -21423,8 +21498,28 @@ llama_token llama_token_suffix(const struct llama_model * model) {
|
||||||
return llama_token_suffix_impl(model->vocab);
|
return llama_token_suffix_impl(model->vocab);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_eot(const struct llama_model * model) {
|
llama_token llama_token_fim_pre(const struct llama_model * model) {
|
||||||
return llama_token_eot_impl(model->vocab);
|
return llama_token_fim_pre_impl(model->vocab);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token llama_token_fim_suf(const struct llama_model * model) {
|
||||||
|
return llama_token_fim_suf_impl(model->vocab);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token llama_token_fim_mid(const struct llama_model * model) {
|
||||||
|
return llama_token_fim_mid_impl(model->vocab);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token llama_token_fim_pad(const struct llama_model * model) {
|
||||||
|
return llama_token_fim_pad_impl(model->vocab);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token llama_token_fim_rep(const struct llama_model * model) {
|
||||||
|
return llama_token_fim_rep_impl(model->vocab);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_token llama_token_fim_sep(const struct llama_model * model) {
|
||||||
|
return llama_token_fim_sep_impl(model->vocab);
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -21452,6 +21547,13 @@ int32_t llama_token_to_piece(
|
||||||
return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
|
return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool llama_token_is_prefix(
|
||||||
|
const struct llama_model * model,
|
||||||
|
llama_token token0,
|
||||||
|
llama_token token1) {
|
||||||
|
return llama_token_is_prefix_impl(model->vocab, token0, token1);
|
||||||
|
}
|
||||||
|
|
||||||
int32_t llama_detokenize(
|
int32_t llama_detokenize(
|
||||||
const struct llama_model * model,
|
const struct llama_model * model,
|
||||||
const llama_token * tokens,
|
const llama_token * tokens,
|
||||||
|
@ -21782,6 +21884,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
|
||||||
return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
|
return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
|
||||||
|
return llama_sampler_init_infill_impl(model->vocab);
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// model split
|
// model split
|
||||||
//
|
//
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue