Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/llama-cli-cuda.Dockerfile
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-intel.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	.github/workflows/docker.yml
#	docs/docker.md
#	examples/llama-bench/llama-bench.cpp
#	flake.lock
#	ggml/include/ggml.h
#	ggml/src/CMakeLists.txt
#	scripts/sync-ggml.last
#	src/llama.cpp
#	tests/test-backend-ops.cpp
#	tests/test-grad0.cpp
#	tests/test-rope.cpp
Author: Concedo
Date:   2024-08-30 10:37:39 +08:00
Commit: d220495dd4

42 changed files with 100585 additions and 99448 deletions


@@ -252,6 +252,57 @@ int32_t cpu_get_num_math() {
return cpu_get_num_physical_cores();
}
// Helper for setting process priority
#if defined(_WIN32)
bool set_process_priority(enum ggml_sched_priority prio) {
if (prio == GGML_SCHED_PRIO_NORMAL) {
return true;
}
DWORD p = NORMAL_PRIORITY_CLASS;
switch (prio) {
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
}
if (!SetPriorityClass(GetCurrentProcess(), p)) {
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
return false;
}
return true;
}
#else // macOS and POSIX
#include <sys/types.h>
#include <sys/resource.h>
bool set_process_priority(enum ggml_sched_priority prio) {
if (prio == GGML_SCHED_PRIO_NORMAL) {
return true;
}
int p = 0;
switch (prio) {
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
case GGML_SCHED_PRIO_HIGH: p = -10; break;
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
}
// setpriority() returns 0 on success and -1 on error
if (setpriority(PRIO_PROCESS, 0, p) != 0) {
fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
return false;
}
return true;
}
#endif
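
For context, the POSIX branch boils down to a single setpriority() call; a minimal standalone sketch of just that call (the demo main() and the fixed nice value are illustrative, not part of this commit):

#include <cerrno>
#include <cstdio>
#include <cstring>
#include <sys/resource.h>

int main() {
    const int nice_val = -10; // the value the helper above uses for GGML_SCHED_PRIO_HIGH
    // setpriority() returns 0 on success, -1 on error; raising priority
    // (negative nice) typically requires elevated privileges.
    if (setpriority(PRIO_PROCESS, 0, nice_val) != 0) {
        fprintf(stderr, "setpriority(%d) failed: %s\n", nice_val, strerror(errno));
        return 1;
    }
    // robust callers clear errno before getpriority(), since -1 is a valid nice value
    printf("nice value is now %d\n", getpriority(PRIO_PROCESS, 0));
    return 0;
}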
//
// CLI argument parsing
//
@@ -278,6 +329,30 @@ void gpt_params_handle_model_default(gpt_params & params) {
}
}
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
int32_t n_set = 0;
if (cpuparams.n_threads < 0) {
// Assuming everything about cpuparams is invalid
if (role_model != nullptr) {
cpuparams = *role_model;
} else {
cpuparams.n_threads = cpu_get_num_math();
}
}
for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
if (cpuparams.cpumask[i]) {
n_set++;
}
}
if (n_set && n_set < cpuparams.n_threads) {
// Not enough set bits, may experience performance issues.
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
}
}
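
The rule above: a negative n_threads marks the whole struct as unset, so it is copied wholesale from its "role model" (cpuparams_batch inherits from cpuparams, and so on); otherwise only the mask popcount is sanity-checked against the thread count. A toy sketch of the inheritance rule, using a simplified stand-in for the cpu_params struct from common.h:

#include <cstdio>

struct cpu_params_sketch {          // stand-in, not the common.h declaration
    int  n_threads = -1;            // -1 means "unset, inherit from role model"
    bool cpumask[512] = {};         // GGML_MAX_N_THREADS-sized affinity mask
};

int main() {
    cpu_params_sketch gen;   gen.n_threads = 8; // set explicitly, e.g. via -t 8
    cpu_params_sketch batch;                    // left unset
    if (batch.n_threads < 0) {
        batch = gen;                            // same copy as postprocess_cpu_params
    }
    printf("batch threads = %d\n", batch.n_threads); // prints 8
    return 0;
}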
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
@@ -297,6 +372,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
}
postprocess_cpu_params(params.cpuparams, nullptr);
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
@@ -328,7 +408,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
void gpt_params_parse_from_env(gpt_params & params) {
// we only care about server-related params for now
get_env("LLAMA_ARG_MODEL", params.model);
get_env("LLAMA_ARG_THREADS", params.n_threads);
get_env("LLAMA_ARG_MODEL_URL", params.model_url);
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
get_env("LLAMA_ARG_BATCH", params.n_batch);
@@ -342,6 +426,9 @@ void gpt_params_parse_from_env(gpt_params & params) {
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
get_env("LLAMA_ARG_HOST", params.hostname);
get_env("LLAMA_ARG_PORT", params.port);
}
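
A hypothetical smoke test for the env-var plumbing, assuming POSIX setenv() and compilation in the same translation unit as the static get_env() helpers used above:

#include <cassert>
#include <cstdlib>

int main() {
    setenv("LLAMA_ARG_THREADS", "4", /*overwrite=*/1);
    gpt_params params;
    gpt_params_parse_from_env(params);
    assert(params.cpuparams.n_threads == 4); // now routed into cpuparams
    return 0;
}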
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
@@ -362,6 +449,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return true;
}
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
size_t dash_loc = range.find('-');
if (dash_loc == std::string::npos) {
fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
return false;
}
size_t start_i;
size_t end_i;
if (dash_loc == 0) {
start_i = 0;
} else {
start_i = std::stoull(range.substr(0, dash_loc));
if (start_i >= GGML_MAX_N_THREADS) {
fprintf(stderr, "Start index out of bounds!\n");
return false;
}
}
if (dash_loc == range.length() - 1) {
end_i = GGML_MAX_N_THREADS - 1;
} else {
end_i = std::stoull(range.substr(dash_loc + 1));
if (end_i >= GGML_MAX_N_THREADS) {
fprintf(stderr, "End index out of bounds!\n");
return false;
}
}
for (size_t i = start_i; i <= end_i; i++) {
boolmask[i] = true;
}
return true;
}
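
Accepted forms are "lo-hi", "lo-" (to the last CPU index) and "-hi" (from CPU 0). A quick illustrative check, assuming it is compiled together with the parser above (GGML_MAX_N_THREADS comes from ggml.h):

#include <cstdio>

int main() {
    bool mask[GGML_MAX_N_THREADS] = {};
    parse_cpu_range("0-7", mask);  // sets CPUs 0..7
    parse_cpu_range("12-", mask);  // sets 12 through GGML_MAX_N_THREADS-1
    int n = 0;
    for (bool b : mask) n += b;
    printf("CPUs selected: %d\n", n);
    return 0;
}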
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
// Discard potential 0x prefix
size_t start_i = 0;
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
start_i = 2;
}
size_t num_digits = mask.length() - start_i;
if (num_digits > 128) num_digits = 128;
size_t end_i = num_digits + start_i;
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
char c = mask.at(i);
int8_t id = c;
if ((c >= '0' && c <= '9')) {
id -= '0';
} else if (c >= 'a' && c <= 'f') {
id -= 'a' - 10;
} else if (c >= 'A' && c <= 'F') {
id -= 'A' - 10;
} else {
fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
return false;
}
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
}
return true;
}
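
Each hex digit maps to four mask entries, most significant nibble first, so "0x1" selects only CPU 0 while "0x80" selects CPU 7. An illustrative check under the same same-translation-unit assumption as above:

#include <cstdio>

int main() {
    bool mask[GGML_MAX_N_THREADS] = {};
    parse_cpu_mask("0xFF", mask); // CPUs 0..7
    parse_cpu_mask("0x1",  mask); // CPU 0 only (ORed into the same mask)
    for (int i = 0; i < 8; i++) {
        printf("cpu %d: %d\n", i, mask[i] ? 1 : 0);
    }
    return 0;
}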
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
@@ -378,36 +538,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "-t" || arg == "--threads") {
CHECK_ARG
-params.n_threads = std::stoi(argv[i]);
-if (params.n_threads <= 0) {
-params.n_threads = std::thread::hardware_concurrency();
+params.cpuparams.n_threads = std::stoi(argv[i]);
+if (params.cpuparams.n_threads <= 0) {
+params.cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-C" || arg == "--cpu-mask") {
CHECK_ARG
std::string mask = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
return true;
}
if (arg == "-Cr" || arg == "--cpu-range") {
CHECK_ARG
std::string range = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
return true;
}
if (arg == "--prio") {
CHECK_ARG
params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict") {
CHECK_ARG
params.cpuparams.strict_cpu = std::stoul(argv[i]);
return true;
}
if (arg == "--poll") {
CHECK_ARG
params.cpuparams.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-tb" || arg == "--threads-batch") {
CHECK_ARG
-params.n_threads_batch = std::stoi(argv[i]);
-if (params.n_threads_batch <= 0) {
-params.n_threads_batch = std::thread::hardware_concurrency();
+params.cpuparams_batch.n_threads = std::stoi(argv[i]);
+if (params.cpuparams_batch.n_threads <= 0) {
+params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Cb" || arg == "--cpu-mask-batch") {
CHECK_ARG
std::string mask = argv[i];
params.cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
return true;
}
if (arg == "-Crb" || arg == "--cpu-range_batch") {
CHECK_ARG
std::string range = argv[i];
params.cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
return true;
}
if (arg == "--prio-batch") {
CHECK_ARG
params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-batch") {
params.cpuparams_batch.strict_cpu = true;
return true;
}
if (arg == "--poll-batch") {
CHECK_ARG
params.cpuparams_batch.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-td" || arg == "--threads-draft") {
CHECK_ARG
-params.n_threads_draft = std::stoi(argv[i]);
-if (params.n_threads_draft <= 0) {
-params.n_threads_draft = std::thread::hardware_concurrency();
+params.draft_cpuparams.n_threads = std::stoi(argv[i]);
+if (params.draft_cpuparams.n_threads <= 0) {
+params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Cd" || arg == "--cpu-mask-draft") {
CHECK_ARG
std::string mask = argv[i];
params.draft_cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
return true;
}
if (arg == "-Crd" || arg == "--cpu-range-draft") {
CHECK_ARG
std::string range = argv[i];
params.draft_cpuparams.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
return true;
}
if (arg == "--prio-draft") {
CHECK_ARG
params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-draft") {
params.draft_cpuparams.strict_cpu = true;
return true;
}
if (arg == "--poll-draft") {
CHECK_ARG
params.draft_cpuparams.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-tbd" || arg == "--threads-batch-draft") {
CHECK_ARG
-params.n_threads_batch_draft = std::stoi(argv[i]);
-if (params.n_threads_batch_draft <= 0) {
-params.n_threads_batch_draft = std::thread::hardware_concurrency();
+params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
+if (params.draft_cpuparams_batch.n_threads <= 0) {
+params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
CHECK_ARG
std::string range = argv[i];
params.draft_cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
return true;
}
if (arg == "--prio-batch-draft") {
CHECK_ARG
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-batch-draft") {
params.draft_cpuparams_batch.strict_cpu = true;
return true;
}
if (arg == "--poll-batch-draft") {
CHECK_ARG
params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
return true;
}
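
Taken together these flags let each stage be pinned independently, e.g. generation on CPUs 0-7 at raised priority and batch processing on CPUs 8-15. A hedged illustration that drives the parser programmatically (binary name and values are placeholders; the const_cast is demo-only):

int main() {
    const char * demo[] = {
        "llama-cli",
        "-t", "8",  "-Cr",  "0-7",  "--prio", "2", // generation: CPUs 0-7, high prio
        "-tb", "8", "-Crb", "8-15",                // batch: CPUs 8-15
    };
    int argc = (int) (sizeof(demo) / sizeof(demo[0]));
    gpt_params params;
    return gpt_params_parse(argc, const_cast<char **>(demo), params) ? 0 : 1;
}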
if (arg == "-p" || arg == "--prompt") {
CHECK_ARG
params.prompt = argv[i];
@@ -1492,11 +1758,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
#ifndef GGML_USE_OPENMP
// these options are available only with the internal threadpool
options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"});
options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"});
options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>",
"Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
#endif // GGML_USE_OPENMP
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
@@ -1768,7 +2063,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
@@ -1800,9 +2094,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os;
os << "system_info: n_threads = " << params.n_threads;
if (params.n_threads_batch != -1) {
os << " (n_threads_batch = " << params.n_threads_batch << ")";
os << "system_info: n_threads = " << params.cpuparams.n_threads;
if (params.cpuparams_batch.n_threads != -1) {
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
}
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64
@@ -2326,8 +2620,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;
cparams.n_ubatch = params.n_ubatch;
-cparams.n_threads = params.n_threads;
-cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+cparams.n_threads = params.cpuparams.n_threads;
+cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
+params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
@@ -2353,6 +2648,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
return cparams;
}
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
struct ggml_threadpool_params tpp;
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
if (params.mask_valid) {
std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
}
tpp.prio = params.priority;
tpp.poll = params.poll;
tpp.strict_cpu = params.strict_cpu;
return tpp;
}
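
Downstream, these params feed the threadpool API that this sync brings in; a minimal sketch, assuming ggml_threadpool_new/ggml_threadpool_free and llama_attach_threadpool as introduced alongside this change:

static void attach_threadpools(struct llama_context * ctx, const gpt_params & params) {
    struct ggml_threadpool_params tpp   = ggml_threadpool_params_from_cpu_params(params.cpuparams);
    struct ggml_threadpool_params tpp_b = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
    struct ggml_threadpool * tp   = ggml_threadpool_new(&tpp);
    struct ggml_threadpool * tp_b = ggml_threadpool_new(&tpp_b);
    llama_attach_threadpool(ctx, tp, tp_b);
    // the caller runs llama_decode(...) and later releases both pools with
    // ggml_threadpool_free() once the context no longer uses them
}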
#ifdef LLAMA_USE_CURL
static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -3342,7 +3653,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);