Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/llama-cli-cuda.Dockerfile
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-intel.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	.github/workflows/docker.yml
#	docs/docker.md
#	examples/llama-bench/llama-bench.cpp
#	flake.lock
#	ggml/include/ggml.h
#	ggml/src/CMakeLists.txt
#	scripts/sync-ggml.last
#	src/llama.cpp
#	tests/test-backend-ops.cpp
#	tests/test-grad0.cpp
#	tests/test-rope.cpp
This commit is contained in:
Concedo 2024-08-30 10:37:39 +08:00
commit d220495dd4
42 changed files with 100585 additions and 99448 deletions

View file

@ -252,6 +252,57 @@ int32_t cpu_get_num_math() {
return cpu_get_num_physical_cores();
}
// Helper for setting process priority
#if defined(_WIN32)
bool set_process_priority(enum ggml_sched_priority prio) {
if (prio == GGML_SCHED_PRIO_NORMAL) {
return true;
}
DWORD p = NORMAL_PRIORITY_CLASS;
switch (prio) {
case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
}
if (!SetPriorityClass(GetCurrentProcess(), p)) {
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
return false;
}
return true;
}
#else // MacOS and POSIX
#include <sys/types.h>
#include <sys/resource.h>
bool set_process_priority(enum ggml_sched_priority prio) {
if (prio == GGML_SCHED_PRIO_NORMAL) {
return true;
}
int p = 0;
switch (prio) {
case GGML_SCHED_PRIO_NORMAL: p = 0; break;
case GGML_SCHED_PRIO_MEDIUM: p = -5; break;
case GGML_SCHED_PRIO_HIGH: p = -10; break;
case GGML_SCHED_PRIO_REALTIME: p = -20; break;
}
if (!setpriority(PRIO_PROCESS, 0, p)) {
fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
return false;
}
return true;
}
#endif
//
// CLI argument parsing
//
@ -278,6 +329,30 @@ void gpt_params_handle_model_default(gpt_params & params) {
}
}
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
int32_t n_set = 0;
if (cpuparams.n_threads < 0) {
// Assuming everything about cpuparams is invalid
if (role_model != nullptr) {
cpuparams = *role_model;
} else {
cpuparams.n_threads = cpu_get_num_math();
}
}
for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
if (cpuparams.cpumask[i]) {
n_set++;
}
}
if (n_set && n_set < cpuparams.n_threads) {
// Not enough set bits, may experience performance issues.
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
}
}
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
@ -297,6 +372,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
}
postprocess_cpu_params(params.cpuparams, nullptr);
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
@ -328,7 +408,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
void gpt_params_parse_from_env(gpt_params & params) {
// we only care about server-related params for now
get_env("LLAMA_ARG_MODEL", params.model);
get_env("LLAMA_ARG_THREADS", params.n_threads);
get_env("LLAMA_ARG_MODEL_URL", params.model_url);
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
get_env("LLAMA_ARG_BATCH", params.n_batch);
@ -342,6 +426,9 @@ void gpt_params_parse_from_env(gpt_params & params) {
get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
get_env("LLAMA_ARG_HOST", params.hostname);
get_env("LLAMA_ARG_PORT", params.port);
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
@ -362,6 +449,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return true;
}
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
size_t dash_loc = range.find('-');
if (dash_loc == std::string::npos) {
fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
return false;
}
size_t start_i;
size_t end_i;
if (dash_loc == 0) {
start_i = 0;
} else {
start_i = std::stoull(range.substr(0, dash_loc));
if (start_i >= GGML_MAX_N_THREADS) {
fprintf(stderr, "Start index out of bounds!\n");
return false;
}
}
if (dash_loc == range.length() - 1) {
end_i = GGML_MAX_N_THREADS - 1;
} else {
end_i = std::stoull(range.substr(dash_loc + 1));
if (end_i >= GGML_MAX_N_THREADS) {
fprintf(stderr, "End index out of bounds!\n");
return false;
}
}
for (size_t i = start_i; i <= end_i; i++) {
boolmask[i] = true;
}
return true;
}
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
// Discard potential 0x prefix
size_t start_i = 0;
if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
start_i = 2;
}
size_t num_digits = mask.length() - start_i;
if (num_digits > 128) num_digits = 128;
size_t end_i = num_digits + start_i;
for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
char c = mask.at(i);
int8_t id = c;
if ((c >= '0' && c <= '9')) {
id -= '0';
} else if (c >= 'a' && c <= 'f') {
id -= 'a' - 10;
} else if (c >= 'A' && c <= 'F') {
id -= 'A' - 10;
} else {
fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
return false;
}
boolmask[ n ] = boolmask[ n ] || ((id & 8) != 0);
boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
}
return true;
}
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
@ -378,36 +538,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "-t" || arg == "--threads") {
CHECK_ARG
params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
params.cpuparams.n_threads = std::stoi(argv[i]);
if (params.cpuparams.n_threads <= 0) {
params.cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-C" || arg == "--cpu-mask") {
CHECK_ARG
std::string mask = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
return true;
}
if (arg == "-Cr" || arg == "--cpu-range") {
CHECK_ARG
std::string range = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
return true;
}
if (arg == "--prio") {
CHECK_ARG
params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict") {
CHECK_ARG
params.cpuparams.strict_cpu = std::stoul(argv[i]);
return true;
}
if (arg == "--poll") {
CHECK_ARG
params.cpuparams.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-tb" || arg == "--threads-batch") {
CHECK_ARG
params.n_threads_batch = std::stoi(argv[i]);
if (params.n_threads_batch <= 0) {
params.n_threads_batch = std::thread::hardware_concurrency();
params.cpuparams_batch.n_threads = std::stoi(argv[i]);
if (params.cpuparams_batch.n_threads <= 0) {
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Cb" || arg == "--cpu-mask-batch") {
CHECK_ARG
std::string mask = argv[i];
params.cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
return true;
}
if (arg == "-Crb" || arg == "--cpu-range_batch") {
CHECK_ARG
std::string range = argv[i];
params.cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
return true;
}
if (arg == "--prio-batch") {
CHECK_ARG
params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-batch") {
params.cpuparams_batch.strict_cpu = true;
return true;
}
if (arg == "--poll-batch") {
CHECK_ARG
params.cpuparams_batch.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-td" || arg == "--threads-draft") {
CHECK_ARG
params.n_threads_draft = std::stoi(argv[i]);
if (params.n_threads_draft <= 0) {
params.n_threads_draft = std::thread::hardware_concurrency();
params.draft_cpuparams.n_threads = std::stoi(argv[i]);
if (params.draft_cpuparams.n_threads <= 0) {
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Cd" || arg == "--cpu-mask-draft") {
CHECK_ARG
std::string mask = argv[i];
params.draft_cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
return true;
}
if (arg == "-Crd" || arg == "--cpu-range-draft") {
CHECK_ARG
std::string range = argv[i];
params.draft_cpuparams.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
return true;
}
if (arg == "--prio-draft") {
CHECK_ARG
params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-draft") {
params.draft_cpuparams.strict_cpu = true;
return true;
}
if (arg == "--poll-draft") {
CHECK_ARG
params.draft_cpuparams.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-tbd" || arg == "--threads-batch-draft") {
CHECK_ARG
params.n_threads_batch_draft = std::stoi(argv[i]);
if (params.n_threads_batch_draft <= 0) {
params.n_threads_batch_draft = std::thread::hardware_concurrency();
params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
if (params.draft_cpuparams_batch.n_threads <= 0) {
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
CHECK_ARG
std::string range = argv[i];
params.draft_cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
return true;
}
if (arg == "--prio-batch-draft") {
CHECK_ARG
params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-batch-draft") {
params.draft_cpuparams_batch.strict_cpu = true;
return true;
}
if (arg == "--poll-batch-draft") {
CHECK_ARG
params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
return true;
}
if (arg == "-p" || arg == "--prompt") {
CHECK_ARG
params.prompt = argv[i];
@ -1492,11 +1758,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
#ifndef GGML_USE_OPENMP
// these options are available only with the internal threadpool
options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"});
options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"});
options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>",
"Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
#endif // GGML_USE_OPENMP
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
@ -1768,7 +2063,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
@ -1800,9 +2094,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os;
os << "system_info: n_threads = " << params.n_threads;
if (params.n_threads_batch != -1) {
os << " (n_threads_batch = " << params.n_threads_batch << ")";
os << "system_info: n_threads = " << params.cpuparams.n_threads;
if (params.cpuparams_batch.n_threads != -1) {
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
}
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64
@ -2326,8 +2620,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;
cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
@ -2353,6 +2648,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
return cparams;
}
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
struct ggml_threadpool_params tpp;
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
if (params.mask_valid) {
std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
}
tpp.prio = params.priority;
tpp.poll = params.poll;
tpp.strict_cpu = params.strict_cpu;
return tpp;
}
#ifdef LLAMA_USE_CURL
static bool starts_with(const std::string & str, const std::string & prefix) {
@ -3342,7 +3653,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);

View file

@ -63,13 +63,18 @@ enum dimre_method {
DIMRE_METHOD_MEAN,
};
struct cpu_params {
int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
};
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@ -96,6 +101,11 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
struct cpu_params draft_cpuparams;
struct cpu_params draft_cpuparams_batch;
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
@ -228,7 +238,7 @@ struct gpt_params {
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1";
std::string public_path = "";
@ -301,6 +311,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
//
// String utils
//
@ -352,7 +367,8 @@ struct llama_init_result {
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

View file

@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);

View file

@ -22,7 +22,7 @@
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
@ -55,7 +55,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
struct benchmark_params_struct {
int32_t n_threads = 1;
int n_threads = 1;
int32_t n_iterations = 10;
};

View file

@ -486,7 +486,7 @@ int main(int argc, char ** argv) {
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_threads = params.cpuparams.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);

View file

@ -410,7 +410,7 @@ int main(int argc, char ** argv) {
g_verbose = (params.verbosity == 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());

File diff suppressed because it is too large Load diff

View file

@ -71,8 +71,8 @@ actor LlamaContext {
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)
ctx_params.n_threads = Int32(n_threads)
ctx_params.n_threads_batch = Int32(n_threads)
let context = llama_new_context_with_model(model, ctx_params)
guard let context else {

View file

@ -15,8 +15,8 @@ cd llama.cpp
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
```bash
python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version

View file

@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;

View file

@ -180,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;

View file

@ -222,6 +222,40 @@ int main(int argc, char ** argv) {
return 1;
}
LOG("%s: llama threadpool init = n_threads = %d\n",
__func__,
(int) params.cpuparams.n_threads
);
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams);
set_process_priority(params.cpuparams.priority);
struct ggml_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch);
if (!threadpool_batch) {
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
exit(1);
}
// Start the non-batch threadpool in the paused state
tpp.paused = true;
}
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
if (ctx_guidance) {
llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
@ -990,6 +1024,9 @@ int main(int argc, char ** argv) {
llama_sampling_free(ctx_sampling);
llama_backend_free();
ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch);
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS

View file

@ -249,23 +249,49 @@ logging:
Available environment variables (if specified, these variables will override parameters specified in arguments):
- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
- `LLAMA_ARG_MODEL`
- `LLAMA_ARG_THREADS`
- `LLAMA_ARG_CTX_SIZE`
- `LLAMA_ARG_N_PARALLEL`
- `LLAMA_ARG_BATCH`
- `LLAMA_ARG_UBATCH`
- `LLAMA_ARG_N_GPU_LAYERS`
- `LLAMA_ARG_THREADS_HTTP`
- `LLAMA_ARG_CHAT_TEMPLATE`
- `LLAMA_ARG_N_PREDICT`
- `LLAMA_ARG_ENDPOINT_METRICS`
- `LLAMA_ARG_ENDPOINT_SLOTS`
- `LLAMA_ARG_EMBEDDINGS`
- `LLAMA_ARG_FLASH_ATTN`
- `LLAMA_ARG_DEFRAG_THOLD`
- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
- `LLAMA_ARG_MODEL`: equivalent to `-m`
- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
- `LLAMA_ARG_THREADS`: equivalent to `-t`
- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
- `LLAMA_ARG_BATCH`: equivalent to `-b`
- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
- `LLAMA_ARG_HOST`: equivalent to `--host`
- `LLAMA_ARG_PORT`: equivalent to `--port`
Example usage of docker compose with environment variables:
```yml
services:
llamacpp-server:
image: ghcr.io/ggerganov/llama.cpp:server
ports:
- 8080:8080
volumes:
- ./models:/models
environment:
# alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
LLAMA_ARG_MODEL: /models/my_model.gguf
LLAMA_ARG_CTX_SIZE: 4096
LLAMA_ARG_N_PARALLEL: 2
LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0
LLAMA_ARG_PORT: 8080
```
## Build

View file

@ -2535,8 +2535,8 @@ int main(int argc, char ** argv) {
});
LOG_INFO("system info", {
{"n_threads", params.n_threads},
{"n_threads_batch", params.n_threads_batch},
{"n_threads", params.cpuparams.n_threads},
{"n_threads_batch", params.cpuparams_batch.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});
@ -2573,7 +2573,7 @@ int main(int argc, char ** argv) {
auto res_error = [](httplib::Response & res, json error_data) {
json final_response {{"error", error_data}};
res.set_content(final_response.dump(), MIMETYPE_JSON);
res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
res.status = json_value(error_data, "code", 500);
};

View file

@ -75,10 +75,11 @@ int main(int argc, char ** argv) {
// load the draft model
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
if (params.n_threads_draft > 0) {
params.n_threads = params.n_threads_draft;
if (params.draft_cpuparams.n_threads > 0) {
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
}
params.n_threads_batch = params.n_threads_batch_draft;
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
model_dft = llama_init_dft.model;
ctx_dft = llama_init_dft.context;

View file

@ -63,6 +63,7 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset of the tensor data for setting/getting data
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
@ -102,6 +103,7 @@ extern "C" {
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer

View file

@ -220,7 +220,7 @@
#include <stdio.h>
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
#define GGML_FILE_VERSION 2
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
@ -231,6 +231,8 @@
#define GGML_MAX_SRC 10
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 128
#define GGML_MAX_N_THREADS 512
#endif
#define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4
@ -459,6 +461,8 @@ extern "C" {
GGML_OP_SQR,
GGML_OP_SQRT,
GGML_OP_LOG,
GGML_OP_SIN,
GGML_OP_COS,
GGML_OP_SUM,
GGML_OP_SUM_ROWS,
GGML_OP_MEAN,
@ -496,9 +500,11 @@ extern "C" {
GGML_OP_CLAMP,
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_ARANGE,
@ -630,6 +636,29 @@ extern "C" {
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
@ -637,6 +666,7 @@ extern "C" {
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
@ -975,6 +1005,22 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return scalar
GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx,
@ -1572,34 +1618,49 @@ extern "C" {
float min,
float max);
// im2col
// converts data into a format that effectively results in a convolution when combined with matrix multiplication
GGML_API struct ggml_tensor * ggml_im2col(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1, // dilation dimension 1
bool is_2D,
enum ggml_type dst_type);
GGML_API struct ggml_tensor * ggml_im2col_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // gradient of im2col output
int64_t * ne, // shape of im2col input
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1, // dilation dimension 1
bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride
int p0, // padding
int d0); // dilation
@ -1608,29 +1669,29 @@ extern "C" {
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s,
int d);
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s, // stride
int d); // dilation
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int p0,
int d0);
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride
int p0, // padding
int d0); // dilation
GGML_API struct ggml_tensor * ggml_conv_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
// kernel size is a->ne[0] x a->ne[1]
@ -1692,6 +1753,18 @@ extern "C" {
float p0,
float p1);
GGML_API struct ggml_tensor * ggml_pool_2d_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * af, // "a"/input used in forward pass
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
float p0,
float p1);
// nearest interpolate
// multiplies ne0 and ne1 by scale factor
// used in stable-diffusion
@ -2016,10 +2089,23 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

View file

@ -723,6 +723,8 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
struct ggml_backend_cpu_context {
int n_threads;
ggml_threadpool_t threadpool;
void * work_data;
size_t work_size;
@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) {
@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
free(cpu_ctx->work_data);
@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->threadpool = NULL;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
@ -903,6 +906,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
ctx->n_threads = n_threads;
}
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
if (ctx->threadpool && ctx->threadpool != threadpool) {
// already had a different threadpool, pause/suspend it before switching
ggml_threadpool_pause(ctx->threadpool);
}
ctx->threadpool = threadpool;
}
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

View file

@ -11,8 +11,10 @@ bool g_mul_mat_q = false;
#include "ggml-cuda/binbcast.cuh"
#include "ggml-cuda/clamp.cuh"
#include "ggml-cuda/concat.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include "ggml-cuda/convert.cuh"
#include "ggml-cuda/cpy.cuh"
#include "ggml-cuda/cross-entropy-loss.cuh"
#include "ggml-cuda/diagmask.cuh"
#include "ggml-cuda/dmmv.cuh"
#include "ggml-cuda/fattn.cuh"
@ -31,7 +33,6 @@ bool g_mul_mat_q = false;
#include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include <algorithm>
#include <array>
@ -2185,6 +2186,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ADD:
ggml_cuda_op_add(ctx, dst);
break;
case GGML_OP_SUB:
ggml_cuda_op_sub(ctx, dst);
break;
case GGML_OP_ACC:
ggml_cuda_op_acc(ctx, dst);
break;
@ -2271,6 +2275,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SQRT:
ggml_cuda_op_sqrt(ctx, dst);
break;
case GGML_OP_SIN:
ggml_cuda_op_sin(ctx, dst);
break;
case GGML_OP_COS:
ggml_cuda_op_cos(ctx, dst);
break;
case GGML_OP_CLAMP:
ggml_cuda_op_clamp(ctx, dst);
break;
@ -2307,6 +2317,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_FLASH_ATTN_EXT:
ggml_cuda_flash_attn_ext(ctx, dst);
break;
case GGML_OP_CROSS_ENTROPY_LOSS:
ggml_cuda_cross_entropy_loss(ctx, dst);
break;
default:
return false;
}
@ -2614,6 +2627,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) {
assert(node->src[j]->buffer);
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
}
}
@ -2857,12 +2871,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_TRANSPOSE:
case GGML_OP_NORM:
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
@ -2894,6 +2911,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
}
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
case GGML_OP_CROSS_ENTROPY_LOSS:
return true;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
default:
return false;

View file

@ -9,6 +9,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
return a + b;
}
static __device__ __forceinline__ float op_sub(const float a, const float b) {
return a - b;
}
static __device__ __forceinline__ float op_mul(const float a, const float b) {
return a * b;
}
@ -271,6 +275,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
}

View file

@ -2,5 +2,6 @@
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -0,0 +1,106 @@
#include "common.cuh"
#include "cross-entropy-loss.cuh"
#include "sumrows.cuh"
#include <cmath>
#include <cstdint>
static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) {
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE;
const int ne_tmp = WARP_SIZE*nclasses;
extern __shared__ float tmp_all[];
float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;
// Each warp first loads ne_tmp logits/labels into shared memory:
for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
const int ig = i0*nclasses + i; // ig == i global
tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
}
// Each thread in the warp then calculates the cross entropy loss for a single row.
// TODO: pad in order to avoid shared memory bank conflicts.
// Find maximum for softmax:
float max = -INFINITY;
for (int i = 0; i < nclasses; ++i) {
max = fmaxf(max, tmp_logits[lane_id*nclasses + i]);
}
// Calculate log(softmax(logits)) which is just logits - max:
float sum = 0.0f;
for (int i = 0; i < nclasses; ++i) {
float val = tmp_logits[lane_id*nclasses + i] - max;
sum += expf(val);
tmp_logits[lane_id*nclasses + i] = val;
}
sum = logf(sum);
// log(exp(logits - max) / sum) = (logits - max) - log(sum)
float loss = 0.0f;
for (int i = 0; i < nclasses; ++i) {
loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i];
}
loss = -warp_reduce_sum(loss) / (float)k;
__syncthreads();
if (lane_id == 0) {
tmp_all[warp_id] = loss;
}
__syncthreads();
if (warp_id != 0) {
return;
}
loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
loss = warp_reduce_sum(loss);
if (lane_id != 0) {
return;
}
dst[blockIdx.x] = loss;
}
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_contiguous(dst));
const int64_t ne00 = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();
const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float);
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
// Combine results from individual blocks:
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
}

View file

@ -0,0 +1,5 @@
#include "common.cuh"
#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -16,7 +16,7 @@ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int nc
}
}
static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
@ -32,7 +32,6 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);

View file

@ -1,3 +1,5 @@
#include "common.cuh"
void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -101,6 +101,24 @@ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
dst[i] = sqrtf(x[i]);
}
static __global__ void sin_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = sinf(x[i]);
}
static __global__ void cos_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = cosf(x[i]);
}
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@ -156,6 +174,16 @@ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_
sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
@ -312,3 +340,31 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}

View file

@ -9,6 +9,8 @@
#define CUDA_HARDSWISH_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256
#define CUDA_SQRT_BLOCK_SIZE 256
#define CUDA_SIN_BLOCK_SIZE 256
#define CUDA_COS_BLOCK_SIZE 256
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@ -31,3 +33,7 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -31,6 +31,8 @@ struct ggml_metal_kernel {
enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_ADD,
GGML_METAL_KERNEL_TYPE_ADD_ROW,
GGML_METAL_KERNEL_TYPE_SUB,
GGML_METAL_KERNEL_TYPE_SUB_ROW,
GGML_METAL_KERNEL_TYPE_MUL,
GGML_METAL_KERNEL_TYPE_MUL_ROW,
GGML_METAL_KERNEL_TYPE_DIV,
@ -207,6 +209,9 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
GGML_METAL_KERNEL_TYPE_CONCAT,
GGML_METAL_KERNEL_TYPE_SQR,
GGML_METAL_KERNEL_TYPE_SQRT,
GGML_METAL_KERNEL_TYPE_SIN,
GGML_METAL_KERNEL_TYPE_COS,
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
GGML_METAL_KERNEL_TYPE_COUNT
@ -493,6 +498,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB, sub, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
@ -669,6 +676,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
}
@ -769,15 +779,20 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
case GGML_OP_PERMUTE:
case GGML_OP_CONCAT:
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_ACC:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_REPEAT:
case GGML_OP_SCALE:
case GGML_OP_CLAMP:
case GGML_OP_SQR:
case GGML_OP_SUM_ROWS:
return true;
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_SIN:
case GGML_OP_COS:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_SUM_ROWS:
case GGML_OP_SOFT_MAX:
case GGML_OP_RMS_NORM:
case GGML_OP_GROUP_NORM:
@ -1057,6 +1072,7 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_MUL:
case GGML_OP_DIV:
{
@ -1080,6 +1096,7 @@ static enum ggml_status ggml_metal_graph_compute(
nb = ne00 / 4;
switch (dst->op) {
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
default: GGML_ABORT("fatal error");
@ -1089,6 +1106,7 @@ static enum ggml_status ggml_metal_graph_compute(
} else {
switch (dst->op) {
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
default: GGML_ABORT("fatal error");
@ -1416,6 +1434,48 @@ static enum ggml_status ggml_metal_graph_compute(
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SQRT:
{
GGML_ASSERT(ggml_is_contiguous(src0));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SIN:
{
GGML_ASSERT(ggml_is_contiguous(src0));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_COS:
{
GGML_ASSERT(ggml_is_contiguous(src0));
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_SUM_ROWS:

View file

@ -17,7 +17,7 @@ enum ggml_sort_order {
GGML_SORT_ORDER_DESC,
};
// general-purpose kernel for addition, multiplication and division of two tensors
// general-purpose kernel for addition, subtraction, multiplication and division of two tensors
// pros: works for non-contiguous tensors, supports broadcast across all dims
// cons: not very efficient
kernel void kernel_add(
@ -70,6 +70,56 @@ kernel void kernel_add(
}
}
kernel void kernel_sub(
device const char * src0,
device const char * src1,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne10,
constant int64_t & ne11,
constant int64_t & ne12,
constant int64_t & ne13,
constant uint64_t & nb10,
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant int64_t & offs,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig.z;
const int64_t i02 = tgpig.y;
const int64_t i01 = tgpig.x;
const int64_t i13 = i03 % ne13;
const int64_t i12 = i02 % ne12;
const int64_t i11 = i01 % ne11;
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + offs;
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
const int i10 = i0 % ne10;
*((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) - *((device float *)(src1_ptr + i10*nb10));
}
}
kernel void kernel_mul(
device const char * src0,
device const char * src1,
@ -226,6 +276,15 @@ kernel void kernel_add_row(
dst[tpig] = src0[tpig] + src1[tpig % nb];
}
kernel void kernel_sub_row(
device const float4 * src0,
device const float4 * src1,
device float4 * dst,
constant uint64_t & nb [[buffer(28)]],
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] - src1[tpig % nb];
}
kernel void kernel_mul_row(
device const float4 * src0,
device const float4 * src1,
@ -358,6 +417,27 @@ kernel void kernel_sqr(
dst[tpig] = src0[tpig] * src0[tpig];
}
kernel void kernel_sqrt(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = sqrt(src0[tpig]);
}
kernel void kernel_sin(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = sin(src0[tpig]);
}
kernel void kernel_cos(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = cos(src0[tpig]);
}
kernel void kernel_sum_rows(
device const float * src0,
device float * dst,

View file

@ -3645,7 +3645,7 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
quantize_row_q8_K_ref(x, y, k);
}
//===================================== Dot ptoducts =================================
//===================================== Dot products =================================
//
// Helper functions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -188,6 +188,8 @@ struct vk_device_struct {
vk_pipeline pipeline_upscale_f32;
vk_pipeline pipeline_scale_f32;
vk_pipeline pipeline_sqr_f32;
vk_pipeline pipeline_sin_f32;
vk_pipeline pipeline_cos_f32;
vk_pipeline pipeline_clamp_f32;
vk_pipeline pipeline_pad_f32;
vk_pipeline pipeline_repeat_f32;
@ -1702,6 +1704,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@ -4023,6 +4027,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_sqr_f32;
}
return nullptr;
case GGML_OP_SIN:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_sin_f32;
}
return nullptr;
case GGML_OP_COS:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_cos_f32;
}
return nullptr;
case GGML_OP_CLAMP:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_clamp_f32;
@ -4171,6 +4185,8 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_REPEAT:
@ -4381,6 +4397,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
case GGML_OP_MUL:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_REPEAT:
@ -4598,6 +4616,32 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
}, dryrun);
}
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
});
}
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
float * op_params = (float *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type);
@ -5658,6 +5702,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_CPY:
@ -5735,6 +5781,14 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
case GGML_OP_SQR:
ggml_vk_sqr(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_SIN:
ggml_vk_sin(ctx, compute_ctx, src0, node);
break;
case GGML_OP_COS:
ggml_vk_cos(ctx, compute_ctx, src0, node);
break;
case GGML_OP_CLAMP:
ggml_vk_clamp(ctx, compute_ctx, src0, node, dryrun);
@ -5851,6 +5905,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_CPY:
@ -6582,6 +6638,8 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
case GGML_OP_UPSCALE:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_CONT:
@ -7024,6 +7082,10 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
tensor_clone = ggml_scale(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0]);
} else if (tensor->op == GGML_OP_SQR) {
tensor_clone = ggml_sqr(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_SIN) {
tensor_clone = ggml_sin(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_COS) {
tensor_clone = ggml_cos(ggml_ctx, src0_clone);
} else if (tensor->op == GGML_OP_CLAMP) {
tensor_clone = ggml_clamp(ggml_ctx, src0_clone, ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]);
} else if (tensor->op == GGML_OP_PAD) {

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,15 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
}

View file

@ -0,0 +1,15 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
}

View file

@ -397,6 +397,14 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));
tasks.push_back(std::async(std::launch::async, [] {
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}));

View file

@ -1173,8 +1173,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
file_format = in_file_format;
file_format_meta = in_file_format_meta;
kcpp_params->n_threads = inputs.threads;
kcpp_params->n_threads_batch = inputs.blasthreads;
kcpp_params->cpuparams.n_threads = inputs.threads;
kcpp_params->cpuparams_batch.n_threads = inputs.blasthreads;
bool isGguf = (file_format == FileFormat::GGUF_GENERIC);
kcpp_params->n_batch = GetBatchSize(inputs.blasbatchsize, in_file_format);
kcpp_params->n_ubatch = kcpp_params->n_batch;
@ -1283,7 +1283,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
int err = llama_v2_apply_lora_from_file(llama_ctx_v2,
lora_filename.c_str(),
lora_base_arg,
kcpp_params->n_threads);
kcpp_params->cpuparams.n_threads);
if (err != 0)
{
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
@ -1295,7 +1295,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
//determine mem per token
const std::vector<int> tmp = {1, 2, 3, 4};
llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, kcpp_params->n_threads);
llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, kcpp_params->cpuparams.n_threads);
return ModelLoadResult::SUCCESS;
}
else if(file_format == FileFormat::GGJT_3)
@ -1350,7 +1350,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
int err = llama_v3_apply_lora_from_file(llama_ctx_v3,
lora_filename.c_str(),
lora_base_arg,
kcpp_params->n_threads);
kcpp_params->cpuparams.n_threads);
if (err != 0)
{
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
@ -1362,7 +1362,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
//determine mem per token
const std::vector<int> tmp = {1, 2, 3, 4};
auto er = llama_v3_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, kcpp_params->n_threads);
auto er = llama_v3_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, kcpp_params->cpuparams.n_threads);
if(er!=0)
{
printf("\nLLAMA EVAL returned nonzero!\n");
@ -1424,8 +1424,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
llama_ctx_params.n_batch = kcpp_params->n_batch;
llama_ctx_params.n_ubatch = kcpp_params->n_ubatch;
llama_ctx_params.n_threads = kcpp_params->n_threads;
llama_ctx_params.n_threads_batch = kcpp_params->n_threads_batch;
llama_ctx_params.n_threads = kcpp_params->cpuparams.n_threads;
llama_ctx_params.n_threads_batch = kcpp_params->cpuparams_batch.n_threads;
#if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
bool ts_all_zero = true;
@ -1539,11 +1539,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
bool useWorldTokenizer = false;
if (file_format == FileFormat::RWKV_1)
{
rwkv_ctx_v2 = rwkv_v2_init_from_file(modelname.c_str(), kcpp_params->n_threads);
rwkv_ctx_v2 = rwkv_v2_init_from_file(modelname.c_str(), kcpp_params->cpuparams.n_threads);
}
else //rwkv_2
{
rwkv_ctx_v3 = rwkv_init_from_file(modelname.c_str(), kcpp_params->n_threads);
rwkv_ctx_v3 = rwkv_init_from_file(modelname.c_str(), kcpp_params->cpuparams.n_threads);
if(inputs.gpulayers>0)
{
@ -1622,7 +1622,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
rwkv_ctx_v3->logits_out = (float *)malloc(logitbufsiz);
rwkv_ctx_v3->state_in = nullptr;
bool testeval = rwkv_eval(rwkv_ctx_v3, kcpp_params->n_threads, 0, rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out);
bool testeval = rwkv_eval(rwkv_ctx_v3, kcpp_params->cpuparams.n_threads, 0, rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out);
if (!testeval)
{
printf("\nError: RWKV Init Eval Failed!\n");
@ -1654,7 +1654,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
n_vocab = gpt2_ctx_v1.hparams.n_vocab;
// determine the required inference memory per token:
legacy_gpt2_eval(gpt2_ctx_v1, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
legacy_gpt2_eval(gpt2_ctx_v1, kcpp_params->cpuparams.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
return ModelLoadResult::SUCCESS;
}
else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3 || file_format==FileFormat::GPT2_4)
@ -1676,7 +1676,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
n_vocab = gpt2_ctx_v3.hparams.n_vocab;
// determine the required inference memory per token:
gpt2_eval(gpt2_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, v3_use_scratch);
gpt2_eval(gpt2_ctx_v3, kcpp_params->cpuparams.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, v3_use_scratch);
return ModelLoadResult::SUCCESS;
}
else
@ -1699,7 +1699,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
n_vocab = gpt2_ctx_v2.hparams.n_vocab;
// determine the required inference memory per token:
gpt2_v2_eval(gpt2_ctx_v2, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
gpt2_v2_eval(gpt2_ctx_v2, kcpp_params->cpuparams.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
return ModelLoadResult::SUCCESS;
}
}
@ -1720,7 +1720,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
n_vocab = gptj_ctx_v1.hparams.n_vocab;
// determine the required inference memory per token:
legacy_gptj_eval(gptj_ctx_v1, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
legacy_gptj_eval(gptj_ctx_v1, kcpp_params->cpuparams.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
//if the logits are NAN or duplicated, it means the model is incompatible
if(logits.size()>0 && IsNanCheck(logits[0]))
@ -1751,14 +1751,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
n_vocab = gptj_ctx_v3.hparams.n_vocab;
// determine the required inference memory per token:
gptj_eval(gptj_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, v3_use_scratch);
gptj_eval(gptj_ctx_v3, kcpp_params->cpuparams.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, v3_use_scratch);
//if the logits are NAN or duplicated, it means the model is incompatible
std::vector<float> oldlogits(logits);
//this is another hack because they change the library - we run the eval through the model
//twice and compare logits. if they give the same logits for different inputs, model is broken
gptj_eval(gptj_ctx_v3, kcpp_params->n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, v3_use_scratch);
gptj_eval(gptj_ctx_v3, kcpp_params->cpuparams.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, v3_use_scratch);
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
{
@ -1789,14 +1789,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
n_vocab = gptj_ctx_v2.hparams.n_vocab;
// determine the required inference memory per token:
gptj_v2_eval(gptj_ctx_v2, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
gptj_v2_eval(gptj_ctx_v2, kcpp_params->cpuparams.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
//if the logits are NAN or duplicated, it means the model is incompatible
std::vector<float> oldlogits(logits);
//this is another hack because they change the library - we run the eval through the model
//twice and compare logits. if they give the same logits for different inputs, model is broken
gptj_v2_eval(gptj_ctx_v2, kcpp_params->n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
gptj_v2_eval(gptj_ctx_v2, kcpp_params->cpuparams.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
{
@ -1827,7 +1827,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
n_vocab = neox_ctx_v3.hparams.n_vocab;
// determine the required inference memory per token:
gpt_neox_eval(neox_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, v3_use_scratch);
gpt_neox_eval(neox_ctx_v3, kcpp_params->cpuparams.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, v3_use_scratch);
return ModelLoadResult::SUCCESS;
}
@ -1851,7 +1851,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
n_vocab = neox_ctx_v2.hparams.n_vocab;
// determine the required inference memory per token:
gpt_neox_v2_eval(neox_ctx_v2, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
gpt_neox_v2_eval(neox_ctx_v2, kcpp_params->cpuparams.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
{
@ -1859,7 +1859,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
auto orig_par_res = neox_ctx_v2.hparams.par_res;
neox_ctx_v2.hparams.par_res = 0; //test with residual false
gpt_neox_v2_eval(neox_ctx_v2, kcpp_params->n_threads, 0, test_embd, logits, mem_per_token);
gpt_neox_v2_eval(neox_ctx_v2, kcpp_params->cpuparams.n_threads, 0, test_embd, logits, mem_per_token);
neox_ctx_v2.hparams.par_res = orig_par_res;
int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
std::string predicted = vocab.id_to_token[topid].c_str();
@ -1888,7 +1888,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
n_vocab = mpt_ctx_v3.hparams.n_vocab;
// determine the required inference memory per token:
mpt_eval(mpt_ctx_v3, kcpp_params->n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, v3_use_scratch);
mpt_eval(mpt_ctx_v3, kcpp_params->cpuparams.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, v3_use_scratch);
return ModelLoadResult::SUCCESS;
}
else
@ -1966,10 +1966,10 @@ int GetThreadsToUse(bool blasmode)
}
else
{
return kcpp_params->n_threads_batch;
return kcpp_params->cpuparams_batch.n_threads;
}
}
return kcpp_params->n_threads;
return kcpp_params->cpuparams.n_threads;
}
generation_outputs gpttype_generate(const generation_inputs inputs)
@ -2263,7 +2263,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
else
{
llava_images[i].clp_image_tokens = 0;
if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_params->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_params->cpuparams.n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
printf("\nError: Clip image %d failed to create embd!",i);
}
if(debugmode==1)

View file

@ -304,8 +304,8 @@ extern "C" {
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
@ -428,6 +428,13 @@ extern "C" {
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
// Optional: an auto threadpool gets created in ggml if not passed explicitly
LLAMA_API void llama_attach_threadpool(
struct llama_context * ctx,
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);
@ -839,13 +846,13 @@ extern "C" {
// Set the number of threads used for decoding
// n_threads is the number of threads used for generation (single token)
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
// Get the number of threads used for generation of a single token.
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
// Get the number of threads used for prompt and batch processing (multiple token).
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not

View file

@ -163,7 +163,7 @@ static bool ggml_graph_compute_helper(
int n_threads,
ggml_abort_callback abort_callback,
void * abort_callback_data) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
plan.abort_callback = abort_callback;
plan.abort_callback_data = abort_callback_data;

View file

@ -2386,8 +2386,8 @@ struct llama_cparams {
uint32_t n_batch;
uint32_t n_ubatch;
uint32_t n_seq_max;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
int n_threads; // number of threads to use for generation
int n_threads_batch; // number of threads to use for batch processing
float rope_freq_base;
float rope_freq_scale;
@ -3104,6 +3104,9 @@ struct llama_context {
#endif
ggml_backend_t backend_cpu = nullptr;
ggml_threadpool_t threadpool = nullptr;
ggml_threadpool_t threadpool_batch = nullptr;
bool has_evaluated_once = false;
int64_t t_start_us;
@ -15572,7 +15575,8 @@ static void llama_output_reorder(struct llama_context * ctx) {
static void llama_graph_compute(
llama_context & lctx,
ggml_cgraph * gf,
int n_threads) {
int n_threads,
ggml_threadpool * threadpool) {
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(lctx.backend_metal)) {
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@ -15581,6 +15585,7 @@ static void llama_graph_compute(
if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
}
#ifdef GGML_USE_BLAS
@ -15701,6 +15706,8 @@ static int llama_decode_internal(
}
int n_threads = (n_tokens < 32) ? cparams.n_threads : cparams.n_threads_batch;
ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
GGML_ASSERT(n_threads > 0);
// non-causal masks do not use the KV cache
@ -15762,7 +15769,7 @@ static int llama_decode_internal(
llama_set_inputs(lctx, ubatch);
llama_graph_compute(lctx, gf, n_threads);
llama_graph_compute(lctx, gf, n_threads, threadpool);
// update the kv ring buffer
{
@ -15939,7 +15946,9 @@ static int llama_encode_internal(
lctx.inp_embd_enc = NULL;
lctx.n_outputs = n_tokens;
const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
GGML_ASSERT(n_threads > 0);
ggml_backend_sched_reset(lctx.sched);
@ -15971,7 +15980,7 @@ static int llama_encode_internal(
llama_set_inputs(lctx, ubatch);
llama_graph_compute(lctx, gf, n_threads);
llama_graph_compute(lctx, gf, n_threads, threadpool);
// extract embeddings
if (embd) {
@ -16253,7 +16262,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
#endif
//const int64_t t_end = ggml_time_us();
@ -16279,7 +16288,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
llama_set_k_shift(lctx);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
need_reserve = true;
}
@ -16898,7 +16907,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// TODO: avoid hardcoded tensor names - use the TN_* constants
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_qkv.weight") != std::string::npos) {
name.find("attn_qkv.weight") != std::string::npos ||
name.find("attn_kv_b.weight")!= std::string::npos) {
++qs.n_attention_wv;
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
@ -17512,6 +17522,19 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
}
}
void llama_attach_threadpool(
struct llama_context * ctx,
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch) {
ctx->threadpool = threadpool;
ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
}
void llama_detach_threadpool(struct llama_context * ctx) {
ctx->threadpool = nullptr;
ctx->threadpool_batch = nullptr;
}
void llama_backend_free(void) {
ggml_quantize_free();
}
@ -19428,7 +19451,6 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
}
}
void printcache(struct llama_context * ctx)
{
struct llama_kv_cache & cache = ctx->kv_self;
@ -19439,16 +19461,16 @@ void printcache(struct llama_context * ctx)
printf("%s",vals.c_str());
}
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
ctx->cparams.n_threads = n_threads;
ctx->cparams.n_threads_batch = n_threads_batch;
}
uint32_t llama_n_threads(struct llama_context * ctx) {
int32_t llama_n_threads(struct llama_context * ctx) {
return ctx->cparams.n_threads;
}
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
int32_t llama_n_threads_batch(struct llama_context * ctx) {
return ctx->cparams.n_threads_batch;
}