diff --git a/.gitignore b/.gitignore index 4f17b7861..4d4f4336e 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ gcovr-report/ build*/ out/ tmp/ +autogen-*.md models/* models-mnt diff --git a/common/common.cpp b/common/common.cpp index 29b9b283b..166c7a1de 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -78,41 +79,6 @@ using json = nlohmann::ordered_json; -// -// Environment variable utils -// - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? std::string(value) : target; -} - -template -static typename std::enable_if::value && std::is_integral::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? std::stoi(value) : target; -} - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - target = value ? std::stof(value) : target; -} - -template -static typename std::enable_if::value, void>::type -get_env(std::string name, T & target) { - char * value = std::getenv(name.c_str()); - if (value) { - std::string val(value); - target = val == "1" || val == "true"; - } -} - // // CPU utils // @@ -307,7 +273,33 @@ bool set_process_priority(enum ggml_sched_priority prio) { // CLI argument parsing // -void gpt_params_handle_model_default(gpt_params & params) { +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +static std::string format(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +static void gpt_params_handle_model_default(gpt_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model if (params.hf_file.empty()) { @@ -353,7 +345,47 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) } } -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options) { + std::string arg; + const std::string arg_prefix = "--"; + gpt_sampler_params & sparams = params.sparams; + + std::unordered_map arg_to_options; + for (auto & opt : options) { + for (const auto & arg : opt.args) { + arg_to_options[arg] = &opt; + } + } + + // handle environment variables + for (auto & opt : options) { + std::string value; + if (opt.get_value_from_env(value)) { + try { + if (opt.handler_void && (value == "1" || value == "true")) { + opt.handler_void(params); + } + if (opt.handler_int) { + opt.handler_int(params, std::stoi(value)); + } + if (opt.handler_string) { + opt.handler_string(params, value); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); + } + } + } + + // handle command line arguments + auto check_arg = [&](int i) { + if (i+1 >= argc) { + throw std::invalid_argument("expected value for argument"); + } + }; + for (int i = 1; i < argc; i++) { const std::string arg_prefix = "--"; @@ -361,13 +393,43 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { std::replace(arg.begin(), arg.end(), '_', '-'); } - - bool invalid_param = false; - if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { - throw std::invalid_argument("error: unknown argument: " + arg); + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); } - if (invalid_param) { - throw std::invalid_argument("error: invalid parameter for argument: " + arg); + auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + } + try { + if (opt.handler_void) { + opt.handler_void(params); + continue; + } + + // arg with single value + check_arg(i); + std::string val = argv[++i]; + if (opt.handler_int) { + opt.handler_int(params, std::stoi(val)); + continue; + } + if (opt.handler_string) { + opt.handler_string(params, val); + continue; + } + + // arg with 2 values + check_arg(i); + std::string val2 = argv[++i]; + if (opt.handler_str_str) { + opt.handler_str_str(params, val, val2); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); } } @@ -382,12 +444,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { gpt_params_handle_model_default(params); - if (params.hf_token.empty()) { - get_env("HF_TOKEN", params.hf_token); - } - - auto & sparams = params.sparams; - if (params.escape) { string_process_escapes(params.prompt); string_process_escapes(params.input_prefix); @@ -409,41 +465,21 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { return true; } -void gpt_params_parse_from_env(gpt_params & params) { - // we only care about server-related params for now - get_env("LLAMA_ARG_MODEL", params.model); - get_env("LLAMA_ARG_MODEL_URL", params.model_url); - get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias); - get_env("LLAMA_ARG_HF_REPO", params.hf_repo); - get_env("LLAMA_ARG_HF_FILE", params.hf_file); - get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads); - get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx); - get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel); - get_env("LLAMA_ARG_BATCH", params.n_batch); - get_env("LLAMA_ARG_UBATCH", params.n_ubatch); - get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers); - get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http); - get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template); - get_env("LLAMA_ARG_N_PREDICT", params.n_predict); - get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics); - get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots); - get_env("LLAMA_ARG_EMBEDDINGS", params.embedding); - get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn); - get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold); - get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching); - get_env("LLAMA_ARG_HOST", params.hostname); - get_env("LLAMA_ARG_PORT", params.port); -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector & options) { const auto params_org = params; // the example can modify the default params try { - if (!gpt_params_parse_ex(argc, argv, params) || params.usage) { + if (!gpt_params_parse_ex(argc, argv, params, options)) { params = params_org; - params.usage = true; return false; } + if (params.usage) { + gpt_params_print_usage(params, options); + if (params.print_usage) { + params.print_usage(argc, argv); + } + exit(0); + } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); params = params_org; @@ -526,1558 +562,1741 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; } - -bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { - const char split_delim = ','; - - auto & sparams = params.sparams; - - if (arg == "-s" || arg == "--seed") { - CHECK_ARG - sparams.seed = std::stoul(argv[i]); - return true; - } - if (arg == "-t" || arg == "--threads") { - CHECK_ARG - params.cpuparams.n_threads = std::stoi(argv[i]); - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); - } - return true; - } - if (arg == "-C" || arg == "--cpu-mask") { - CHECK_ARG - std::string mask = argv[i]; - params.cpuparams.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); - return true; - } - if (arg == "-Cr" || arg == "--cpu-range") { - CHECK_ARG - std::string range = argv[i]; - params.cpuparams.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask); - return true; - } - if (arg == "--prio") { - CHECK_ARG - params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict") { - CHECK_ARG - params.cpuparams.strict_cpu = std::stoul(argv[i]); - return true; - } - if (arg == "--poll") { - CHECK_ARG - params.cpuparams.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-tb" || arg == "--threads-batch") { - CHECK_ARG - params.cpuparams_batch.n_threads = std::stoi(argv[i]); - if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - return true; - } - if (arg == "-Cb" || arg == "--cpu-mask-batch") { - CHECK_ARG - std::string mask = argv[i]; - params.cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask); - return true; - } - if (arg == "-Crb" || arg == "--cpu-range_batch") { - CHECK_ARG - std::string range = argv[i]; - params.cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask); - return true; - } - if (arg == "--prio-batch") { - CHECK_ARG - params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-batch") { - params.cpuparams_batch.strict_cpu = true; - return true; - } - if (arg == "--poll-batch") { - CHECK_ARG - params.cpuparams_batch.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-td" || arg == "--threads-draft") { - CHECK_ARG - params.draft_cpuparams.n_threads = std::stoi(argv[i]); - if (params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); - } - return true; - } - if (arg == "-Cd" || arg == "--cpu-mask-draft") { - CHECK_ARG - std::string mask = argv[i]; - params.draft_cpuparams.mask_valid = true; - invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask); - return true; - } - if (arg == "-Crd" || arg == "--cpu-range-draft") { - CHECK_ARG - std::string range = argv[i]; - params.draft_cpuparams.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask); - return true; - } - if (arg == "--prio-draft") { - CHECK_ARG - params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-draft") { - params.draft_cpuparams.strict_cpu = true; - return true; - } - if (arg == "--poll-draft") { - CHECK_ARG - params.draft_cpuparams.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-tbd" || arg == "--threads-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]); - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - return true; - } - if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") { - CHECK_ARG - std::string range = argv[i]; - params.draft_cpuparams_batch.mask_valid = true; - invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask); - return true; - } - if (arg == "--prio-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); - return true; - } - if (arg == "--cpu-strict-batch-draft") { - params.draft_cpuparams_batch.strict_cpu = true; - return true; - } - if (arg == "--poll-batch-draft") { - CHECK_ARG - params.draft_cpuparams_batch.poll = std::stoul(argv[i]); - return true; - } - if (arg == "-p" || arg == "--prompt") { - CHECK_ARG - params.prompt = argv[i]; - return true; - } - if (arg == "-e" || arg == "--escape") { - params.escape = true; - return true; - } - if (arg == "--no-escape") { - params.escape = false; - return true; - } - if (arg == "--prompt-cache") { - CHECK_ARG - params.path_prompt_cache = argv[i]; - return true; - } - if (arg == "--prompt-cache-all") { - params.prompt_cache_all = true; - return true; - } - if (arg == "--prompt-cache-ro") { - params.prompt_cache_ro = true; - return true; - } - if (arg == "-bf" || arg == "--binary-file") { - CHECK_ARG - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - return true; - } - if (arg == "-f" || arg == "--file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - return true; - } - if (arg == "--in-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - params.in_files.push_back(argv[i]); - return true; - } - if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { - CHECK_ARG - params.n_predict = std::stoi(argv[i]); - return true; - } - if (arg == "--top-k") { - CHECK_ARG - sparams.top_k = std::stoi(argv[i]); - return true; - } - if (arg == "-c" || arg == "--ctx-size") { - CHECK_ARG - params.n_ctx = std::stoi(argv[i]); - return true; - } - if (arg == "--grp-attn-n" || arg == "-gan") { - CHECK_ARG - params.grp_attn_n = std::stoi(argv[i]); - return true; - } - if (arg == "--grp-attn-w" || arg == "-gaw") { - CHECK_ARG - params.grp_attn_w = std::stoi(argv[i]); - return true; - } - if (arg == "--rope-freq-base") { - CHECK_ARG - params.rope_freq_base = std::stof(argv[i]); - return true; - } - if (arg == "--rope-freq-scale") { - CHECK_ARG - params.rope_freq_scale = std::stof(argv[i]); - return true; - } - if (arg == "--rope-scaling") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { invalid_param = true; } - return true; - } - if (arg == "--rope-scale") { - CHECK_ARG - params.rope_freq_scale = 1.0f / std::stof(argv[i]); - return true; - } - if (arg == "--yarn-orig-ctx") { - CHECK_ARG - params.yarn_orig_ctx = std::stoi(argv[i]); - return true; - } - if (arg == "--yarn-ext-factor") { - CHECK_ARG - params.yarn_ext_factor = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-attn-factor") { - CHECK_ARG - params.yarn_attn_factor = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-beta-fast") { - CHECK_ARG - params.yarn_beta_fast = std::stof(argv[i]); - return true; - } - if (arg == "--yarn-beta-slow") { - CHECK_ARG - params.yarn_beta_slow = std::stof(argv[i]); - return true; - } - if (arg == "--pooling") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else { invalid_param = true; } - return true; - } - if (arg == "--attention") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } - else { invalid_param = true; } - return true; - } - if (arg == "--defrag-thold" || arg == "-dt") { - CHECK_ARG - params.defrag_thold = std::stof(argv[i]); - return true; - } - if (arg == "--samplers") { - CHECK_ARG - const auto sampler_names = string_split(argv[i], ';'); - sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); - return true; - } - if (arg == "--sampling-seq") { - CHECK_ARG - sparams.samplers = gpt_sampler_types_from_chars(argv[i]); - return true; - } - if (arg == "--top-p") { - CHECK_ARG - sparams.top_p = std::stof(argv[i]); - return true; - } - if (arg == "--min-p") { - CHECK_ARG - sparams.min_p = std::stof(argv[i]); - return true; - } - if (arg == "--temp") { - CHECK_ARG - sparams.temp = std::stof(argv[i]); - sparams.temp = std::max(sparams.temp, 0.0f); - return true; - } - if (arg == "--tfs") { - CHECK_ARG - sparams.tfs_z = std::stof(argv[i]); - return true; - } - if (arg == "--typical") { - CHECK_ARG - sparams.typ_p = std::stof(argv[i]); - return true; - } - if (arg == "--repeat-last-n") { - CHECK_ARG - sparams.penalty_last_n = std::stoi(argv[i]); - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - return true; - } - if (arg == "--repeat-penalty") { - CHECK_ARG - sparams.penalty_repeat = std::stof(argv[i]); - return true; - } - if (arg == "--frequency-penalty") { - CHECK_ARG - sparams.penalty_freq = std::stof(argv[i]); - return true; - } - if (arg == "--presence-penalty") { - CHECK_ARG - sparams.penalty_present = std::stof(argv[i]); - return true; - } - if (arg == "--dynatemp-range") { - CHECK_ARG - sparams.dynatemp_range = std::stof(argv[i]); - return true; - } - if (arg == "--dynatemp-exp") { - CHECK_ARG - sparams.dynatemp_exponent = std::stof(argv[i]); - return true; - } - if (arg == "--mirostat") { - CHECK_ARG - sparams.mirostat = std::stoi(argv[i]); - return true; - } - if (arg == "--mirostat-lr") { - CHECK_ARG - sparams.mirostat_eta = std::stof(argv[i]); - return true; - } - if (arg == "--mirostat-ent") { - CHECK_ARG - sparams.mirostat_tau = std::stof(argv[i]); - return true; - } - if (arg == "-b" || arg == "--batch-size") { - CHECK_ARG - params.n_batch = std::stoi(argv[i]); - return true; - } - if (arg == "-ub" || arg == "--ubatch-size") { - CHECK_ARG - params.n_ubatch = std::stoi(argv[i]); - return true; - } - if (arg == "--keep") { - CHECK_ARG - params.n_keep = std::stoi(argv[i]); - return true; - } - if (arg == "--draft") { - CHECK_ARG - params.n_draft = std::stoi(argv[i]); - return true; - } - if (arg == "--chunks") { - CHECK_ARG - params.n_chunks = std::stoi(argv[i]); - return true; - } - if (arg == "-np" || arg == "--parallel") { - CHECK_ARG - params.n_parallel = std::stoi(argv[i]); - return true; - } - if (arg == "-ns" || arg == "--sequences") { - CHECK_ARG - params.n_sequences = std::stoi(argv[i]); - return true; - } - if (arg == "--p-split" || arg == "-ps") { - CHECK_ARG - params.p_split = std::stof(argv[i]); - return true; - } - if (arg == "-m" || arg == "--model") { - CHECK_ARG - params.model = argv[i]; - return true; - } - if (arg == "-md" || arg == "--model-draft") { - CHECK_ARG - params.model_draft = argv[i]; - return true; - } - if (arg == "-a" || arg == "--alias") { - CHECK_ARG - params.model_alias = argv[i]; - return true; - } - if (arg == "-mu" || arg == "--model-url") { - CHECK_ARG - params.model_url = argv[i]; - return true; - } - if (arg == "-hft" || arg == "--hf-token") { - if (++i >= argc) { - invalid_param = true; - return true; - } - params.hf_token = argv[i]; - return true; - } - if (arg == "-hfr" || arg == "--hf-repo") { - CHECK_ARG - params.hf_repo = argv[i]; - return true; - } - if (arg == "-hff" || arg == "--hf-file") { - CHECK_ARG - params.hf_file = argv[i]; - return true; - } - if (arg == "--lora") { - CHECK_ARG - params.lora_adapters.push_back({ - std::string(argv[i]), - 1.0, - }); - return true; - } - if (arg == "--lora-scaled") { - CHECK_ARG - std::string lora_adapter = argv[i]; - CHECK_ARG - params.lora_adapters.push_back({ - lora_adapter, - std::stof(argv[i]), - }); - return true; - } - if (arg == "--lora-init-without-apply") { - params.lora_init_without_apply = true; - return true; - } - if (arg == "--control-vector") { - CHECK_ARG - params.control_vectors.push_back({ 1.0f, argv[i], }); - return true; - } - if (arg == "--control-vector-scaled") { - CHECK_ARG - const char* fname = argv[i]; - CHECK_ARG - params.control_vectors.push_back({ std::stof(argv[i]), fname, }); - return true; - } - if (arg == "--control-vector-layer-range") { - CHECK_ARG - params.control_vector_layer_start = std::stoi(argv[i]); - CHECK_ARG - params.control_vector_layer_end = std::stoi(argv[i]); - return true; - } - if (arg == "--mmproj") { - CHECK_ARG - params.mmproj = argv[i]; - return true; - } - if (arg == "--image") { - CHECK_ARG - params.image.emplace_back(argv[i]); - return true; - } - if (arg == "-i" || arg == "--interactive") { - params.interactive = true; - return true; - } - if (arg == "-sp" || arg == "--special") { - params.special = true; - return true; - } - if (arg == "--embedding" || arg == "--embeddings") { - params.embedding = true; - return true; - } - if (arg == "--embd-normalize") { - CHECK_ARG - params.embd_normalize = std::stoi(argv[i]); - return true; - } - if (arg == "--embd-output-format") { - CHECK_ARG - params.embd_out = argv[i]; - return true; - } - if (arg == "--embd-separator") { - CHECK_ARG - params.embd_sep = argv[i]; - return true; - } - if (arg == "-if" || arg == "--interactive-first") { - params.interactive_first = true; - return true; - } - if (arg == "-cnv" || arg == "--conversation") { - params.conversation = true; - return true; - } - if (arg == "--infill") { - params.infill = true; - return true; - } - if (arg == "-dkvc" || arg == "--dump-kv-cache") { - params.dump_kv_cache = true; - return true; - } - if (arg == "-nkvo" || arg == "--no-kv-offload") { - params.no_kv_offload = true; - return true; - } - if (arg == "-ctk" || arg == "--cache-type-k") { - params.cache_type_k = argv[++i]; - return true; - } - if (arg == "-ctv" || arg == "--cache-type-v") { - params.cache_type_v = argv[++i]; - return true; - } - if (arg == "-mli" || arg == "--multiline-input") { - params.multiline_input = true; - return true; - } - if (arg == "--simple-io") { - params.simple_io = true; - return true; - } - if (arg == "-cb" || arg == "--cont-batching") { - params.cont_batching = true; - return true; - } - if (arg == "-nocb" || arg == "--no-cont-batching") { - params.cont_batching = false; - return true; - } - if (arg == "-fa" || arg == "--flash-attn") { - params.flash_attn = true; - return true; - } - if (arg == "-co" || arg == "--color") { - params.use_color = true; - return true; - } - if (arg == "--mlock") { - params.use_mlock = true; - return true; - } - if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { - CHECK_ARG - params.n_gpu_layers = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - return true; - } - if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") { - CHECK_ARG - params.n_gpu_layers_draft = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - return true; - } - if (arg == "--main-gpu" || arg == "-mg") { - CHECK_ARG - params.main_gpu = std::stoi(argv[i]); -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--split-mode" || arg == "-sm") { - CHECK_ARG - std::string arg_next = argv[i]; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } - else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } - else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } - else { - invalid_param = true; - return true; - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } - if (arg == "--tensor-split" || arg == "-ts") { - CHECK_ARG - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { - invalid_param = true; - return true; - } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } - else { - params.tensor_split[i] = 0.0f; - } - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - return true; - } -#ifdef GGML_USE_RPC - if (arg == "--rpc") { - CHECK_ARG - params.rpc_servers = argv[i]; - return true; - } -#endif - if (arg == "--no-mmap") { - params.use_mmap = false; - return true; - } - if (arg == "--numa") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; } - return true; - } - if (arg == "-v" || arg == "--verbose") { - params.verbosity = 1; - return true; - } - if (arg == "--verbosity") { - CHECK_ARG - params.verbosity = std::stoi(argv[i]); - return true; - } - if (arg == "--verbose-prompt") { - params.verbose_prompt = true; - return true; - } - if (arg == "--no-display-prompt") { - params.display_prompt = false; - return true; - } - if (arg == "-r" || arg == "--reverse-prompt") { - CHECK_ARG - params.antiprompt.emplace_back(argv[i]); - return true; - } - if (arg == "-ld" || arg == "--logdir") { - CHECK_ARG - params.logdir = argv[i]; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } - return true; - } - if (arg == "-lcs" || arg == "--lookup-cache-static") { - CHECK_ARG - params.lookup_cache_static = argv[i]; - return true; - } - if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { - CHECK_ARG - params.lookup_cache_dynamic = argv[i]; - return true; - } - if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - CHECK_ARG - params.logits_file = argv[i]; - return true; - } - if (arg == "--perplexity" || arg == "--all-logits") { - params.logits_all = true; - return true; - } - if (arg == "--ppl-stride") { - CHECK_ARG - params.ppl_stride = std::stoi(argv[i]); - return true; - } - if (arg == "--ppl-output-type") { - CHECK_ARG - params.ppl_output_type = std::stoi(argv[i]); - return true; - } - if (arg == "-ptc" || arg == "--print-token-count") { - CHECK_ARG - params.n_print = std::stoi(argv[i]); - return true; - } - if (arg == "--check-tensors") { - params.check_tensors = true; - return true; - } - if (arg == "--hellaswag") { - params.hellaswag = true; - return true; - } - if (arg == "--hellaswag-tasks") { - CHECK_ARG - params.hellaswag_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--winogrande") { - params.winogrande = true; - return true; - } - if (arg == "--winogrande-tasks") { - CHECK_ARG - params.winogrande_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--multiple-choice") { - params.multiple_choice = true; - return true; - } - if (arg == "--multiple-choice-tasks") { - CHECK_ARG - params.multiple_choice_tasks = std::stoi(argv[i]); - return true; - } - if (arg == "--kl-divergence") { - params.kl_divergence = true; - return true; - } - if (arg == "--ignore-eos") { - sparams.ignore_eos = true; - return true; - } - if (arg == "--penalize-nl") { - sparams.penalize_nl = true; - return true; - } - if (arg == "-l" || arg == "--logit-bias") { - CHECK_ARG - std::stringstream ss(argv[i]); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - sparams.logit_bias.push_back({key, bias}); - } - else { - throw std::exception(); - } - } - catch (const std::exception&) { - invalid_param = true; - return true; - } - return true; - } - if (arg == "-h" || arg == "--help" || arg == "--usage" ) { - params.usage = true; - return true; - } - if (arg == "--version") { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - if (arg == "--in-prefix-bos") { - params.input_prefix_bos = true; - params.enable_chat_template = false; - return true; - } - if (arg == "--in-prefix") { - CHECK_ARG - params.input_prefix = argv[i]; - params.enable_chat_template = false; - return true; - } - if (arg == "--in-suffix") { - CHECK_ARG - params.input_suffix = argv[i]; - params.enable_chat_template = false; - return true; - } - if (arg == "--spm-infill") { - params.spm_infill = true; - return true; - } - if (arg == "--grammar") { - CHECK_ARG - sparams.grammar = argv[i]; - return true; - } - if (arg == "--grammar-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(sparams.grammar) - ); - return true; - } - // if (arg == "-j" || arg == "--json-schema") { - // CHECK_ARG - // sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); - // return true; - // } - if (arg == "--override-kv") { - CHECK_ARG - if (!string_parse_kv_override(argv[i], params.kv_overrides)) { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - return true; - } - return true; - } - if (arg == "--host") { - CHECK_ARG - params.hostname = argv[i]; - return true; - } - if (arg == "--port") { - CHECK_ARG - params.port = std::stoi(argv[i]); - return true; - } - if (arg == "--path") { - CHECK_ARG - params.public_path = argv[i]; - return true; - } - if (arg == "--api-key") { - CHECK_ARG - params.api_keys.push_back(argv[i]); - return true; - } - if (arg == "--api-key-file") { - CHECK_ARG - std::ifstream key_file(argv[i]); - if (!key_file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - std::string key; - while (std::getline(key_file, key)) { - if (!key.empty()) { - params.api_keys.push_back(key); - } - } - key_file.close(); - return true; - } - if (arg == "--ssl-key-file") { - CHECK_ARG - params.ssl_file_key = argv[i]; - return true; - } - if (arg == "--ssl-cert-file") { - CHECK_ARG - params.ssl_file_cert = argv[i]; - return true; - } - if (arg == "--timeout" || arg == "-to") { - CHECK_ARG - params.timeout_read = std::stoi(argv[i]); - params.timeout_write = std::stoi(argv[i]); - return true; - } - if (arg == "--threads-http") { - CHECK_ARG - params.n_threads_http = std::stoi(argv[i]); - return true; - } - if (arg == "-spf" || arg == "--system-prompt-file") { - CHECK_ARG - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; - return true; - } - if (arg == "--log-format") { - CHECK_ARG - if (std::strcmp(argv[i], "json") == 0) { - params.log_json = true; - } else if (std::strcmp(argv[i], "text") == 0) { - params.log_json = false; +static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string line; + auto add_line = [&](const std::string& l) { + if (l.length() <= max_char_per_line) { + result.push_back(l); } else { - invalid_param = true; - return true; + std::istringstream line_stream(l); + std::string word, current_line; + while (line_stream >> word) { + if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { + if (!current_line.empty()) result.push_back(current_line); + current_line = word; + } else { + current_line += (!current_line.empty() ? " " : "") + word; + } + } + if (!current_line.empty()) result.push_back(current_line); } - return true; + }; + while (std::getline(iss, line)) { + add_line(line); } - if (arg == "--no-slots") { - params.endpoint_slots = false; - return true; - } - if (arg == "--metrics") { - params.endpoint_metrics = true; - return true; - } - if (arg == "--slot-save-path") { - CHECK_ARG - params.slot_save_path = argv[i]; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; - } - return true; - } - if (arg == "--chat-template") { - CHECK_ARG - if (!llama_chat_verify_template(argv[i])) { - fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); - fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); - invalid_param = true; - return true; - } - params.chat_template = argv[i]; - return true; - } - if (arg == "--slot-prompt-similarity" || arg == "-sps") { - CHECK_ARG - params.slot_prompt_similarity = std::stof(argv[i]); - return true; - } - if (arg == "-pps") { - params.is_pp_shared = true; - return true; - } - if (arg == "-npp") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); - return true; - } - if (arg == "-ntg") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); - return true; - } - if (arg == "-npl") { - CHECK_ARG - auto p = string_split(argv[i], split_delim); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); - return true; - } - if (arg == "--context-file") { - CHECK_ARG - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - return true; - } - params.context_files.push_back(argv[i]); - return true; - } - if (arg == "--chunk-size") { - CHECK_ARG - params.chunk_size = std::stoi(argv[i]); - return true; - } - if (arg == "--chunk-separator") { - CHECK_ARG - params.chunk_separator = argv[i]; - return true; - } - if (arg == "--junk") { - CHECK_ARG - params.n_junk = std::stoi(argv[i]); - return true; - } - if (arg == "--pos") { - CHECK_ARG - params.i_pos = std::stoi(argv[i]); - return true; - } - if (arg == "-o" || arg == "--output" || arg == "--output-file") { - CHECK_ARG - params.out_file = argv[i]; - params.cvector_outfile = argv[i]; - params.lora_outfile = argv[i]; - return true; - } - if (arg == "-ofreq" || arg == "--output-frequency") { - CHECK_ARG - params.n_out_freq = std::stoi(argv[i]); - return true; - } - if (arg == "--save-frequency") { - CHECK_ARG - params.n_save_freq = std::stoi(argv[i]); - return true; - } - if (arg == "--process-output") { - params.process_output = true; - return true; - } - if (arg == "--no-ppl") { - params.compute_ppl = false; - return true; - } - if (arg == "--chunk" || arg == "--from-chunk") { - CHECK_ARG - params.i_chunk = std::stoi(argv[i]); - return true; - } - // cvector params - if (arg == "--positive-file") { - CHECK_ARG - params.cvector_positive_file = argv[i]; - return true; - } - if (arg == "--negative-file") { - CHECK_ARG - params.cvector_negative_file = argv[i]; - return true; - } - if (arg == "--pca-batch") { - CHECK_ARG - params.n_pca_batch = std::stoi(argv[i]); - return true; - } - if (arg == "--pca-iter") { - CHECK_ARG - params.n_pca_iterations = std::stoi(argv[i]); - return true; - } - if (arg == "--method") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } - else { invalid_param = true; } - return true; - } - if (arg == "--output-format") { - CHECK_ARG - std::string value(argv[i]); - /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } - else if (value == "md") { params.batched_bench_output_jsonl = false; } - else { invalid_param = true; } - return true; - } - if (arg == "--no-warmup") { - params.warmup = false; - return true; - } -#ifndef LOG_DISABLE_LOGS - // Parse args for logging parameters - if (log_param_single_parse(argv[i])) { - // Do nothing, log_param_single_parse automatically does it's thing - // and returns if a match was found and parsed. - return true; - } - if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { - // We have a matching known parameter requiring an argument, - // now we need to check if there is anything after this argv - // and flag invalid_param or parse it. - CHECK_ARG - if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { - invalid_param = true; - return true; - } - return true; - } - // End of Parse args for logging parameters -#endif // LOG_DISABLE_LOGS - - return false; + return result; } -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif +std::string llama_arg::to_string() { + // params for printing to console + const static int n_leading_spaces = 40; + const static int n_char_per_line_help = 70; // TODO: detect this based on current console + std::string leading_spaces(n_leading_spaces, ' '); -void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - const auto & sparams = params.sparams; + std::ostringstream ss; + for (const auto arg : args) { + if (arg == args.front()) { + if (args.size() == 1) { + ss << arg; + } else { + // first arg is usually abbreviation, we need padding to make it more beautiful + auto tmp = std::string(arg) + ", "; + ss << format("%-7s", tmp.c_str()); + } + } else { + ss << arg << (arg != args.back() ? ", " : ""); + } + } + if (value_hint) ss << " " << value_hint; + if (value_hint_2) ss << " " << value_hint_2; + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; + } + return ss.str(); +} + +void gpt_params_print_usage(gpt_params & params, std::vector & options) { + auto print_options = [](std::vector & options) { + for (llama_arg * opt : options) { + printf("%s", opt->to_string().c_str()); + } + }; + + std::vector common_options; + std::vector specific_options; + for (auto & opt : options) { + // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + if (opt.in_example(params.curr_ex)) { + specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); + } + } + printf("----- common options -----\n\n"); + print_options(common_options); + // TODO: maybe convert enum llama_example to string + printf("\n\n----- example-specific options -----\n\n"); + print_options(specific_options); +} + +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex) { + return gpt_params_parser_init(params, ex, nullptr); +} + +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage) { + std::vector options; + params.print_usage = print_usage; + params.curr_ex = ex; std::string sampler_type_chars; std::string sampler_type_names; - for (const auto & sampler : sparams.samplers) { + for (const auto & sampler : params.sparams.samplers) { sampler_type_chars += gpt_sampler_type_to_chr(sampler); sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; } sampler_type_names.pop_back(); - struct option_info { - LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) - option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) { - va_list args_list; - va_start(args_list, desc); - char buffer[1024]; - vsnprintf(buffer, sizeof(buffer), desc, args_list); - va_end(args_list); - this->desc = buffer; + + /** + * filter options by example + * rules: + * - all examples inherit options from LLAMA_EXAMPLE_COMMON + * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + */ + auto add_opt = [&](llama_arg arg) { + if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + options.push_back(std::move(arg)); } - - option_info(const std::string & grp) : grp(grp) {} - - std::string tags; - std::string args; - std::string desc; - std::string grp; }; - std::vector options; - // TODO: filter by tags - - options.push_back({ "general" }); - options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); - options.push_back({ "*", " --version", "show version and build info" }); - options.push_back({ "*", "-v, --verbose", "print verbose information" }); - options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity }); - options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); - options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); - options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); - options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads }); - options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); - options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); - options.push_back({ "speculative", "-tbd, --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); - -#ifndef GGML_USE_OPENMP - // these options are available only with the internal threadpool - options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"}); - options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"}); - options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu}); - options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority}); - options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll}); - - options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"}); - options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"}); - options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"}); - options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"}); - options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"}); - - options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"}); - options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"}); - options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"}); - options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"}); - options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"}); - - options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"}); - options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"}); - options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)"}); - options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"}); - options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"}); -#endif // GGML_USE_OPENMP - - options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); - options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); - options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", - "path to static lookup cache to use for lookup decoding (not updated by generation)" }); - options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME", - "path to dynamic lookup cache to use for lookup decoding (updated by generation)" }); - - options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); - options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); - options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); - options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); - options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); - options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); - options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); - options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" - "in conversation mode, this will be used as system prompt\n" - "(default: '%s')", params.prompt.c_str() }); - options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); - options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); - options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); - options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); - options.push_back({ "*", " --no-escape", "do not process escape sequences" }); - options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print }); - options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); - options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n" - "not supported with --interactive or other interactive options" }); - options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); - options.push_back({ "main", "-r, --reverse-prompt PROMPT", - "halt generation at PROMPT, return control in interactive mode\n" - "can be specified more than once for multiple prompts" }); - options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); - options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n" - "if suffix/prefix are not specified, default chat template will be used\n" - "(default: %s)", params.conversation ? "true" : "false" }); - options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); - options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); - options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); - options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); - options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); - options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); - options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" }); - options.push_back({ "server infill", - " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); - - options.push_back({ "sampling" }); - options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", sparams.seed }); - options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" - "(default: %s)", sampler_type_names.c_str() }); - options.push_back({ "*", " --sampling-seq SEQUENCE", - "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() }); - options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" }); - options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" }); - options.push_back({ "*", " --temp T", "temperature (default: %.1f)", (double)sparams.temp }); - options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k }); - options.push_back({ "*", " --top-p P", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p }); - options.push_back({ "*", " --min-p P", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p }); - options.push_back({ "*", " --tfs P", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z }); - options.push_back({ "*", " --typical P", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typ_p }); - options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n }); - options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat }); - options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present }); - options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq }); - options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range }); - options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent }); - options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n" - "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat }); - options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta }); - options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau }); - options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" - "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" - "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); - options.push_back({ "main", " --chat-template JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\n" - "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); - options.push_back({ "grammar" }); - options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); - options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); - options.push_back({ "*", "-j, --json-schema SCHEMA", - "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n" - "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); - - options.push_back({ "embedding" }); - options.push_back({ "embedding", " --pooling {none,mean,cls,last}", - "pooling type for embeddings, use model default if unspecified" }); - options.push_back({ "embedding", " --attention {causal,non-causal}", - "attention type for embeddings, use model default if unspecified" }); - - options.push_back({ "context hacking" }); - options.push_back({ "*", " --rope-scaling {none,linear,yarn}", - "RoPE frequency scaling method, defaults to linear unless specified by the model" }); - options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" }); - options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" }); - options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" }); - options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx }); - options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor }); - options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor }); - options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow }); - options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast }); - options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n }); - options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w }); - options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" }); - options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" }); - options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() }); - options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() }); - - options.push_back({ "perplexity" }); - options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" }); - options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks }); - options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks }); - options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" }); - options.push_back({ "perplexity", " --multiple-choice-tasks N", - "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks }); - options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" }); - options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride }); - options.push_back({ "perplexity", " --ppl-output-type {0,1}", - "output type for perplexity calculation (default: %d)", params.ppl_output_type }); - - options.push_back({ "parallel" }); - options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold }); - options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); - options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); - options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); - options.push_back({ "*", "-nocb, --no-cont-batching", "disable continuous batching" }); - - options.push_back({ "multi-modality" }); - options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); - options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" }); - - options.push_back({ "backend" }); + add_opt(llama_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [](gpt_params & params) { + params.usage = true; + } + )); + add_opt(llama_arg( + {"--version"}, + "show version and build info", + [](gpt_params &) { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + )); + add_opt(llama_arg( + {"-v", "--verbose"}, + "print verbose information", + [](gpt_params & params) { + params.verbosity = 1; + } + )); + add_opt(llama_arg( + {"--verbosity"}, "N", + format("set specific verbosity level (default: %d)", params.verbosity), + [](gpt_params & params, int value) { + params.verbosity = value; + } + )); + add_opt(llama_arg( + {"--verbose-prompt"}, + format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [](gpt_params & params) { + params.verbose_prompt = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--no-display-prompt"}, + format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + [](gpt_params & params) { + params.display_prompt = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-co", "--color"}, + format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [](gpt_params & params) { + params.use_color = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-s", "--seed"}, "SEED", + format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed), + [](gpt_params & params, const std::string & value) { + params.sparams.seed = std::stoul(value); + } + )); + add_opt(llama_arg( + {"-t", "--threads"}, "N", + format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [](gpt_params & params, int value) { + params.cpuparams.n_threads = value; + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_env("LLAMA_ARG_THREADS")); + add_opt(llama_arg( + {"-tb", "--threads-batch"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads)", + [](gpt_params & params, int value) { + params.cpuparams_batch.n_threads = value; + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + )); + add_opt(llama_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [](gpt_params & params, int value) { + params.draft_cpuparams.n_threads = value; + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.n_threads = value; + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-C", "--cpu-mask"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", + [](gpt_params & params, const std::string & mask) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Cr", "--cpu-range"}, "lo-hi", + "range of CPUs for affinity. Complements --cpu-mask", + [](gpt_params & params, const std::string & range) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict"}, "<0|1>", + format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [](gpt_params & params, const std::string & value) { + params.cpuparams.strict_cpu = std::stoul(value); + } + )); + add_opt(llama_arg( + {"--prio"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll"}, "<0...100>", + format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [](gpt_params & params, const std::string & value) { + params.cpuparams.poll = std::stoul(value); + } + )); + add_opt(llama_arg( + {"-Cb", "--cpu-mask-batch"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Crb", "--cpu-range-batch"}, "lo-hi", + "ranges of CPUs for affinity. Complements --cpu-mask-batch", + [](gpt_params & params, const std::string & range) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict-batch"}, "<0|1>", + "use strict CPU placement (default: same as --cpu-strict)", + [](gpt_params & params, int value) { + params.cpuparams_batch.strict_cpu = value; + } + )); + add_opt(llama_arg( + {"--prio-batch"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll-batch"}, "<0|1>", + "use polling to wait for work (default: same as --poll)", + [](gpt_params & params, int value) { + params.cpuparams_batch.poll = value; + } + )); + add_opt(llama_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft", + [](gpt_params & params, const std::string & range) { + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [](gpt_params & params, int value) { + params.draft_cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--prio-draft"}, "N", + format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [](gpt_params & params, int value) { + params.draft_cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Cbd", "--cpu-mask-batch-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", + [](gpt_params & params, const std::string & range) { + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--prio-batch-draft"}, "N", + format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--draft"}, "N", + format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + [](gpt_params & params, int value) { + params.n_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-ps", "--p-split"}, "N", + format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + [](gpt_params & params, const std::string & value) { + params.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-lcs", "--lookup-cache-static"}, "FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)", + [](gpt_params & params, const std::string & value) { + params.lookup_cache_static = value; + } + )); + add_opt(llama_arg( + {"-lcd", "--lookup-cache-dynamic"}, "FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)", + [](gpt_params & params, const std::string & value) { + params.lookup_cache_dynamic = value; + } + )); + add_opt(llama_arg( + {"-c", "--ctx-size"}, "N", + format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + [](gpt_params & params, int value) { + params.n_ctx = value; + } + ).set_env("LLAMA_ARG_CTX_SIZE")); + add_opt(llama_arg( + {"-n", "--predict", "--n-predict"}, "N", + format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + [](gpt_params & params, int value) { + params.n_predict = value; + } + ).set_env("LLAMA_ARG_N_PREDICT")); + add_opt(llama_arg( + {"-b", "--batch-size"}, "N", + format("logical maximum batch size (default: %d)", params.n_batch), + [](gpt_params & params, int value) { + params.n_batch = value; + } + ).set_env("LLAMA_ARG_BATCH")); + add_opt(llama_arg( + {"-ub", "--ubatch-size"}, "N", + format("physical maximum batch size (default: %d)", params.n_ubatch), + [](gpt_params & params, int value) { + params.n_ubatch = value; + } + ).set_env("LLAMA_ARG_UBATCH")); + add_opt(llama_arg( + {"--keep"}, "N", + format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), + [](gpt_params & params, int value) { + params.n_keep = value; + } + )); + add_opt(llama_arg( + {"--chunks"}, "N", + format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [](gpt_params & params, int value) { + params.n_chunks = value; + } + )); + add_opt(llama_arg( + {"-fa", "--flash-attn"}, + format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + [](gpt_params & params) { + params.flash_attn = true; + } + ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(llama_arg( + {"-p", "--prompt"}, "PROMPT", + ex == LLAMA_EXAMPLE_MAIN + ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" + : "prompt to start generation with", + [](gpt_params & params, const std::string & value) { + params.prompt = value; + } + )); + add_opt(llama_arg( + {"-f", "--file"}, "FNAME", + "a file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } + )); + add_opt(llama_arg( + {"--in-file"}, "FNAME", + "an input file (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.in_files.push_back(value); + } + )); + add_opt(llama_arg( + {"-bf", "--binary-file"}, "FNAME", + "binary file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + } + )); + add_opt(llama_arg( + {"-e", "--escape"}, + format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + [](gpt_params & params) { + params.escape = true; + } + )); + add_opt(llama_arg( + {"--no-escape"}, + "do not process escape sequences", + [](gpt_params & params) { + params.escape = false; + } + )); + add_opt(llama_arg( + {"-ptc", "--print-token-count"}, "N", + format("print token count every N tokens (default: %d)", params.n_print), + [](gpt_params & params, int value) { + params.n_print = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache"}, "FNAME", + "file to cache prompt state for faster startup (default: none)", + [](gpt_params & params, const std::string & value) { + params.path_prompt_cache = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-all"}, + "if specified, saves user input and generations to cache as well\n", + [](gpt_params & params) { + params.prompt_cache_all = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-ro"}, + "if specified, uses the prompt cache but does not update it", + [](gpt_params & params) { + params.prompt_cache_ro = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-r", "--reverse-prompt"}, "PROMPT", + "halt generation at PROMPT, return control in interactive mode\n", + [](gpt_params & params, const std::string & value) { + params.antiprompt.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-sp", "--special"}, + format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + [](gpt_params & params) { + params.special = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-cnv", "--conversation"}, + format( + "run in conversation mode:\n" + "- does not print special tokens and suffix/prefix\n" + "- interactive mode is also enabled\n" + "(default: %s)", + params.conversation ? "true" : "false" + ), + [](gpt_params & params) { + params.conversation = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-i", "--interactive"}, + format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [](gpt_params & params) { + params.interactive = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-if", "--interactive-first"}, + format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), + [](gpt_params & params) { + params.interactive_first = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-mli", "--multiline-input"}, + "allows you to write or paste multiple lines without ending each in '\\'", + [](gpt_params & params) { + params.multiline_input = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix-bos"}, + "prefix BOS to user inputs, preceding the `--in-prefix` string", + [](gpt_params & params) { + params.input_prefix_bos = true; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix"}, "STRING", + "string to prefix user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_prefix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-suffix"}, "STRING", + "string to suffix after user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_suffix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--no-warmup"}, + "skip warming up the model with an empty run", + [](gpt_params & params) { + params.warmup = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--spm-infill"}, + format( + "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", + params.spm_infill ? "enabled" : "disabled" + ), + [](gpt_params & params) { + params.spm_infill = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--samplers"}, "SAMPLERS", + format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [](gpt_params & params, const std::string & value) { + const auto sampler_names = string_split(value, ';'); + params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); + } + )); + add_opt(llama_arg( + {"--sampling-seq"}, "SEQUENCE", + format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.samplers = gpt_sampler_types_from_chars(value); + } + )); + add_opt(llama_arg( + {"--ignore-eos"}, + "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", + [](gpt_params & params) { + params.sparams.ignore_eos = true; + } + )); + add_opt(llama_arg( + {"--penalize-nl"}, + format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + [](gpt_params & params) { + params.sparams.penalize_nl = true; + } + )); + add_opt(llama_arg( + {"--temp"}, "N", + format("temperature (default: %.1f)", (double)params.sparams.temp), + [](gpt_params & params, const std::string & value) { + params.sparams.temp = std::stof(value); + params.sparams.temp = std::max(params.sparams.temp, 0.0f); + } + )); + add_opt(llama_arg( + {"--top-k"}, "N", + format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + [](gpt_params & params, int value) { + params.sparams.top_k = value; + } + )); + add_opt(llama_arg( + {"--top-p"}, "N", + format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + [](gpt_params & params, const std::string & value) { + params.sparams.top_p = std::stof(value); + } + )); + add_opt(llama_arg( + {"--min-p"}, "N", + format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + [](gpt_params & params, const std::string & value) { + params.sparams.min_p = std::stof(value); + } + )); + add_opt(llama_arg( + {"--tfs"}, "N", + format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + [](gpt_params & params, const std::string & value) { + params.sparams.tfs_z = std::stof(value); + } + )); + add_opt(llama_arg( + {"--typical"}, "N", + format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + [](gpt_params & params, const std::string & value) { + params.sparams.typ_p = std::stof(value); + } + )); + add_opt(llama_arg( + {"--repeat-last-n"}, "N", + format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + [](gpt_params & params, int value) { + params.sparams.penalty_last_n = value; + params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); + } + )); + add_opt(llama_arg( + {"--repeat-penalty"}, "N", + format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_repeat = std::stof(value); + } + )); + add_opt(llama_arg( + {"--presence-penalty"}, "N", + format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_present = std::stof(value); + } + )); + add_opt(llama_arg( + {"--frequency-penalty"}, "N", + format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_freq = std::stof(value); + } + )); + add_opt(llama_arg( + {"--dynatemp-range"}, "N", + format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_range = std::stof(value); + } + )); + add_opt(llama_arg( + {"--dynatemp-exp"}, "N", + format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_exponent = std::stof(value); + } + )); + add_opt(llama_arg( + {"--mirostat"}, "N", + format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), + [](gpt_params & params, int value) { + params.sparams.mirostat = value; + } + )); + add_opt(llama_arg( + {"--mirostat-lr"}, "N", + format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_eta = std::stof(value); + } + )); + add_opt(llama_arg( + {"--mirostat-ent"}, "N", + format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_tau = std::stof(value); + } + )); + add_opt(llama_arg( + {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", + "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", + [](gpt_params & params, const std::string & value) { + std::stringstream ss(value); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + params.sparams.logit_bias.push_back({key, bias}); + } else { + throw std::invalid_argument("invalid input format"); + } + } catch (const std::exception&) { + throw std::invalid_argument("invalid input format"); + } + } + )); + add_opt(llama_arg( + {"--grammar"}, "GRAMMAR", + format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = value; + } + )); + add_opt(llama_arg( + {"--grammar-file"}, "FNAME", + "file to read grammar from", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(params.sparams.grammar) + ); + } + )); + add_opt(llama_arg( + {"-j", "--json-schema"}, "SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", + [](gpt_params & params, const std::string & value) { + //params.sparams.grammar = json_schema_to_grammar(json::parse(value)); + } + )); + add_opt(llama_arg( + {"--pooling"}, "{none,mean,cls,last}", + "pooling type for embeddings, use model default if unspecified", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--attention"}, "{causal,non,causal}", + "attention type for embeddings, use model default if unspecified", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--rope-scaling"}, "{none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"--rope-scale"}, "N", + "RoPE context scaling factor, expands context by a factor of N", + [](gpt_params & params, const std::string & value) { + params.rope_freq_scale = 1.0f / std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-base"}, "N", + "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", + [](gpt_params & params, const std::string & value) { + params.rope_freq_base = std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-scale"}, "N", + "RoPE frequency scaling factor, expands context by a factor of 1/N", + [](gpt_params & params, const std::string & value) { + params.rope_freq_scale = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-orig-ctx"}, "N", + format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + [](gpt_params & params, int value) { + params.yarn_orig_ctx = value; + } + )); + add_opt(llama_arg( + {"--yarn-ext-factor"}, "N", + format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + [](gpt_params & params, const std::string & value) { + params.yarn_ext_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-attn-factor"}, "N", + format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + [](gpt_params & params, const std::string & value) { + params.yarn_attn_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-slow"}, "N", + format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + [](gpt_params & params, const std::string & value) { + params.yarn_beta_slow = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-fast"}, "N", + format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [](gpt_params & params, const std::string & value) { + params.yarn_beta_fast = std::stof(value); + } + )); + add_opt(llama_arg( + {"-gan", "--grp-attn-n"}, "N", + format("group-attention factor (default: %d)", params.grp_attn_n), + [](gpt_params & params, int value) { + params.grp_attn_n = value; + } + )); + add_opt(llama_arg( + {"-gaw", "--grp-attn-w"}, "N", + format("group-attention width (default: %.1f)", (double)params.grp_attn_w), + [](gpt_params & params, int value) { + params.grp_attn_w = value; + } + )); + add_opt(llama_arg( + {"-dkvc", "--dump-kv-cache"}, + "verbose print of the KV cache", + [](gpt_params & params) { + params.dump_kv_cache = true; + } + )); + add_opt(llama_arg( + {"-nkvo", "--no-kv-offload"}, + "disable KV offload", + [](gpt_params & params) { + params.no_kv_offload = true; + } + )); + add_opt(llama_arg( + {"-ctk", "--cache-type-k"}, "TYPE", + format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_k = value; + } + )); + add_opt(llama_arg( + {"-ctv", "--cache-type-v"}, "TYPE", + format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_v = value; + } + )); + add_opt(llama_arg( + {"--perplexity", "--all-logits"}, + format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + [](gpt_params & params) { + params.logits_all = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag"}, + "compute HellaSwag score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.hellaswag = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag-tasks"}, "N", + format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [](gpt_params & params, int value) { + params.hellaswag_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande"}, + "compute Winogrande score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.winogrande = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande-tasks"}, "N", + format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [](gpt_params & params, int value) { + params.winogrande_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice"}, + "compute multiple choice score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.multiple_choice = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice-tasks"}, "N", + format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + [](gpt_params & params, int value) { + params.multiple_choice_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--kl-divergence"}, + "computes KL-divergence to logits provided via --kl-divergence-base", + [](gpt_params & params) { + params.kl_divergence = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--save-all-logits", "--kl-divergence-base"}, "FNAME", + "set logits file", + [](gpt_params & params, const std::string & value) { + params.logits_file = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-stride"}, "N", + format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [](gpt_params & params, int value) { + params.ppl_stride = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-output-type"}, "<0|1>", + format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [](gpt_params & params, int value) { + params.ppl_output_type = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"-dt", "--defrag-thold"}, "N", + format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [](gpt_params & params, const std::string & value) { + params.defrag_thold = std::stof(value); + } + ).set_env("LLAMA_ARG_DEFRAG_THOLD")); + add_opt(llama_arg( + {"-np", "--parallel"}, "N", + format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](gpt_params & params, int value) { + params.n_parallel = value; + } + )); + add_opt(llama_arg( + {"-ns", "--sequences"}, "N", + format("number of sequences to decode (default: %d)", params.n_sequences), + [](gpt_params & params, int value) { + params.n_sequences = value; + } + )); + add_opt(llama_arg( + {"-cb", "--cont-batching"}, + format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](gpt_params & params) { + params.cont_batching = true; + } + ).set_env("LLAMA_ARG_CONT_BATCHING")); + add_opt(llama_arg( + {"-nocb", "--no-cont-batching"}, + "disable continuous batching", + [](gpt_params & params) { + params.cont_batching = false; + } + ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); + add_opt(llama_arg( + {"--mmproj"}, "FILE", + "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + [](gpt_params & params, const std::string & value) { + params.mmproj = value; + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); + add_opt(llama_arg( + {"--image"}, "FILE", + "path to an image file. use with multimodal models. Specify multiple times for batching", + [](gpt_params & params, const std::string & value) { + params.image.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); #ifdef GGML_USE_RPC - options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); + add_opt(llama_arg( + {"--rpc"}, "SERVERS", + "comma separated list of RPC servers", + [](gpt_params & params, const std::string & value) { + params.rpc_servers = value; + } + )); #endif + add_opt(llama_arg( + {"--mlock"}, + "force system to keep model in RAM rather than swapping or compressing", + [](gpt_params & params) { + params.use_mlock = true; + } + )); + add_opt(llama_arg( + {"--no-mmap"}, + "do not memory-map model (slower load but may reduce pageouts if not using mlock)", + [](gpt_params & params) { + params.use_mmap = false; + } + )); + add_opt(llama_arg( + {"--numa"}, "TYPE", + "attempt optimizations that help on some NUMA systems\n" + "- distribute: spread execution evenly over all nodes\n" + "- isolate: only spawn threads on CPUs on the node that execution started on\n" + "- numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", + "number of layers to store in VRAM", + [](gpt_params & params, int value) { + params.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_env("LLAMA_ARG_N_GPU_LAYERS")); + add_opt(llama_arg( + {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [](gpt_params & params, int value) { + params.n_gpu_layers_draft = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-sm", "--split-mode"}, "{none,layer,row}", + "how to split the model across multiple GPUs, one of:\n" + "- none: use one GPU only\n" + "- layer (default): split layers and KV across GPUs\n" + "- row: split rows across GPUs", + [](gpt_params & params, const std::string & value) { + std::string arg_next = value; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + throw std::invalid_argument("invalid value"); + } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + } + )); + add_opt(llama_arg( + {"-ts", "--tensor-split"}, "N0,N1,N2,...", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", + [](gpt_params & params, const std::string & value) { + std::string arg_next = value; - if (llama_supports_mlock()) { - options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); - } - if (llama_supports_mmap()) { - options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); - } - options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" - " - distribute: spread execution evenly over all nodes\n" - " - isolate: only spawn threads on CPUs on the node that execution started on\n" - " - numactl: use the CPU map provided by numactl\n" - "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437" }); - - if (llama_supports_gpu_offload()) { - options.push_back({ "*", "-ngl, --gpu-layers N", - "number of layers to store in VRAM" }); - options.push_back({ "*", "-ngld, --gpu-layers-draft N", - "number of layers to store in VRAM for the draft model" }); - options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", - "how to split the model across multiple GPUs, one of:\n" - " - none: use one GPU only\n" - " - layer (default): split layers and KV across GPUs\n" - " - row: split rows across GPUs" }); - options.push_back({ "*", "-ts, --tensor-split SPLIT", - "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" }); - options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n" - "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu }); - } - - options.push_back({ "model" }); - options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" }); - options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); - options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (can be repeated to use multiple adapters)" }); - options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "*", " --control-vector FNAME", "add a control vector\n" - "note: this argument can be repeated to add multiple control vectors" }); - options.push_back({ "*", " --control-vector-scaled FNAME SCALE", - "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors" }); - options.push_back({ "*", " --control-vector-layer-range START END", - "layer range to apply the control vector(s) to, start and end inclusive" }); - options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" - "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); - options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); - options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); - options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); - options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); - options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" }); - - options.push_back({ "retrieval" }); - options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); - options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size }); - options.push_back({ "retrieval", " --chunk-separator STRING", - "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); - - options.push_back({ "passkey" }); - options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); - options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); - - options.push_back({ "imatrix" }); - options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); - options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); - options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); - options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); - options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); - options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); - - options.push_back({ "bench" }); - options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" }); - options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); - options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); - options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); - - options.push_back({ "embedding" }); - options.push_back({ "embedding", " --embd-normalize", "normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize }); - options.push_back({ "embedding", " --embd-output-format", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix" }); - options.push_back({ "embedding", " --embd-separator", "separator of embendings (default \\n) for example \"<#sep#>\"" }); - - options.push_back({ "server" }); - options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); - options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); - options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); - options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" }); - options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); - options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); - options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); - options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); - options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); - options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http }); - options.push_back({ "server", " --system-prompt-file FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); - options.push_back({ "server", " --log-format {text,json}", - "log output format: json or text (default: json)" }); - options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" }); - options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" }); - options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" }); - options.push_back({ "server", " --chat-template JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "only commonly used templates are accepted:\n" - "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); - options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY", - "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); - options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"}); + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + throw std::invalid_argument( + format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + ); + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } else { + params.tensor_split[i] = 0.0f; + } + } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + } + )); + add_opt(llama_arg( + {"-mg", "--main-gpu"}, "INDEX", + format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + [](gpt_params & params, int value) { + params.main_gpu = value; +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + } + )); + add_opt(llama_arg( + {"--check-tensors"}, + format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + [](gpt_params & params) { + params.check_tensors = true; + } + )); + add_opt(llama_arg( + {"--override-kv"}, "KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + [](gpt_params & params, const std::string & value) { + if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + } + } + )); + add_opt(llama_arg( + {"--lora"}, "FNAME", + "path to LoRA adapter (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & value) { + params.lora_adapters.push_back({ std::string(value), 1.0 }); + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--lora-scaled"}, "FNAME", "SCALE", + "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.lora_adapters.push_back({ fname, std::stof(scale) }); + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--control-vector"}, "FNAME", + "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + [](gpt_params & params, const std::string & value) { + params.control_vectors.push_back({ 1.0f, value, }); + } + )); + add_opt(llama_arg( + {"--control-vector-scaled"}, "FNAME", "SCALE", + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.control_vectors.push_back({ std::stof(scale), fname }); + } + )); + add_opt(llama_arg( + {"--control-vector-layer-range"}, "START", "END", + "layer range to apply the control vector(s) to, start and end inclusive", + [](gpt_params & params, const std::string & start, const std::string & end) { + params.control_vector_layer_start = std::stoi(start); + params.control_vector_layer_end = std::stoi(end); + } + )); + add_opt(llama_arg( + {"-a", "--alias"}, "STRING", + "set alias for model name (to be used by REST API)", + [](gpt_params & params, const std::string & value) { + params.model_alias = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-m", "--model"}, "FNAME", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? std::string("model path from which to load base model") + : format( + "model path (default: `models/$filename` with filename from `--hf-file` " + "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH + ), + [](gpt_params & params, const std::string & value) { + params.model = value; + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); + add_opt(llama_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-mu", "--model-url"}, "MODEL_URL", + "model download url (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_url = value; + } + ).set_env("LLAMA_ARG_MODEL_URL")); + add_opt(llama_arg( + {"-hfr", "--hf-repo"}, "REPO", + "Hugging Face model repository (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO")); + add_opt(llama_arg( + {"-hff", "--hf-file"}, "FILE", + "Hugging Face model file (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(llama_arg( + {"-hft", "--hf-token"}, "TOKEN", + "Hugging Face access token (default: value from HF_TOKEN environment variable)", + [](gpt_params & params, const std::string & value) { + params.hf_token = value; + } + ).set_env("HF_TOKEN")); + add_opt(llama_arg( + {"--context-file"}, "FNAME", + "file to load context from (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.context_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-size"}, "N", + format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + [](gpt_params & params, int value) { + params.chunk_size = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-separator"}, "STRING", + format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [](gpt_params & params, const std::string & value) { + params.chunk_separator = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--junk"}, "N", + format("number of times to repeat the junk text (default: %d)", params.n_junk), + [](gpt_params & params, int value) { + params.n_junk = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"--pos"}, "N", + format("position of the passkey in the junk text (default: %d)", params.i_pos), + [](gpt_params & params, int value) { + params.i_pos = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"-o", "--output", "--output-file"}, "FNAME", + format("output file (default: '%s')", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? params.lora_outfile.c_str() + : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR + ? params.cvector_outfile.c_str() + : params.out_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.out_file = value; + params.cvector_outfile = value; + params.lora_outfile = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"-ofreq", "--output-frequency"}, "N", + format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + [](gpt_params & params, int value) { + params.n_out_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--save-frequency"}, "N", + format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + [](gpt_params & params, int value) { + params.n_save_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--process-output"}, + format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + [](gpt_params & params) { + params.process_output = true; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--no-ppl"}, + format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](gpt_params & params) { + params.compute_ppl = false; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--chunk", "--from-chunk"}, "N", + format("start processing the input from chunk N (default: %d)", params.i_chunk), + [](gpt_params & params, int value) { + params.i_chunk = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-pps"}, + format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), + [](gpt_params & params) { + params.is_pp_shared = true; + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npp"}, "n0,n1,...", + "number of prompt tokens", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-ntg"}, "n0,n1,...", + "number of text generation tokens", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npl"}, "n0,n1,...", + "number of parallel prompts", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"--embd-normalize"}, "N", + format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + [](gpt_params & params, int value) { + params.embd_normalize = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-output-format"}, "FORMAT", + "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", + [](gpt_params & params, const std::string & value) { + params.embd_out = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-separator"}, "STRING", + "separator of embendings (default \\n) for example \"<#sep#>\"", + [](gpt_params & params, const std::string & value) { + params.embd_sep = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--host"}, "HOST", + format("ip address to listen (default: %s)", params.hostname.c_str()), + [](gpt_params & params, const std::string & value) { + params.hostname = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); + add_opt(llama_arg( + {"--port"}, "PORT", + format("port to listen (default: %d)", params.port), + [](gpt_params & params, int value) { + params.port = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); + add_opt(llama_arg( + {"--path"}, "PATH", + format("path to serve static files from (default: %s)", params.public_path.c_str()), + [](gpt_params & params, const std::string & value) { + params.public_path = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--embedding", "--embeddings"}, + format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + [](gpt_params & params) { + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); + add_opt(llama_arg( + {"--api-key"}, "KEY", + "API key to use for authentication (default: none)", + [](gpt_params & params, const std::string & value) { + params.api_keys.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); + add_opt(llama_arg( + {"--api-key-file"}, "FNAME", + "path to file containing API keys (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream key_file(value); + if (!key_file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-key-file"}, "FNAME", + "path to file a PEM-encoded SSL private key", + [](gpt_params & params, const std::string & value) { + params.ssl_file_key = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-cert-file"}, "FNAME", + "path to file a PEM-encoded SSL certificate", + [](gpt_params & params, const std::string & value) { + params.ssl_file_cert = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-to", "--timeout"}, "N", + format("server read/write timeout in seconds (default: %d)", params.timeout_read), + [](gpt_params & params, int value) { + params.timeout_read = value; + params.timeout_write = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--threads-http"}, "N", + format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + [](gpt_params & params, int value) { + params.n_threads_http = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); + add_opt(llama_arg( + {"-spf", "--system-prompt-file"}, "FNAME", + "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--log-format"}, "{text, json}", + "log output format: json or text (default: json)", + [](gpt_params & params, const std::string & value) { + if (value == "json") { + params.log_json = true; + } else if (value == "text") { + params.log_json = false; + } else { + throw std::invalid_argument("invalid value"); + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--metrics"}, + format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_metrics = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + add_opt(llama_arg( + {"--no-slots"}, + format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_slots = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); + add_opt(llama_arg( + {"--slot-save-path"}, "PATH", + "path to save slot kv cache (default: disabled)", + [](gpt_params & params, const std::string & value) { + params.slot_save_path = value; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--chat-template"}, "JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + [](gpt_params & params, const std::string & value) { + if (!llama_chat_verify_template(value)) { + throw std::runtime_error(format( + "error: the supplied chat template is not supported: %s\n" + "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", + value.c_str() + )); + } + params.chat_template = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + add_opt(llama_arg( + {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", + format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [](gpt_params & params, const std::string & value) { + params.slot_prompt_similarity = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--lora-init-without-apply"}, + format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), + [](gpt_params & params) { + params.lora_init_without_apply = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--simple-io"}, + "use basic IO for better compatibility in subprocesses and limited consoles", + [](gpt_params & params) { + params.simple_io = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-ld", "--logdir"}, "LOGDIR", + "path under which to save YAML logs (no logging if unset)", + [](gpt_params & params, const std::string & value) { + params.logdir = value; + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + } + )); + add_opt(llama_arg( + {"--positive-file"}, "FNAME", + format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_positive_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--negative-file"}, "FNAME", + format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_negative_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-batch"}, "N", + format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [](gpt_params & params, int value) { + params.n_pca_batch = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-iter"}, "N", + format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [](gpt_params & params, int value) { + params.n_pca_iterations = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--method"}, "{pca, mean}", + "dimensionality reduction method to be used (default: pca)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--output-format"}, "{md,jsonl}", + "output format for batched-bench results (default: md)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } + else if (value == "md") { params.batched_bench_output_jsonl = false; } + else { std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); #ifndef LOG_DISABLE_LOGS - options.push_back({ "logging" }); - options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" }); - options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" }); - options.push_back({ "logging", " --log-test", "Run simple logging test" }); - options.push_back({ "logging", " --log-disable", "Disable trace logs" }); - options.push_back({ "logging", " --log-enable", "Enable trace logs" }); - options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" }); - options.push_back({ "logging", " --log-new", "Create a separate new log file on start. " - "Each log file will have unique name: \"..log\"" }); - options.push_back({ "logging", " --log-append", "Don't truncate the old log file." }); + // TODO: make this looks less weird + add_opt(llama_arg( + {"--log-test"}, + "Log test", + [](gpt_params &) { log_param_single_parse("--log-test"); } + )); + add_opt(llama_arg( + {"--log-disable"}, + "Log disable", + [](gpt_params &) { log_param_single_parse("--log-disable"); } + )); + add_opt(llama_arg( + {"--log-enable"}, + "Log enable", + [](gpt_params &) { log_param_single_parse("--log-enable"); } + )); + add_opt(llama_arg( + {"--log-new"}, + "Log new", + [](gpt_params &) { log_param_single_parse("--log-new"); } + )); + add_opt(llama_arg( + {"--log-append"}, + "Log append", + [](gpt_params &) { log_param_single_parse("--log-append"); } + )); + add_opt(llama_arg( + {"--log-file"}, "FNAME", + "Log file", + [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } + )); #endif // LOG_DISABLE_LOGS - options.push_back({ "cvector" }); - options.push_back({ "cvector", "-o, --output FNAME", "output file (default: '%s')", params.cvector_outfile.c_str() }); - options.push_back({ "cvector", " --positive-file FNAME", "positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str() }); - options.push_back({ "cvector", " --negative-file FNAME", "negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str() }); - options.push_back({ "cvector", " --pca-batch N", "batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch }); - options.push_back({ "cvector", " --pca-iter N", "number of iterations used for PCA (default: %d)", params.n_pca_iterations }); - options.push_back({ "cvector", " --method {pca,mean}", "dimensionality reduction method to be used (default: pca)" }); - - options.push_back({ "export-lora" }); - options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() }); - options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" }); - options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" }); - options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() }); - - options.push_back({ "batched-bench" }); - options.push_back({ "batched-bench", " --output-format {md,jsonl}", "output format for batched-bench results (default: md)" }); - - printf("usage: %s [options]\n", argv[0]); - - for (const auto & o : options) { - if (!o.grp.empty()) { - printf("\n%s:\n\n", o.grp.c_str()); - continue; - } - printf(" %-32s", o.args.c_str()); - if (o.args.length() > 30) { - printf("\n%34s", ""); - } - - const auto desc = o.desc; - size_t start = 0; - size_t end = desc.find('\n'); - while (end != std::string::npos) { - printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); - start = end + 1; - end = desc.find('\n', start); - } - - printf("%s\n", desc.substr(start).c_str()); - } - printf("\n"); + return options; } std::string gpt_params_get_system_info(const gpt_params & params) { @@ -2515,10 +2734,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { llama_token bos = llama_token_bos(model); llama_token eos = llama_token_eos(model); // some models (e.g. T5) don't have a BOS token - if (bos != -1) { + if (bos != LLAMA_TOKEN_NULL) { tmp.push_back(bos); } - tmp.push_back(eos); + if (eos != LLAMA_TOKEN_NULL) { + tmp.push_back(eos); + } + if (tmp.empty()) { + tmp.push_back(0); + } if (llama_model_has_encoder(model)) { llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0)); diff --git a/common/common.h b/common/common.h index 224528d48..480e597a8 100644 --- a/common/common.h +++ b/common/common.h @@ -14,8 +14,10 @@ #include #include #include +#include #include #include +#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -57,6 +59,25 @@ int32_t cpu_get_num_math(); // CLI argument parsing // +enum llama_example { + LLAMA_EXAMPLE_COMMON, + LLAMA_EXAMPLE_SPECULATIVE, + LLAMA_EXAMPLE_MAIN, + LLAMA_EXAMPLE_INFILL, + LLAMA_EXAMPLE_EMBEDDING, + LLAMA_EXAMPLE_PERPLEXITY, + LLAMA_EXAMPLE_RETRIEVAL, + LLAMA_EXAMPLE_PASSKEY, + LLAMA_EXAMPLE_IMATRIX, + LLAMA_EXAMPLE_BENCH, + LLAMA_EXAMPLE_SERVER, + LLAMA_EXAMPLE_CVECTOR_GENERATOR, + LLAMA_EXAMPLE_EXPORT_LORA, + LLAMA_EXAMPLE_LLAVA, + + LLAMA_EXAMPLE_COUNT, +}; + // dimensionality reduction methods, used by cvector-generator enum dimre_method { DIMRE_METHOD_PCA, @@ -73,6 +94,8 @@ struct cpu_params { }; struct gpt_params { + enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON; + uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed int32_t n_predict = -1; // new tokens to predict @@ -193,6 +216,7 @@ struct gpt_params { bool kl_divergence = false; // compute KL divergence + std::function print_usage = nullptr; // print example-specific usage and example bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs bool special = false; // enable special token output @@ -214,7 +238,6 @@ struct gpt_params { bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation - bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading bool warmup = true; // warmup run @@ -303,13 +326,91 @@ struct gpt_params { bool batched_bench_output_jsonl = false; }; -void gpt_params_parse_from_env(gpt_params & params); -void gpt_params_handle_model_default(gpt_params & params); +struct llama_arg { + std::set examples = {LLAMA_EXAMPLE_COMMON}; + std::vector args; + const char * value_hint = nullptr; // help text or example for arg value + const char * value_hint_2 = nullptr; // for second arg value + const char * env = nullptr; + std::string help; + void (*handler_void) (gpt_params & params) = nullptr; + void (*handler_string) (gpt_params & params, const std::string &) = nullptr; + void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (gpt_params & params, int) = nullptr; -bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params); -bool gpt_params_parse (int argc, char ** argv, gpt_params & params); -bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); -void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &) + ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, int) + ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + + llama_arg( + const std::initializer_list & args, + const std::string & help, + void (*handler)(gpt_params & params) + ) : args(args), help(help), handler_void(handler) {} + + // support 2 values for arg + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const char * value_hint_2, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &, const std::string &) + ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + + llama_arg & set_examples(std::initializer_list examples) { + this->examples = std::move(examples); + return *this; + } + + llama_arg & set_env(const char * env) { + help = help + "\n(env: " + env + ")"; + this->env = env; + return *this; + } + + bool in_example(enum llama_example ex) { + return examples.find(ex) != examples.end(); + } + + bool get_value_from_env(std::string & output) const { + if (env == nullptr) return false; + char * value = std::getenv(env); + if (value) { + output = value; + return true; + } + return false; + } + + bool has_value_from_env() const { + return env != nullptr && std::getenv(env); + } + + std::string to_string(); +}; + +// initialize list of options (arguments) that can be used by the current example +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); +// optionally, we can provide "print_usage" to print example usage +std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage); + +// parse input arguments from CLI +// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) +bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); +bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options); + +// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set +void gpt_params_print_usage(gpt_params & params, std::vector & options); std::string gpt_params_get_system_info(const gpt_params & params); diff --git a/common/sampling.cpp b/common/sampling.cpp index c81b4d233..7806b77e0 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -145,7 +145,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st /* .params = */ params, /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"), /* .chain = */ llama_sampler_chain_init(lparams), - /* .prev = */ ring_buffer(params.n_prev), + /* .prev = */ ring_buffer(std::max(32, params.n_prev)), /* .cur = */ {}, /* .cur_p = */ {}, }; diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index b043c74cc..f3b0c433b 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -28,9 +28,7 @@ static std::vector parse_list(char * p) { return ret; } -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); LOG_TEE("\n"); @@ -39,8 +37,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index f321f6104..f5f309022 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -6,9 +6,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); LOG_TEE("\n"); @@ -20,8 +18,8 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index a68268388..0795175a1 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -35,9 +35,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { return ret; } -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]); printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]); @@ -390,8 +388,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index ebe58b13b..7a141f953 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -80,8 +80,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index aea15c864..881111ffd 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -144,8 +144,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 8df457e21..544e7fff6 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -391,9 +391,7 @@ struct lora_merge_ctx { } }; -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { printf("\nexample usage:\n"); printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]); printf("\nNOTE: output model is F16\n"); @@ -403,8 +401,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/gen-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt new file mode 100644 index 000000000..c94cda776 --- /dev/null +++ b/examples/gen-docs/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-gen-docs) +add_executable(${TARGET} gen-docs.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp new file mode 100644 index 000000000..8b1dafd63 --- /dev/null +++ b/examples/gen-docs/gen-docs.cpp @@ -0,0 +1,51 @@ +#include "common.h" + +#include +#include + +// Export usage message (-h) to markdown format + +static void export_md(std::string fname, llama_example ex) { + std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); + + gpt_params params; + auto options = gpt_params_parser_init(params, ex); + + file << "| Argument | Explanation |\n"; + file << "| -------- | ----------- |\n"; + for (auto & opt : options) { + file << "| `"; + // args + for (const auto & arg : opt.args) { + if (arg == opt.args.front()) { + file << arg; + if (opt.args.size() > 1) file << ", "; + } else { + file << arg << (arg != opt.args.back() ? ", " : ""); + } + } + // value hint + if (opt.value_hint) { + std::string md_value_hint(opt.value_hint); + string_replace_all(md_value_hint, "|", "\\|"); + file << " " << md_value_hint; + } + if (opt.value_hint_2) { + std::string md_value_hint_2(opt.value_hint_2); + string_replace_all(md_value_hint_2, "|", "\\|"); + file << " " << md_value_hint_2; + } + // help text + std::string md_help(opt.help); + string_replace_all(md_help, "\n", "
"); + string_replace_all(md_help, "|", "\\|"); + file << "` | " << md_help << " |\n"; + } +} + +int main(int, char **) { + export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN); + export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); + + return 0; +} diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 4e801c69d..e1efbf573 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -154,8 +154,8 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 635bd9849..c310c229a 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -18,9 +18,7 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s \\\n" " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" @@ -580,8 +578,8 @@ int main(int argc, char ** argv) { params.logits_all = true; params.verbosity = 1; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 921793751..06ec160c2 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( return env->NewStringUTF(result.str().c_str()); } -extern "C" -JNIEXPORT void JNICALL -Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { - llama_batch_free(*reinterpret_cast(batch_pointer)); -} - extern "C" JNIEXPORT jlong JNICALL Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { @@ -311,6 +305,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, return reinterpret_cast(batch); } +extern "C" +JNIEXPORT void JNICALL +Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { + llama_batch_free(*reinterpret_cast(batch_pointer)); +} + +extern "C" +JNIEXPORT jlong JNICALL +Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) { + auto sparams = llama_sampler_chain_default_params(); + sparams.no_perf = true; + llama_sampler * smpl = llama_sampler_chain_init(sparams); + llama_sampler_chain_add(smpl, llama_sampler_init_greedy()); + + return reinterpret_cast(smpl); +} + +extern "C" +JNIEXPORT void JNICALL +Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) { + llama_sampler_free(reinterpret_cast(sampler_pointer)); +} + extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) { @@ -380,14 +397,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( JNIEnv * env, jobject, jlong context_pointer, - jlong sampling_pointer, jlong batch_pointer, + jlong sampler_pointer, jint n_len, jobject intvar_ncur ) { const auto context = reinterpret_cast(context_pointer); - const auto sampling = reinterpret_cast(sampling_pointer); - const auto batch = reinterpret_cast(batch_pointer); + const auto batch = reinterpret_cast(batch_pointer); + const auto sampler = reinterpret_cast(sampler_pointer); const auto model = llama_get_model(context); if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur); @@ -395,9 +412,9 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V"); // sample the most likely token - const auto new_token_id = llama_sampler_sample(sampling, context, batch->n_tokens - 1); + const auto new_token_id = llama_sampler_sample(sampler, context, -1); - llama_sampler_accept(sampling, new_token_id); + llama_sampler_accept(sampler, new_token_id); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { diff --git a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt index 6c63e54e0..cf520e459 100644 --- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt +++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt @@ -45,8 +45,10 @@ class LLamaAndroid { private external fun free_context(context: Long) private external fun backend_init(numa: Boolean) private external fun backend_free() - private external fun free_batch(batch: Long) private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long + private external fun free_batch(batch: Long) + private external fun new_sampler(): Long + private external fun free_sampler(sampler: Long) private external fun bench_model( context: Long, model: Long, @@ -69,6 +71,7 @@ class LLamaAndroid { private external fun completion_loop( context: Long, batch: Long, + sampler: Long, nLen: Int, ncur: IntVar ): String? @@ -101,8 +104,11 @@ class LLamaAndroid { val batch = new_batch(512, 0, 1) if (batch == 0L) throw IllegalStateException("new_batch() failed") + val sampler = new_sampler() + if (sampler == 0L) throw IllegalStateException("new_sampler() failed") + Log.i(tag, "Loaded model $pathToModel") - threadLocalState.set(State.Loaded(model, context, batch)) + threadLocalState.set(State.Loaded(model, context, batch, sampler)) } else -> throw IllegalStateException("Model already loaded") } @@ -114,7 +120,7 @@ class LLamaAndroid { is State.Loaded -> { val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) while (ncur.value <= nlen) { - val str = completion_loop(state.context, state.batch, nlen, ncur) + val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur) if (str == null) { break } @@ -138,6 +144,7 @@ class LLamaAndroid { free_context(state.context) free_model(state.model) free_batch(state.batch) + free_sampler(state.sampler); threadLocalState.set(State.Idle) } @@ -161,7 +168,7 @@ class LLamaAndroid { private sealed interface State { data object Idle: State - data class Loaded(val model: Long, val context: Long, val batch: Long): State + data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State } // Enforce only one instance of Llm. diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 4d7ccc91f..5845d0106 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -112,9 +112,7 @@ struct llava_context { struct llama_model * model = NULL; }; -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\n example usage:\n"); LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); @@ -280,8 +278,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } @@ -293,7 +291,7 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - print_usage(argc, argv, {}); + print_usage(argc, argv); return 1; } auto model = llava_init(¶ms); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 237da9429..57e7d42c5 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -253,8 +253,8 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - show_additional_info(argc, argv); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } @@ -266,7 +266,6 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty())) { - gpt_params_print_usage(argc, argv, params); show_additional_info(argc, argv); return 1; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index c2e931c65..5027a483a 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -36,8 +36,8 @@ struct ngram_container { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 5f04709f5..795b06c88 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -13,8 +13,8 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 400f3e0b0..93299ef8b 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -15,8 +15,8 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 071400b7e..9ac7f6b47 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -12,8 +12,8 @@ int main(int argc, char ** argv){ gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7fe673bd6..f443360eb 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -42,6 +42,13 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; +static void print_usage(int, char ** argv) { + printf("\nexample usage:\n"); + printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + printf("\n"); +} + static bool file_exists(const std::string & path) { std::ifstream f(path.c_str()); return f.good(); @@ -132,9 +139,9 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); LOG_TEE("\n"); @@ -21,8 +19,8 @@ int main(int argc, char ** argv) { params.n_keep = 32; params.i_pos = -1; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index b606e5464..420dbb813 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1968,8 +1968,8 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67..704f0d56b 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis Several quantization methods are supported. They differ in the resulting model disk size and inference speed. +The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format. + *(outdated)* | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 7eb947650..dd8a82e6e 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -4,9 +4,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); LOG_TEE("\n"); @@ -113,8 +111,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index d86292b22..9e3fad60a 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -11,8 +11,8 @@ int main(int argc, char ** argv) { params.prompt = "The quick brown fox"; params.sparams.seed = 1234; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 95f3706e1..d3dce1294 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2424,14 +2424,11 @@ int main(int argc, char ** argv) { // own arguments required by this example gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } - // parse arguments from environment variables - gpt_params_parse_from_env(params); - // TODO: not great to use extern vars server_log_json = params.log_json; server_verbose = params.verbosity > 0; diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index 6f163ce04..e1eade6cd 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -9,8 +9,11 @@ Feature: llama.cpp server And a model alias bert-bge-small And 42 as server seed And 2 slots - And 1024 as batch size - And 1024 as ubatch size + # the bert-bge-small model has context size of 512 + # since the generated prompts are as big as the batch size, we need to set the batch size to 512 + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20 + And 512 as batch size + And 512 as ubatch size And 2048 KV cache size And embeddings extraction Then the server is starting diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 8a0ad43ad..a53cef547 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -6,9 +6,7 @@ #include #include -static void print_usage(int argc, char ** argv, const gpt_params & params) { - gpt_params_print_usage(argc, argv, params); - +static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); LOG_TEE("\n"); @@ -20,8 +18,8 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - if (!gpt_params_parse(argc, argv, params)) { - print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 9d1c3f662..8df587fee 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -29,8 +29,8 @@ struct seq_draft { int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE); + if (!gpt_params_parse(argc, argv, params, options)) { return 1; } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 9b0e32718..44b6eabd4 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -687,8 +687,8 @@ extern "C" { struct ggml_hash_set { size_t size; - ggml_bitset_t * used; - struct ggml_tensor ** keys; + ggml_bitset_t * used; // whether or not the keys are in use i.e. set + struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) }; // computation graph @@ -1278,7 +1278,7 @@ extern "C" { size_t nb1, size_t nb2, size_t nb3, - size_t offset); + size_t offset); // in bytes // b -> view(a,offset,nb1,nb2,3), return view(a) GGML_API struct ggml_tensor * ggml_set_inplace( @@ -1288,19 +1288,19 @@ extern "C" { size_t nb1, size_t nb2, size_t nb3, - size_t offset); + size_t offset); // in bytes GGML_API struct ggml_tensor * ggml_set_1d( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - size_t offset); + size_t offset); // in bytes GGML_API struct ggml_tensor * ggml_set_1d_inplace( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - size_t offset); + size_t offset); // in bytes // b -> view(a,offset,nb1,nb2,3), return modified a GGML_API struct ggml_tensor * ggml_set_2d( @@ -1308,7 +1308,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, - size_t offset); + size_t offset); // in bytes // b -> view(a,offset,nb1,nb2,3), return view(a) GGML_API struct ggml_tensor * ggml_set_2d_inplace( @@ -1316,7 +1316,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, - size_t offset); + size_t offset); // in bytes // a -> b, return view(b) GGML_API struct ggml_tensor * ggml_cpy( diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index 96b93d2d8..b5d9301a7 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -827,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; + case GGML_OP_ROPE_BACK: + return op->src[2] == NULL && (op->op_params[2] & 4) == 0; + case GGML_OP_IM2COL_BACK: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; default: return true; } diff --git a/ggml/src/ggml-cann/Doxyfile b/ggml/src/ggml-cann/Doxyfile index 2b009e8f9..3290a4859 100644 --- a/ggml/src/ggml-cann/Doxyfile +++ b/ggml/src/ggml-cann/Doxyfile @@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8 # title of most generated pages and in a few other places. # The default value is: My Project. -PROJECT_NAME = "llama.cpp" +PROJECT_NAME = "ggml" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version @@ -44,7 +44,7 @@ PROJECT_NUMBER = # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. -PROJECT_BRIEF = "llama inference engine" +PROJECT_BRIEF = "Tensor library for machine learning" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index faf301964..bdd98487f 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -29,6 +29,7 @@ bool g_mul_mat_q = false; #include "ggml-cuda/rope.cuh" #include "ggml-cuda/scale.cuh" #include "ggml-cuda/softmax.cuh" +#include "ggml-cuda/sum.cuh" #include "ggml-cuda/sumrows.cuh" #include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/unary.cuh" @@ -2184,6 +2185,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ggml_cuda_dup(ctx, dst); break; case GGML_OP_ADD: + case GGML_OP_ADD1: // TODO: more efficient implementation ggml_cuda_op_add(ctx, dst); break; case GGML_OP_SUB: @@ -2200,6 +2202,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg break; case GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_NEG: + ggml_cuda_op_neg(ctx, dst); + break; case GGML_UNARY_OP_GELU: ggml_cuda_op_gelu(ctx, dst); break; @@ -2308,6 +2313,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_POOL_2D: ggml_cuda_op_pool2d(ctx, dst); break; + case GGML_OP_SUM: + ggml_cuda_op_sum(ctx, dst); + break; case GGML_OP_SUM_ROWS: ggml_cuda_op_sum_rows(ctx, dst); break; @@ -2752,6 +2760,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_NEG: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: @@ -2881,6 +2890,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_TRANSPOSE: case GGML_OP_NORM: case GGML_OP_ADD: + case GGML_OP_ADD1: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: @@ -2891,14 +2901,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: + return true; case GGML_OP_CONT: + return op->src[0]->type != GGML_TYPE_BF16; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: return true; case GGML_OP_ROPE: return ggml_is_contiguous(op->src[0]); case GGML_OP_IM2COL: + return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_2D: + case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: diff --git a/ggml/src/ggml-cuda/cross-entropy-loss.cu b/ggml/src/ggml-cuda/cross-entropy-loss.cu index a14043e70..5575a90f6 100644 --- a/ggml/src/ggml-cuda/cross-entropy-loss.cu +++ b/ggml/src/ggml-cuda/cross-entropy-loss.cu @@ -1,6 +1,6 @@ #include "common.cuh" #include "cross-entropy-loss.cuh" -#include "sumrows.cuh" +#include "sum.cuh" #include #include @@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * cross_entropy_loss_f32<<>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows); // Combine results from individual blocks: - sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream); + sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream); } diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index f87f33b3e..f28a19d40 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -152,7 +152,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g } \ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_tensor * Q = dst->src[1]; + ggml_tensor * Q = dst->src[0]; ggml_tensor * K = dst->src[1]; ggml_tensor * V = dst->src[2]; @@ -227,7 +227,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg } \ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - ggml_tensor * Q = dst->src[1]; + ggml_tensor * Q = dst->src[0]; ggml_tensor * K = dst->src[1]; ggml_tensor * V = dst->src[2]; diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu new file mode 100644 index 000000000..0d5e953ee --- /dev/null +++ b/ggml/src/ggml-cuda/sum.cu @@ -0,0 +1,41 @@ +#include "sumrows.cuh" +#include "sum.cuh" + +#include + +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +#include +using namespace cub; +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + +void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) { +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + size_t tmp_size = 0; + DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream); + ggml_cuda_pool_alloc tmp_alloc(pool, tmp_size); + DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream); +#else + // Use (inefficient) sum_rows implementation as a fallback. + // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14. + sum_rows_f32_cuda(x, dst, ne, 1, stream); + GGML_UNUSED(pool); +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +} + +void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + const float * src0_d = (const float *) src0->data; + float * dst_d = (float *) dst->data; + + const int64_t ne = ggml_nelements(src0); + + ggml_cuda_pool & pool = ctx.pool(); + cudaStream_t stream = ctx.stream(); + + sum_f32_cuda(pool, src0_d, dst_d, ne, stream); +} diff --git a/ggml/src/ggml-cuda/sum.cuh b/ggml/src/ggml-cuda/sum.cuh new file mode 100644 index 000000000..8cadc3736 --- /dev/null +++ b/ggml/src/ggml-cuda/sum.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream); + +void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 89abfc21d..8ac669f94 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -1,5 +1,15 @@ #include "unary.cuh" +static __global__ void neg_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = -x[i]; +} + static __global__ void gelu_f32(const float * x, float * dst, const int k) { const float GELU_COEF_A = 0.044715f; const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; @@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) { dst[i] = cosf(x[i]); } +static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE; + neg_f32<<>>(x, dst, k); +} + static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; gelu_f32<<>>(x, dst, k); @@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t cos_f32<<>>(x, dst, k); } +void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous(src0)); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *)src0->data; diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index c610e996a..ed2ffc461 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -1,5 +1,6 @@ #include "common.cuh" +#define CUDA_NEG_BLOCK_SIZE 256 #define CUDA_GELU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_TANH_BLOCK_SIZE 256 @@ -12,6 +13,8 @@ #define CUDA_SIN_BLOCK_SIZE 256 #define CUDA_COS_BLOCK_SIZE 256 +void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index cf004c02c..fcf08fbdc 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx return ctx->support_simdgroup_reduction; case GGML_OP_NORM: case GGML_OP_ROPE: - case GGML_OP_IM2COL: return true; + case GGML_OP_IM2COL: + return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_1D: case GGML_OP_POOL_2D: return false; diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 0d884f89a..4f03b01e7 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -1954,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool { SYCL_CHECK( CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device( look_ahead_size, *qptr))); + if (!ptr) { + fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size); + return nullptr; + } + *actual_size = look_ahead_size; pool_size += look_ahead_size; @@ -4350,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, void * dev_ptr; SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( size, *stream))); + if (!dev_ptr) { + fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size); + return nullptr; + } ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); } @@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, */ SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device( size, *stream))); - + if (!buf) { + char err_buf[1024]; + snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size); + throw std::runtime_error(err_buf); + } // set padding to 0 to avoid possible NaN values if (size > original_size) { /* diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index f98dbb37b..52cf83426 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) { if (ctx->seqs.empty()) { + if (fence) { + ctx->q->queue.submit({}, fence); + } return; } VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")"); @@ -4616,7 +4619,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const }, dryrun); } -static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4626,10 +4629,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, - }); + }, dryrun); } -static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { +static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); @@ -4639,7 +4642,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, - }); + }, dryrun); } static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { @@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } } -static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence); + +// Returns true if node has enqueued work into the queue, false otherwise +// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. +static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; if (ggml_is_empty(node) || extra == nullptr) { - return; + return false; } VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); @@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NONE: - return; + return false; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: @@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_UNARY_OP_TANH: break; default: - return; + return false; } break; case GGML_OP_REPEAT: @@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; GGML_ABORT("fatal error"); - return; + return false; } vk_context compute_ctx; @@ -5783,11 +5790,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; case GGML_OP_SIN: - ggml_vk_sin(ctx, compute_ctx, src0, node); + ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_COS: - ggml_vk_cos(ctx, compute_ctx, src0, node); + ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun); break; case GGML_OP_CLAMP: @@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun); break; default: - return; + return false; } break; case GGML_OP_DIAG_MASK_INF: @@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; default: - return; + return false; } if (dryrun) { - return; + return false; } ctx->tensor_ctxs[node_idx] = compute_ctx; @@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod last_node = true; #endif - if (last_node) { + if (submit || last_node) { ggml_vk_ctx_end(compute_ctx); - compute_ctx->exit_tensor_idx = node_idx; + + // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward + if (last_node) { + compute_ctx->exit_tensor_idx = node_idx_begin; + } + else { + compute_ctx->exit_tensor_idx = -1; + } + ctx->compute_ctx.reset(); + + bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false); + if (!ok) { + if (node->op == GGML_OP_UNARY) { + std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; + } + else { + std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; + } + } + } + return true; } -static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){ ggml_tensor_extra_gpu * extra = nullptr; switch (tensor->op) { @@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); -#ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(tensor); -#endif - vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock(); -#ifdef GGML_VULKAN_PERF - std::chrono::steady_clock::time_point start; -#endif // GGML_VULKAN_PERF + // always wait for the GPU work to be done for the last submit + if (tensor_idx == subctx->exit_tensor_idx) { + use_fence = true; + } // Only run if ctx hasn't been submitted yet if (!subctx->seqs.empty()) { +#ifdef GGML_VULKAN_CHECK_RESULTS + ggml_vk_check_results_0(tensor); + use_fence = true; +#endif + // Do staging buffer copies for (auto& cpy : subctx->in_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } -#ifdef GGML_VULKAN_PERF - start = std::chrono::steady_clock::now(); -#endif // GGML_VULKAN_PERF + ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{}); - ggml_vk_submit(subctx, ctx->fence); + if (use_fence) { + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); + + ctx->device->device.resetFences({ ctx->fence }); + } +#ifdef GGML_VULKAN_CHECK_RESULTS + ggml_vk_check_results_1(tensor); +#endif } if (tensor_idx == subctx->exit_tensor_idx) { - VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - -#ifdef GGML_VULKAN_PERF - auto duration = std::chrono::duration_cast(std::chrono::steady_clock::now() - start); - ctx->device->perf_logger->log_timing(tensor, duration.count()); -#endif // GGML_VULKAN_PERF - - ctx->device->device.resetFences({ ctx->fence }); - // Do staging buffer copies for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); @@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true); + ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false); } ggml_vk_preallocate_buffers(ctx); ggml_pipeline_allocate_descriptor_sets(ctx->device); @@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen // Reserve tensor context space for all nodes ctx->tensor_ctxs.resize(cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false); - } + bool first_node_in_batch = true; // true if next node will be first node in a batch + int submit_node_idx = 0; // index to first node in a batch + // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution + constexpr int submit_count = 100; + int submitted_nodes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - - if (ggml_vk_is_empty(node)) { - continue; + if (first_node_in_batch) { + submit_node_idx = i; } - bool ok = ggml_vk_compute_forward(ctx, node, i); - if (!ok) { - if (node->op == GGML_OP_UNARY) { - std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; - } else { - std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; + bool submit = (submitted_nodes >= submit_count) || (i == last_node); + + + bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit); + + if (enqueued) { + ++submitted_nodes; + +#ifndef GGML_VULKAN_CHECK_RESULTS + if (first_node_in_batch) { + first_node_in_batch = false; } - } -#ifdef GGML_VULKAN_CHECK_RESULTS - else { - ggml_vk_check_results_1(node); - } #endif - GGML_ASSERT(ok); + } + + if (submit) { + first_node_in_batch = true; + submitted_nodes = 0; + } } #ifdef GGML_VULKAN_PERF @@ -6602,6 +6632,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const return false; } } break; + case GGML_OP_CONT: case GGML_OP_CPY: case GGML_OP_DUP: { @@ -6642,7 +6673,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_PAD: - case GGML_OP_CONT: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3e0e6bccd..85eea1a3a 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5291,6 +5291,7 @@ struct ggml_tensor * ggml_concat( bool is_node = false; if (a->grad || b->grad) { + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -5412,6 +5413,7 @@ struct ggml_tensor * ggml_leaky_relu( bool is_node = false; if (!inplace && (a->grad)) { + GGML_ABORT("fatal error"); // TODO: not implemented is_node = true; } @@ -5850,6 +5852,7 @@ static struct ggml_tensor * ggml_set_impl( // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + GGML_ASSERT(offset < (size_t)(1 << 30)); int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; ggml_set_op_params(result, params, sizeof(params)); @@ -6807,14 +6810,12 @@ struct ggml_tensor * ggml_rope_back( GGML_ASSERT(ggml_is_vector(b)); GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(a->ne[2] == b->ne[0]); - GGML_ASSERT(c == NULL && "freq factors not implemented yet"); - - GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet"); bool is_node = false; if (a->grad) { - is_node = false; // TODO: implement backward + GGML_ASSERT(false && "backwards pass not implemented"); + is_node = false; } struct ggml_tensor * result = ggml_dup_tensor(ctx, a); @@ -6832,6 +6833,7 @@ struct ggml_tensor * ggml_rope_back( result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; + result->src[2] = c; return result; } @@ -7385,6 +7387,11 @@ struct ggml_tensor * ggml_argsort( enum ggml_sort_order order) { bool is_node = false; + if (a->grad) { + GGML_ABORT("fatal error"); // TODO: not implemented + is_node = true; + } + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); @@ -8346,8 +8353,7 @@ static void ggml_compute_forward_dup_same_cont( GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); GGML_ASSERT(src0->type == dst->type); - const size_t nb00 = src0->nb[0]; - const size_t nb0 = dst->nb[0]; + const size_t nb0 = ggml_type_size(src0->type); const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -8361,8 +8367,8 @@ static void ggml_compute_forward_dup_same_cont( if (ie0 < ie1) { memcpy( ((char *) dst->data + ie0*nb0), - ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * ggml_type_size(src0->type)); + ((char *) src0->data + ie0*nb0), + (ie1 - ie0) * nb0); } } @@ -8379,11 +8385,6 @@ static void ggml_compute_forward_dup_f16( const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, dst); - return; - } - // parallelize by rows const int nr = ne01; // number of rows per thread @@ -8648,11 +8649,6 @@ static void ggml_compute_forward_dup_bf16( const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, dst); - return; - } - // parallelize by rows const int nr = ne01; // number of rows per thread @@ -9004,11 +9000,6 @@ static void ggml_compute_forward_dup_f32( const int ith = params->ith; // thread index const int nth = params->nth; // number of threads - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - ggml_compute_forward_dup_same_cont(params, dst); - return; - } - // parallelize by rows const int nr = ne01; // number of rows per thread @@ -9318,13 +9309,13 @@ static void ggml_compute_forward_dup_bytes( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(src0->type == dst->type); + GGML_TENSOR_UNARY_OP_LOCALS; + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { ggml_compute_forward_dup_same_cont(params, dst); return; } - GGML_TENSOR_UNARY_OP_LOCALS; - const size_t type_size = ggml_type_size(src0->type); const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -11015,9 +11006,6 @@ static void ggml_compute_forward_sum_f32( return; } - assert(ggml_is_scalar(dst)); - - assert(ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); @@ -13775,7 +13763,7 @@ static void ggml_compute_forward_get_rows_q( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - assert(i01 >= 0 && i01 < ne01); + GGML_ASSERT(i01 >= 0 && i01 < ne01); dequantize_row_q( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), @@ -13816,7 +13804,7 @@ static void ggml_compute_forward_get_rows_f16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - assert(i01 >= 0 && i01 < ne01); + GGML_ASSERT(i01 >= 0 && i01 < ne01); ggml_fp16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), @@ -13857,7 +13845,7 @@ static void ggml_compute_forward_get_rows_bf16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - assert(i01 >= 0 && i01 < ne01); + GGML_ASSERT(i01 >= 0 && i01 < ne01); ggml_bf16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), @@ -13898,7 +13886,7 @@ static void ggml_compute_forward_get_rows_f32( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - assert(i01 >= 0 && i01 < ne01); + GGML_ASSERT(i01 >= 0 && i01 < ne01); ggml_vec_cpy_f32(nc, (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), @@ -18426,14 +18414,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad || src1->grad) { GGML_ASSERT(src0->type == tensor->type); GGML_ASSERT(tensor->grad->type == tensor->type); - GGML_ASSERT(tensor->grad->type == src1->grad->type); + GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type); tensor_grad_view = ggml_view_4d(ctx, - tensor->grad, - src1->grad->ne[0], - src1->grad->ne[1], - src1->grad->ne[2], - src1->grad->ne[3], + tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], nb1, nb2, nb3, offset); } @@ -18502,9 +18486,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor memcpy(&offset, tensor->op_params, sizeof(offset)); - size_t nb1 = tensor->nb[1]; - size_t nb2 = tensor->nb[2]; - size_t nb3 = tensor->nb[3]; + size_t nb1 = tensor->nb[1]; + size_t nb2 = tensor->nb[2]; + size_t nb3 = tensor->nb[3]; if (src0->type != src0->grad->type) { // gradient is typically F32, but src0 could be other type @@ -19200,7 +19184,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } for (size_t i = 0; i < src->visited_hash_set.size; ++i) { - if (src->visited_hash_set.keys[i]) { + // copy all hashset keys (tensors) that are in use + if (ggml_bitset_get(src->visited_hash_set.used, i)) { ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]); } } diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index f0988ba7c..d0c2bb284 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -1006,6 +1006,10 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda assert(nth > 0); assert(ith < nth); + // only enable sgemm for prompt processing + if (n < 2) + return false; + if (Ctype != GGML_TYPE_F32) return false; diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index c9b6c6dec..02aed8120 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -401,9 +401,112 @@ static void GetOverlappingTokenSequences(const std::string& str, std::unordered_ } } +// KCPP SAMPLING FUNCTIONS +void sample_softmax(llama_token_data_array * cur_p) { + GGML_ASSERT(cur_p->size > 0); + + // Sort the logits in descending order + if (!cur_p->sorted) { + std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + cur_p->sorted = true; + } + + float max_l = cur_p->data[0].logit; + float cum_sum = 0.0f; + + for (size_t i = 0; i < cur_p->size; ++i) { + float p = expf(cur_p->data[i].logit - max_l); + cur_p->data[i].p = p; + cum_sum += p; + } + + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= cum_sum; + } +} + +void sample_top_k(llama_token_data_array * cur_p, int32_t k) { + // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast + // if (k >= (int32_t)cur_p->size) { + // return; + // } + + if (k <= 0) { + k = cur_p->size; + } + + k = std::max(k, (int) 1); //min keep of 1 + k = std::min(k, (int) cur_p->size); + + // Sort scores in descending order + if (!cur_p->sorted) { + auto comp = [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }; + if (k <= 128) { + std::partial_sort(cur_p->data, cur_p->data + k, cur_p->data + cur_p->size, comp); + } else { + constexpr int nbuckets = 128; + constexpr float bucket_low = -10.0f; + constexpr float bucket_high = 10.0f; + constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low); + constexpr float bucket_inter = -bucket_low * bucket_scale; + + std::vector bucket_idx(cur_p->size); + std::vector histo(nbuckets, 0); + + for (int i = 0; i < (int)cur_p->size; ++i) { + const float val = cur_p->data[i].logit; + int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low); + ib = std::max(0, std::min(nbuckets-1, ib)); + bucket_idx[i] = ib; + ++histo[ib]; + } + int nhave = 0; + int ib = nbuckets - 1; + for ( ; ib >= 0; --ib) { + nhave += histo[ib]; + if (nhave >= k) { + break; + } + } + std::vector tmp_tokens(nhave); + auto * ptr = tmp_tokens.data(); + std::vector bucket_ptrs; + bucket_ptrs.reserve(nbuckets - ib); + for (int j = nbuckets - 1; j >= ib; --j) { + bucket_ptrs.push_back(ptr); + ptr += histo[j]; + } + for (int i = 0; i < (int)cur_p->size; ++i) { + int j = bucket_idx[i]; + if (j >= ib) { + *bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i]; + } + } + + ptr = tmp_tokens.data(); + int ndone = 0; + for (int j = nbuckets-1; j > ib; --j) { + std::sort(ptr, ptr + histo[j], comp); + ptr += histo[j]; + ndone += histo[j]; + } + std::partial_sort(ptr, ptr + k - ndone, ptr + histo[ib], comp); + + std::memcpy(cur_p->data, tmp_tokens.data(), k*sizeof(llama_token_data)); + + } + cur_p->sorted = true; + } + cur_p->size = k; +} + llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng) { - llama_sampler_softmax_impl(candidates); + sample_softmax(candidates); std::vector probs; probs.reserve(candidates->size); top_picks.clear(); @@ -433,7 +536,7 @@ llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int m, float * mu) { float N = float(n_vocab); - llama_sampler_softmax_impl(candidates); + sample_softmax(candidates); // Estimate s_hat using the most probable m tokens float s_hat = 0.0; float sum_ti_bi = 0.0; @@ -449,7 +552,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat float epsilon_hat = s_hat - 1; float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat); // Sample the next word X using top-k sampling - llama_sampler_top_k_impl(candidates, int(k)); + sample_top_k(candidates, int(k)); llama_token X = sample_token(candidates, rng); // Compute error as the difference between observed surprise and target surprise value size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { return candidate.id == X; @@ -463,7 +566,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float * mu) { - llama_sampler_softmax_impl(candidates); + sample_softmax(candidates); // Truncate the words with surprise values greater than mu candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) { return -log2f(candidate.p) > *mu; @@ -474,7 +577,7 @@ llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::m } // Normalize the probabilities of the remaining words - llama_sampler_softmax_impl(candidates); + sample_softmax(candidates); // Sample the next word X from the remaining words llama_token X = sample_token(candidates,rng); @@ -496,7 +599,7 @@ void sample_top_a(llama_token_data_array * candidates, float a, size_t min_keep) return; } - llama_sampler_softmax_impl(candidates); + sample_softmax(candidates); // Compute the cumulative probabilities float maxprob = candidates->data[0].p; @@ -532,7 +635,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float return; } - llama_sampler_softmax_impl(candidates); + sample_softmax(candidates); //calculate how many tokens cross the xtc threshold size_t last_idx = candidates->size; @@ -825,18 +928,292 @@ void sample_rep_pen(int n_ctx, int rep_pen_range, float rep_pen, float rep_pen_s } +void sample_top_p(llama_token_data_array * cur_p, float p, size_t min_keep) { + if (p >= 1.0f) { + return; + } + + sample_softmax(cur_p); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = cur_p->size; + + for (size_t i = 0; i < cur_p->size; ++i) { + cum_sum += cur_p->data[i].p; + + // Check if the running sum is at least p or if we have kept at least min_keep tokens + // we set the last index to i+1 to indicate that the current iterate should be included in the set + if (cum_sum >= p && i + 1 >= min_keep) { + last_idx = i + 1; + break; + } + } + + // Resize the output vector to keep only the top-p tokens + cur_p->size = last_idx; +} + +void sample_min_p(llama_token_data_array * cur_p, float p, size_t min_keep) { + if (p <= 0.0f || !cur_p->size) { + return; + } + + bool min_p_applied = false; + + // if the cur_p aren't sorted, try the unsorted implementation first + if (!cur_p->sorted) { + std::vector filtered_tokens; + + float max_logit = -FLT_MAX; + for (size_t i = 0; i < cur_p->size; ++i) { + max_logit = std::max(max_logit, cur_p->data[i].logit); + } + const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].logit >= min_logit) { + filtered_tokens.push_back(cur_p->data[i]); + } + } + + // if we have enough values the operation was a success + if (filtered_tokens.size() >= min_keep) { + memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data)); + cur_p->size = filtered_tokens.size(); + min_p_applied = true; + } + } + + // if the cur_p are sorted or the unsorted implementation failed, use this implementation + if (!min_p_applied) { + // Sort the logits in descending order + if (!cur_p->sorted) { + std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + cur_p->sorted = true; + } + + const float min_logit = cur_p->data[0].logit + logf(p); // min logit for p_i >= p * p_max + size_t i = 1; // first token always matches + + for (; i < cur_p->size; ++i) { + if (cur_p->data[i].logit < min_logit && i >= min_keep) { + break; // prob too small + } + } + + // Resize the output vector to keep only the matching tokens + cur_p->size = i; + } +} + +void sample_tail_free(llama_token_data_array * cur_p, float z, size_t min_keep) { + if (z >= 1.0f || cur_p->size <= 2) { + return; + } + + sample_softmax(cur_p); + + // Compute the first and second derivatives + std::vector first_derivatives(cur_p->size - 1); + std::vector second_derivatives(cur_p->size - 2); + + for (size_t i = 0; i < first_derivatives.size(); ++i) { + first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p; + } + for (size_t i = 0; i < second_derivatives.size(); ++i) { + second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1]; + } + + // Calculate absolute value of second derivatives + for (size_t i = 0; i < second_derivatives.size(); ++i) { + second_derivatives[i] = std::abs(second_derivatives[i]); + } + + // Normalize the second derivatives + { + const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); + + if (second_derivatives_sum > 1e-6f) { + for (float & value : second_derivatives) { + value /= second_derivatives_sum; + } + } else { + for (float & value : second_derivatives) { + value = 1.0f / second_derivatives.size(); + } + } + } + + float cum_sum = 0.0f; + size_t last_idx = cur_p->size; + for (size_t i = 0; i < second_derivatives.size(); ++i) { + cum_sum += second_derivatives[i]; + + // Check if the running sum is greater than z or if we have kept at least min_keep tokens + if (cum_sum > z && i >= min_keep) { + last_idx = i; + break; + } + } + + // Resize the output vector to keep only the tokens above the tail location + cur_p->size = last_idx; +} + +void sampler_typical(llama_token_data_array * cur_p, float p, size_t min_keep) { + // Reference implementation: + // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr + if (p >= 1.0f) { + return; + } + + // Compute the softmax of logits and calculate entropy + sample_softmax(cur_p); + + float entropy = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + if(cur_p->data[i].p>0) + { + entropy += -cur_p->data[i].p * logf(cur_p->data[i].p); + } + } + + // Compute the absolute difference between negative log probability and entropy for each candidate + std::vector shifted_scores; + for (size_t i = 0; i < cur_p->size; ++i) { + float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy); + shifted_scores.push_back(shifted_score); + } + + // Sort tokens based on the shifted_scores and their corresponding indices + std::vector indices(cur_p->size); + std::iota(indices.begin(), indices.end(), 0); + + std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { + return shifted_scores[a] < shifted_scores[b]; + }); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = indices.size(); + + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + cum_sum += cur_p->data[idx].p; + + // Check if the running sum is greater than typical or if we have kept at least min_keep tokens + if (cum_sum > p && i >= min_keep - 1) { + last_idx = i + 1; + break; + } + } + + // Resize the output vector to keep only the locally typical tokens + std::vector cur_p_new; + for (size_t i = 0; i < last_idx; ++i) { + size_t idx = indices[i]; + cur_p_new.push_back(cur_p->data[idx]); + } + + // Replace the data in cur_p with the cur_p_new data + std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data); + cur_p->size = cur_p_new.size(); + cur_p->sorted = false; +} + +void sample_entropy(llama_token_data_array * cur_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) { + // no need to do anything if there is only one (or zero) candidates + if (cur_p->size <= 1) { + return; + } + + // Calculate maximum possible entropy + float max_entropy = -logf(1.0f / cur_p->size); + + sample_softmax(cur_p); + + // Calculate entropy of the softmax probabilities + float entropy = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + float prob = cur_p->data[i].p; + if (prob > 0.0f) { // Ensure no log(0) + entropy -= prob * logf(prob); + } + } + + // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above) + float normalized_entropy = entropy / max_entropy; + + // Map the normalized entropy to the desired temperature range using the power function + float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); + + // Apply the dynamically calculated temperature scaling + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= dyn_temp; + } + + // Re-compute softmax probabilities after scaling logits with dynamic temperature + const double max_l_double = cur_p->data[0].logit; + + double cum_sum_double = 0.0; + for (size_t i = 0; i < cur_p->size; ++i) { + double p = exp(cur_p->data[i].logit - max_l_double); + cur_p->data[i].p = p; // Store the scaled probability + cum_sum_double += p; + } + + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities + } + + // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise. + if (smoothing_factor > 0 && cur_p->size > 1) { + sample_softmax(cur_p); + float h = cur_p->data[0].logit; // Find the maximum logit for h to be added after the transformation + // Apply quadratic transformation using the smoothing_factor + for (size_t i = 0; i < cur_p->size; ++i) + { + float logit_shifted = cur_p->data[i].logit - h; + cur_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h; + } + sample_softmax(cur_p); + } + +} + void sample_temperature(llama_token_data_array * candidates_p, float temp, float smoothing_factor) { + bool isgreedy = false; if (temp <= 0) { // Imitate greedy sampling temp = 0.00390625f; //cannot be zero else div0, this is 1/256 - llama_sampler_temp_impl(candidates_p, temp, 0); - llama_sampler_top_k_impl(candidates_p, 1); //only want first candidate + smoothing_factor = 0; + isgreedy = true; } - else + + for (size_t i = 0; i < candidates_p->size; ++i) { + candidates_p->data[i].logit /= temp; + } + // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise. + if (smoothing_factor > 0 && candidates_p->size > 1) { + sample_softmax(candidates_p); + float h = candidates_p->data[0].logit; // Find the maximum logit for h to be added after the transformation + // Apply quadratic transformation using the smoothing_factor + for (size_t i = 0; i < candidates_p->size; ++i) + { + float logit_shifted = candidates_p->data[i].logit - h; + candidates_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h; + } + sample_softmax(candidates_p); + } + + if(isgreedy) { - llama_sampler_temp_impl(candidates_p, temp, smoothing_factor); + sample_top_k(candidates_p, 1); //only want first candidate } } @@ -907,7 +1284,7 @@ const std::vector & sampler_order, llama_grammar * grammar, float dyna sample_dry(n_ctx, dry_penalty_last_n, dry_multiplier, dry_base, dry_allowed_length, dry_sequence_breakers, &candidates_p); //prefilter to top 5k tokens for improved speed - llama_sampler_top_k_impl(&candidates_p, 5000); + sample_top_k(&candidates_p, 5000); if (mirostat == 1 || mirostat == 2) { @@ -931,20 +1308,20 @@ const std::vector & sampler_order, llama_grammar * grammar, float dyna switch (sampler_order[i]) { case KCPP_SAMPLER_TOP_K: - llama_sampler_top_k_impl(&candidates_p, top_k); + sample_top_k(&candidates_p, top_k); break; case KCPP_SAMPLER_TOP_A: sample_top_a(&candidates_p, top_a, 1); break; case KCPP_SAMPLER_TOP_P: - llama_sampler_top_p_impl(&candidates_p, top_p, 1); - llama_sampler_min_p_impl(&candidates_p, min_p, 1); + sample_top_p(&candidates_p, top_p, 1); + sample_min_p(&candidates_p, min_p, 1); break; case KCPP_SAMPLER_TFS: - llama_sampler_tail_free_impl(&candidates_p, tfs, 1); + sample_tail_free(&candidates_p, tfs, 1); break; case KCPP_SAMPLER_TYP: - llama_sampler_typical_impl(&candidates_p, typical_p, 1); + sampler_typical(&candidates_p, typical_p, 1); break; case KCPP_SAMPLER_TEMP: if (dynatemp_range>0) @@ -955,7 +1332,7 @@ const std::vector & sampler_order, llama_grammar * grammar, float dyna dynatemp_min = dynatemp_min<0?0:dynatemp_min; dynatemp_max = dynatemp_max<0?0:dynatemp_max; dynatemp_exponent = dynatemp_exponent<0?0:dynatemp_exponent; - llama_sampler_entropy_impl(&candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor); + sample_entropy(&candidates_p, dynatemp_min, dynatemp_max, dynatemp_exponent, smoothing_factor); } else { diff --git a/src/llama-impl.h b/src/llama-impl.h index fa2e09e1f..87012617f 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -101,6 +101,10 @@ struct ring_buffer { } void push_back(const T & value) { + if (capacity == 0) { + throw std::runtime_error("ring buffer: capacity is zero"); + } + if (sz == capacity) { // advance the start when buffer is full first = (first + 1) % capacity; diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index ad14e392a..64d4cc995 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -13,12 +13,40 @@ #include static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng, std::vector & probs) { +#if 1 probs.resize(cur_p->size); for (size_t i = 0; i < cur_p->size; ++i) { probs[i] = cur_p->data[i].p; } std::discrete_distribution dist(probs.begin(), probs.end()); +#else + // avoid the copy with a custom iterator + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wunused-local-typedefs" + + struct probs_iterator { + typedef std::input_iterator_tag iterator_category; + typedef float value_type; + typedef float * pointer; + typedef float & reference; + typedef size_t difference_type; + + const llama_token_data_array * data; + size_t i; + + bool operator==(const probs_iterator & other) const { return data + i == other.data + other.i; } + bool operator!=(const probs_iterator & other) const { return data + i != other.data + other.i; } + float operator*() const { return data->data[i].p; } + probs_iterator & operator++() { ++i; return *this; } + probs_iterator operator++(int) { probs_iterator tmp = *this; ++i; return tmp; } + }; + #pragma GCC diagnostic pop + + std::discrete_distribution dist(probs_iterator{cur_p, 0}, probs_iterator{cur_p, cur_p->size}); + + GGML_UNUSED(probs); +#endif return dist(rng); } @@ -72,7 +100,6 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) k = cur_p->size; } - k = std::max(k, (int) 1); //min keep of 1 k = std::min(k, (int) cur_p->size); // Sort scores in descending order @@ -139,313 +166,6 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } -static void llama_sampler_top_p_impl(llama_token_data_array * cur_p, float p, size_t min_keep) { - if (p >= 1.0f) { - return; - } - - llama_sampler_softmax_impl(cur_p); - - // Compute the cumulative probabilities - float cum_sum = 0.0f; - size_t last_idx = cur_p->size; - - for (size_t i = 0; i < cur_p->size; ++i) { - cum_sum += cur_p->data[i].p; - - // Check if the running sum is at least p or if we have kept at least min_keep tokens - // we set the last index to i+1 to indicate that the current iterate should be included in the set - if (cum_sum >= p && i + 1 >= min_keep) { - last_idx = i + 1; - break; - } - } - - // Resize the output vector to keep only the top-p tokens - cur_p->size = last_idx; -} - -static void llama_sampler_min_p_impl(llama_token_data_array * cur_p, float p, size_t min_keep) { - if (p <= 0.0f || !cur_p->size) { - return; - } - - bool min_p_applied = false; - - // if the cur_p aren't sorted, try the unsorted implementation first - if (!cur_p->sorted) { - std::vector filtered_tokens; - - float max_logit = -FLT_MAX; - for (size_t i = 0; i < cur_p->size; ++i) { - max_logit = std::max(max_logit, cur_p->data[i].logit); - } - const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max - - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].logit >= min_logit) { - filtered_tokens.push_back(cur_p->data[i]); - } - } - - // if we have enough values the operation was a success - if (filtered_tokens.size() >= min_keep) { - memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data)); - cur_p->size = filtered_tokens.size(); - min_p_applied = true; - } - } - - // if the cur_p are sorted or the unsorted implementation failed, use this implementation - if (!min_p_applied) { - // Sort the logits in descending order - if (!cur_p->sorted) { - std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { - return a.logit > b.logit; - }); - cur_p->sorted = true; - } - - const float min_logit = cur_p->data[0].logit + logf(p); // min logit for p_i >= p * p_max - size_t i = 1; // first token always matches - - for (; i < cur_p->size; ++i) { - if (cur_p->data[i].logit < min_logit && i >= min_keep) { - break; // prob too small - } - } - - // Resize the output vector to keep only the matching tokens - cur_p->size = i; - } -} - -static void llama_sampler_tail_free_impl(llama_token_data_array * cur_p, float z, size_t min_keep) { - if (z >= 1.0f || cur_p->size <= 2) { - return; - } - - llama_sampler_softmax_impl(cur_p); - - // Compute the first and second derivatives - std::vector first_derivatives(cur_p->size - 1); - std::vector second_derivatives(cur_p->size - 2); - - for (size_t i = 0; i < first_derivatives.size(); ++i) { - first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p; - } - for (size_t i = 0; i < second_derivatives.size(); ++i) { - second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1]; - } - - // Calculate absolute value of second derivatives - for (size_t i = 0; i < second_derivatives.size(); ++i) { - second_derivatives[i] = std::abs(second_derivatives[i]); - } - - // Normalize the second derivatives - { - const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); - - if (second_derivatives_sum > 1e-6f) { - for (float & value : second_derivatives) { - value /= second_derivatives_sum; - } - } else { - for (float & value : second_derivatives) { - value = 1.0f / second_derivatives.size(); - } - } - } - - float cum_sum = 0.0f; - size_t last_idx = cur_p->size; - for (size_t i = 0; i < second_derivatives.size(); ++i) { - cum_sum += second_derivatives[i]; - - // Check if the running sum is greater than z or if we have kept at least min_keep tokens - if (cum_sum > z && i >= min_keep) { - last_idx = i; - break; - } - } - - // Resize the output vector to keep only the tokens above the tail location - cur_p->size = last_idx; -} - -static void llama_sampler_typical_impl(llama_token_data_array * cur_p, float p, size_t min_keep) { - // Reference implementation: - // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr - if (p >= 1.0f) { - return; - } - - // Compute the softmax of logits and calculate entropy - llama_sampler_softmax_impl(cur_p); - - float entropy = 0.0f; - for (size_t i = 0; i < cur_p->size; ++i) { - if(cur_p->data[i].p>0) - { - entropy += -cur_p->data[i].p * logf(cur_p->data[i].p); - } - } - - // Compute the absolute difference between negative log probability and entropy for each candidate - std::vector shifted_scores; - for (size_t i = 0; i < cur_p->size; ++i) { - float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy); - shifted_scores.push_back(shifted_score); - } - - // Sort tokens based on the shifted_scores and their corresponding indices - std::vector indices(cur_p->size); - std::iota(indices.begin(), indices.end(), 0); - - std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { - return shifted_scores[a] < shifted_scores[b]; - }); - - // Compute the cumulative probabilities - float cum_sum = 0.0f; - size_t last_idx = indices.size(); - - for (size_t i = 0; i < indices.size(); ++i) { - size_t idx = indices[i]; - cum_sum += cur_p->data[idx].p; - - // Check if the running sum is greater than typical or if we have kept at least min_keep tokens - if (cum_sum > p && i >= min_keep - 1) { - last_idx = i + 1; - break; - } - } - - // Resize the output vector to keep only the locally typical tokens - std::vector cur_p_new; - for (size_t i = 0; i < last_idx; ++i) { - size_t idx = indices[i]; - cur_p_new.push_back(cur_p->data[idx]); - } - - // Replace the data in cur_p with the cur_p_new data - std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data); - cur_p->size = cur_p_new.size(); - cur_p->sorted = false; -} - -static void llama_sampler_entropy_impl(llama_token_data_array * cur_p, float min_temp, float max_temp, float exponent_val, float smoothing_factor) { - // no need to do anything if there is only one (or zero) candidates - if (cur_p->size <= 1) { - return; - } - - // Calculate maximum possible entropy - float max_entropy = -logf(1.0f / cur_p->size); - - llama_sampler_softmax_impl(cur_p); - - // Calculate entropy of the softmax probabilities - float entropy = 0.0f; - for (size_t i = 0; i < cur_p->size; ++i) { - float prob = cur_p->data[i].p; - if (prob > 0.0f) { // Ensure no log(0) - entropy -= prob * logf(prob); - } - } - - // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above) - float normalized_entropy = entropy / max_entropy; - - // Map the normalized entropy to the desired temperature range using the power function - float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); - - // Apply the dynamically calculated temperature scaling - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].logit /= dyn_temp; - } - - // Re-compute softmax probabilities after scaling logits with dynamic temperature - const double max_l_double = cur_p->data[0].logit; - - double cum_sum_double = 0.0; - for (size_t i = 0; i < cur_p->size; ++i) { - double p = exp(cur_p->data[i].logit - max_l_double); - cur_p->data[i].p = p; // Store the scaled probability - cum_sum_double += p; - } - - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities - } - - // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise. - if (smoothing_factor > 0 && cur_p->size > 1) { - llama_sampler_softmax_impl(cur_p); - float h = cur_p->data[0].logit; // Find the maximum logit for h to be added after the transformation - // Apply quadratic transformation using the smoothing_factor - for (size_t i = 0; i < cur_p->size; ++i) - { - float logit_shifted = cur_p->data[i].logit - h; - cur_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h; - } - llama_sampler_softmax_impl(cur_p); - } - -} - -static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp, float smoothing_factor) { - for (size_t i = 0; i < cur_p->size; ++i) { - cur_p->data[i].logit /= temp; - } - // Only apply smoothing if smoothing_factor is > 0. Do not change base implementation otherwise. - if (smoothing_factor > 0 && cur_p->size > 1) { - llama_sampler_softmax_impl(cur_p); - float h = cur_p->data[0].logit; // Find the maximum logit for h to be added after the transformation - // Apply quadratic transformation using the smoothing_factor - for (size_t i = 0; i < cur_p->size; ++i) - { - float logit_shifted = cur_p->data[i].logit - h; - cur_p->data[i].logit = -smoothing_factor * logit_shifted * logit_shifted + h; - } - llama_sampler_softmax_impl(cur_p); - } -} - -static void llama_sampler_grammar_impl(llama_token_data_array * cur_p, const struct llama_grammar & grammar) { - llama_grammar_apply_impl(grammar, cur_p); -} - -void llama_sampler_penalties_impl( - llama_token_data_array * cur_p, - const llama_token_cnt & token_count, - float penalty_repeat, - float penalty_freq, - float penalty_present) { - // Apply frequency and presence penalties to the cur_p - for (size_t i = 0; i < cur_p->size; ++i) { - const auto token_iter = token_count.find(cur_p->data[i].id); - if (token_iter == token_count.end()) { - continue; - } - - const int count = token_iter->second; - - // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. - // This is common fix for this problem, which is to multiply by the penalty instead of dividing. - if (cur_p->data[i].logit <= 0) { - cur_p->data[i].logit *= penalty_repeat; - } else { - cur_p->data[i].logit /= penalty_repeat; - } - - cur_p->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present; - } - - cur_p->sorted = false; -} - // llama_sampler API const char * llama_sampler_name(const struct llama_sampler * smpl) { @@ -613,17 +333,23 @@ int llama_sampler_chain_n(const struct llama_sampler * chain) { // greedy -static struct llama_sampler_i llama_sampler_greedy_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "greedy"; }, - /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) { - cur_p->selected = 0; - for (size_t i = 1; i < cur_p->size; ++i) { - if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) { - cur_p->selected = i; - } +static const char * llama_sampler_greedy_name(const struct llama_sampler * /*smpl*/) { + return "greedy"; +} + +static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) { + cur_p->selected = 0; + for (size_t i = 1; i < cur_p->size; ++i) { + if (cur_p->data[i].logit > cur_p->data[cur_p->selected].logit) { + cur_p->selected = i; } - }, + } +} + +static struct llama_sampler_i llama_sampler_greedy_i = { + /* .name = */ llama_sampler_greedy_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_greedy_apply, /* .reset = */ nullptr, /* .clone = */ nullptr, /* .free = */ nullptr, @@ -646,30 +372,45 @@ struct llama_sampler_dist { std::vector probs; // work array }; +static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) { + return "dist"; +} + +static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_dist *) smpl->ctx; + cur_p->selected = llama_sample_dist(cur_p, ctx->rng, ctx->probs); +} + +static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_dist *) smpl->ctx; + auto * result = llama_sampler_init_dist(ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_dist *) result->ctx; + + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_dist_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_dist *) smpl->ctx; + ctx->rng = std::mt19937(ctx->seed); +} + +static void llama_sampler_dist_free(struct llama_sampler * smpl) { + delete (llama_sampler_dist *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_dist_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "dist"; }, + /* .name = */ llama_sampler_dist_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_dist *) smpl->ctx; - cur_p->selected = llama_sample_dist(cur_p, ctx->rng, ctx->probs); - }, - /* .reset = */ nullptr, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_dist *) smpl->ctx; - auto * result = llama_sampler_init_dist(ctx->seed); - - // copy the state - { - auto * result_ctx = (llama_sampler_dist *) result->ctx; - - result_ctx->rng = ctx->rng; - } - - return result; - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_dist *) smpl->ctx; - }, + /* .apply = */ llama_sampler_dist_apply, + /* .reset = */ llama_sampler_dist_reset, + /* .clone = */ llama_sampler_dist_clone, + /* .free = */ llama_sampler_dist_free, }; struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { @@ -685,12 +426,18 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { // softmax +static const char * llama_sampler_softmax_name(const struct llama_sampler * /*smpl*/) { + return "softmax"; +} + +static void llama_sampler_softmax_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) { + llama_sampler_softmax_impl(cur_p); +} + static struct llama_sampler_i llama_sampler_softmax_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "softmax"; }, + /* .name = */ llama_sampler_softmax_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) { - llama_sampler_softmax_impl(cur_p); - }, + /* .apply = */ llama_sampler_softmax_apply, /* .reset = */ nullptr, /* .clone = */ nullptr, /* .free = */ nullptr, @@ -709,21 +456,31 @@ struct llama_sampler_top_k { const int32_t k; }; +static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl*/) { + return "top-k"; +} + +static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_top_k *) smpl->ctx; + llama_sampler_top_k_impl(cur_p, ctx->k); +} + +static struct llama_sampler * llama_sampler_top_k_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_top_k *) smpl->ctx; + return llama_sampler_init_top_k(ctx->k); +} + +static void llama_sampler_top_k_free(struct llama_sampler * smpl) { + delete (llama_sampler_top_k *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_top_k_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "top-k"; }, + /* .name = */ llama_sampler_top_k_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_top_k *) smpl->ctx; - llama_sampler_top_k_impl(cur_p, ctx->k); - }, + /* .apply = */ llama_sampler_top_k_apply, /* .reset = */ nullptr, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_top_k *) smpl->ctx; - return llama_sampler_init_top_k(ctx->k); - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_top_k *) smpl->ctx; - }, + /* .clone = */ llama_sampler_top_k_clone, + /* .free = */ llama_sampler_top_k_free, }; struct llama_sampler * llama_sampler_init_top_k(int32_t k) { @@ -742,21 +499,54 @@ struct llama_sampler_top_p { const size_t min_keep; }; +static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) { + return "top-p"; +} + +static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_top_p *) smpl->ctx; + + if (ctx->p >= 1.0f) { + return; + } + + llama_sampler_softmax_impl(cur_p); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = cur_p->size; + + for (size_t i = 0; i < cur_p->size; ++i) { + cum_sum += cur_p->data[i].p; + + // Check if the running sum is at least p or if we have kept at least min_keep tokens + // we set the last index to i+1 to indicate that the current iterate should be included in the set + if (cum_sum >= ctx->p && i + 1 >= ctx->min_keep) { + last_idx = i + 1; + break; + } + } + + // Resize the output vector to keep only the top-p tokens + cur_p->size = last_idx; +} + +static struct llama_sampler * llama_sampler_top_p_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_top_p *) smpl->ctx; + return llama_sampler_init_top_p(ctx->p, ctx->min_keep); +} + +static void llama_sampler_top_p_free(struct llama_sampler * smpl) { + delete (llama_sampler_top_p *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_top_p_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "top-p"; }, + /* .name = */ llama_sampler_top_p_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_top_p *) smpl->ctx; - llama_sampler_top_p_impl(cur_p, ctx->p, ctx->min_keep); - }, + /* .apply = */ llama_sampler_top_p_apply, /* .reset = */ nullptr, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_top_p *) smpl->ctx; - return llama_sampler_init_top_p(ctx->p, ctx->min_keep); - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_top_p *) smpl->ctx; - }, + /* .clone = */ llama_sampler_top_p_clone, + /* .free = */ llama_sampler_top_p_free, }; struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) { @@ -776,21 +566,83 @@ struct llama_sampler_min_p { const size_t min_keep; }; +static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl*/) { + return "min-p"; +} + +static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_min_p *) smpl->ctx; + + if (ctx->p <= 0.0f || !cur_p->size) { + return; + } + + bool min_p_applied = false; + + // if the cur_p aren't sorted, try the unsorted implementation first + if (!cur_p->sorted) { + std::vector filtered_tokens; + + float max_logit = -FLT_MAX; + for (size_t i = 0; i < cur_p->size; ++i) { + max_logit = std::max(max_logit, cur_p->data[i].logit); + } + const float min_logit = max_logit + logf(ctx->p); // min logit for p_i >= p * p_max + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].logit >= min_logit) { + filtered_tokens.push_back(cur_p->data[i]); + } + } + + // if we have enough values the operation was a success + if (filtered_tokens.size() >= ctx->min_keep) { + memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data)); + cur_p->size = filtered_tokens.size(); + min_p_applied = true; + } + } + + // if the cur_p are sorted or the unsorted implementation failed, use this implementation + if (!min_p_applied) { + // Sort the logits in descending order + if (!cur_p->sorted) { + std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + cur_p->sorted = true; + } + + const float min_logit = cur_p->data[0].logit + logf(ctx->p); // min logit for p_i >= p * p_max + size_t i = 1; // first token always matches + + for (; i < cur_p->size; ++i) { + if (cur_p->data[i].logit < min_logit && i >= ctx->min_keep) { + break; // prob too small + } + } + + // Resize the output vector to keep only the matching tokens + cur_p->size = i; + } +} + +static struct llama_sampler * llama_sampler_min_p_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_min_p *) smpl->ctx; + return llama_sampler_init_min_p(ctx->p, ctx->min_keep); +} + +static void llama_sampler_min_p_free(struct llama_sampler * smpl) { + delete (llama_sampler_min_p *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_min_p_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "min-p"; }, + /* .name = */ llama_sampler_min_p_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_min_p *) smpl->ctx; - llama_sampler_min_p_impl(cur_p, ctx->p, ctx->min_keep); - }, + /* .apply = */ llama_sampler_min_p_apply, /* .reset = */ nullptr, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_min_p *) smpl->ctx; - return llama_sampler_init_min_p(ctx->p, ctx->min_keep); - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_min_p *) smpl->ctx; - }, + /* .clone = */ llama_sampler_min_p_clone, + /* .free = */ llama_sampler_min_p_free, }; struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) { @@ -810,21 +662,82 @@ struct llama_sampler_tail_free { const size_t min_keep; }; +static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) { + return "tail-free"; +} + +static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_tail_free *) smpl->ctx; + + if (ctx->z >= 1.0f || cur_p->size <= 2) { + return; + } + + llama_sampler_softmax_impl(cur_p); + + // Compute the first and second derivatives + std::vector first_derivatives(cur_p->size - 1); + std::vector second_derivatives(cur_p->size - 2); + + for (size_t i = 0; i < first_derivatives.size(); ++i) { + first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p; + } + for (size_t i = 0; i < second_derivatives.size(); ++i) { + second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1]; + } + + // Calculate absolute value of second derivatives + for (size_t i = 0; i < second_derivatives.size(); ++i) { + second_derivatives[i] = std::abs(second_derivatives[i]); + } + + // Normalize the second derivatives + { + const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f); + + if (second_derivatives_sum > 1e-6f) { + for (float & value : second_derivatives) { + value /= second_derivatives_sum; + } + } else { + for (float & value : second_derivatives) { + value = 1.0f / second_derivatives.size(); + } + } + } + + float cum_sum = 0.0f; + size_t last_idx = cur_p->size; + for (size_t i = 0; i < second_derivatives.size(); ++i) { + cum_sum += second_derivatives[i]; + + // Check if the running sum is greater than z or if we have kept at least min_keep tokens + if (cum_sum > ctx->z && i >= ctx->min_keep) { + last_idx = i; + break; + } + } + + // Resize the output vector to keep only the tokens above the tail location + cur_p->size = last_idx; +} + +static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx; + return llama_sampler_init_tail_free(ctx->z, ctx->min_keep); +} + +static void llama_sampler_tail_free_free(struct llama_sampler * smpl) { + delete (llama_sampler_tail_free *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_tail_free_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "tail-free"; }, + /* .name = */ llama_sampler_tail_free_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_tail_free *) smpl->ctx; - llama_sampler_tail_free_impl(cur_p, ctx->z, ctx->min_keep); - }, + /* .apply = */ llama_sampler_tail_free_apply, /* .reset = */ nullptr, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx; - return llama_sampler_init_tail_free(ctx->z, ctx->min_keep); - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_tail_free *) smpl->ctx; - }, + /* .clone = */ llama_sampler_tail_free_clone, + /* .free = */ llama_sampler_tail_free_free, }; struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) { @@ -844,21 +757,86 @@ struct llama_sampler_typical { const size_t min_keep; }; +static const char * llama_sampler_typical_name(const struct llama_sampler * /*smpl*/) { + return "typical"; +} + +static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_typical *) smpl->ctx; + + // Reference implementation: + // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr + if (ctx->p >= 1.0f) { + return; + } + + // Compute the softmax of logits and calculate entropy + llama_sampler_softmax_impl(cur_p); + + float entropy = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + entropy += -cur_p->data[i].p * logf(cur_p->data[i].p); + } + + // Compute the absolute difference between negative log probability and entropy for each candidate + std::vector shifted_scores; + for (size_t i = 0; i < cur_p->size; ++i) { + float shifted_score = fabsf(-logf(cur_p->data[i].p) - entropy); + shifted_scores.push_back(shifted_score); + } + + // Sort tokens based on the shifted_scores and their corresponding indices + std::vector indices(cur_p->size); + std::iota(indices.begin(), indices.end(), 0); + + std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) { + return shifted_scores[a] < shifted_scores[b]; + }); + + // Compute the cumulative probabilities + float cum_sum = 0.0f; + size_t last_idx = indices.size(); + + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + cum_sum += cur_p->data[idx].p; + + // Check if the running sum is greater than typical or if we have kept at least min_keep tokens + if (cum_sum > ctx->p && i >= ctx->min_keep - 1) { + last_idx = i + 1; + break; + } + } + + // Resize the output vector to keep only the locally typical tokens + std::vector cur_p_new; + for (size_t i = 0; i < last_idx; ++i) { + size_t idx = indices[i]; + cur_p_new.push_back(cur_p->data[idx]); + } + + // Replace the data in cur_p with the cur_p_new data + std::copy(cur_p_new.begin(), cur_p_new.end(), cur_p->data); + cur_p->size = cur_p_new.size(); + cur_p->sorted = false; +} + +static struct llama_sampler * llama_sampler_typical_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_typical *) smpl->ctx; + return llama_sampler_init_typical(ctx->p, ctx->min_keep); +} + +static void llama_sampler_typical_free(struct llama_sampler * smpl) { + delete (llama_sampler_typical *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_typical_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "typical"; }, + /* .name = */ llama_sampler_typical_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_typical *) smpl->ctx; - llama_sampler_typical_impl(cur_p, ctx->p, ctx->min_keep); - }, + /* .apply = */ llama_sampler_typical_apply, /* .reset = */ nullptr, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_typical *) smpl->ctx; - return llama_sampler_init_typical(ctx->p, ctx->min_keep); - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_typical *) smpl->ctx; - }, + /* .clone = */ llama_sampler_typical_clone, + /* .free = */ llama_sampler_typical_free, }; struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) { @@ -877,21 +855,33 @@ struct llama_sampler_temp { const float temp; }; +static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*/) { + return "temp"; +} + +static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_temp *) smpl->ctx; + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= ctx->temp; + } +} + +static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_temp *) smpl->ctx; + return llama_sampler_init_temp(ctx->temp); +} + +static void llama_sampler_temp_free(struct llama_sampler * smpl) { + delete (llama_sampler_temp *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_temp_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "temp"; }, + /* .name = */ llama_sampler_temp_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_temp *) smpl->ctx; - llama_sampler_temp_impl(cur_p, ctx->temp, 0); //UNUSED IN KCPP - }, + /* .apply = */ llama_sampler_temp_apply, /* .reset = */ nullptr, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_temp *) smpl->ctx; - return llama_sampler_init_temp(ctx->temp); - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_temp *) smpl->ctx; - }, + /* .clone = */ llama_sampler_temp_clone, + /* .free = */ llama_sampler_temp_free, }; struct llama_sampler * llama_sampler_init_temp(float temp) { @@ -911,28 +901,100 @@ struct llama_sampler_temp_ext { const float exponent; }; -static struct llama_sampler_i llama_sampler_temp_ext_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "temp-ext"; }, - /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx; - if (ctx->delta > 0) { - const float temp_min = std::max(0.0f, ctx->temp - ctx->delta); - const float temp_max = ctx->temp + ctx->delta; +static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) { + return "temp-ext"; +} - llama_sampler_entropy_impl(cur_p, temp_min, temp_max, ctx->exponent,0);//UNUSED IN KCPP - } else { - llama_sampler_temp_impl(cur_p, ctx->temp,0);//UNUSED IN KCPP +static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_temp_ext *) smpl->ctx; + if (ctx->delta > 0) { + const float min_temp = std::max(0.0f, ctx->temp - ctx->delta); + const float max_temp = ctx->temp + ctx->delta; + float exponent_val = ctx->exponent; + + // no need to do anything if there is only one (or zero) candidates + if (cur_p->size <= 1) { + return; } - }, + + // Calculate maximum possible entropy + float max_entropy = -logf(1.0f / cur_p->size); + + llama_sampler_softmax_impl(cur_p); + + // Calculate entropy of the softmax probabilities + float entropy = 0.0f; + for (size_t i = 0; i < cur_p->size; ++i) { + float prob = cur_p->data[i].p; + if (prob > 0.0f) { // Ensure no log(0) + entropy -= prob * logf(prob); + } + } + + // Normalize the entropy (max_entropy cannot be 0 here because we checked cur_p->size != 1 above) + float normalized_entropy = entropy / max_entropy; + + // Map the normalized entropy to the desired temperature range using the power function + float dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent_val); + + #ifdef DEBUG + LLAMA_LOG_INFO("Your text maxtemp value is: %f\n", max_temp); + LLAMA_LOG_INFO("Entropy: %f\n", entropy); + LLAMA_LOG_INFO("Max Possible Entropy: %f\n", max_entropy); + LLAMA_LOG_INFO("Normalized Entropy: %f\n", normalized_entropy); + LLAMA_LOG_INFO("Exponent: %f\n", exponent_val); + LLAMA_LOG_INFO("Dynamic Temperature (dyn_temp): %f\n", dyn_temp); + #endif + + // Apply the dynamically calculated temperature scaling + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= dyn_temp; + } + + // Re-compute softmax probabilities after scaling logits with dynamic temperature + const double max_l_double = cur_p->data[0].logit; + + double cum_sum_double = 0.0; + for (size_t i = 0; i < cur_p->size; ++i) { + double p = exp(cur_p->data[i].logit - max_l_double); + cur_p->data[i].p = p; // Store the scaled probability + cum_sum_double += p; + } + + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= cum_sum_double; // Re-normalize the probabilities + } + + #ifdef DEBUG + // Print the updated top 25 probabilities after temperature scaling + LLAMA_LOG_INFO("\nUpdated Top 25 Probabilities After Dynamic Temperature Scaling (in percentages):\n"); + for (size_t i = 0; i < 25 && i < cur_p->size; ++i) { + LLAMA_LOG_INFO("Token %zu: %f%%\n", i + 1, cur_p->data[i].p * 100.0f); + } + #endif + } else { + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].logit /= ctx->temp; + } + } +} + +static struct llama_sampler * llama_sampler_temp_ext_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx; + return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent); +} + +static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) { + delete (llama_sampler_temp_ext *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_temp_ext_i = { + /* .name = */ llama_sampler_temp_ext_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_temp_ext_apply, /* .reset = */ nullptr, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_temp_ext *) smpl->ctx; - return llama_sampler_init_temp_ext(ctx->temp, ctx->delta, ctx->exponent); - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_temp_ext *) smpl->ctx; - }, + /* .clone = */ llama_sampler_temp_ext_clone, + /* .free = */ llama_sampler_temp_ext_free, }; struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) { @@ -965,65 +1027,77 @@ struct llama_sampler_mirostat { std::vector probs; }; +static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) { + return "mirostat"; +} + +static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_mirostat *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + + // Estimate s_hat using the most probable m tokens + float s_hat = 0.0; + float sum_ti_bi = 0.0; + float sum_ti_sq = 0.0; + for (size_t i = 0; i < size_t(ctx->m - 1) && i < cur_p->size - 1; ++i) { + float t_i = logf(float(i + 2) / float(i + 1)); + float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p); + sum_ti_bi += t_i * b_i; + sum_ti_sq += t_i * t_i; + } + s_hat = sum_ti_bi / sum_ti_sq; + + // Compute k from the estimated s_hat and target surprise value + float epsilon_hat = s_hat - 1; + float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat); + + llama_sampler_top_k_impl(cur_p, std::max(int(k), 1)); + llama_sampler_softmax_impl(cur_p); + + const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + + cur_p->selected = idx; + + float observed_surprise = -log2f(cur_p->data[idx].p); + float e = observed_surprise - ctx->tau; + + // Update mu using the learning rate and error + ctx->mu = ctx->mu - ctx->eta * e; +} + +static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_mirostat *) smpl->ctx; + auto * result = llama_sampler_init_mirostat(ctx->n_vocab, ctx->seed, ctx->tau, ctx->eta, ctx->m); + + // copy the state + { + auto * result_ctx = (llama_sampler_mirostat *) smpl->ctx; + + result_ctx->mu = ctx->mu; + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_mirostat *) smpl->ctx; + ctx->mu = 2.0f*ctx->tau; + ctx->rng = std::mt19937(ctx->seed); +} + +static void llama_sampler_mirostat_free(struct llama_sampler * smpl) { + delete (llama_sampler_mirostat *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_mirostat_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "mirostat"; }, + /* .name = */ llama_sampler_mirostat_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_mirostat *) smpl->ctx; - - llama_sampler_softmax_impl(cur_p); - - // Estimate s_hat using the most probable m tokens - float s_hat = 0.0; - float sum_ti_bi = 0.0; - float sum_ti_sq = 0.0; - for (size_t i = 0; i < size_t(ctx->m - 1) && i < cur_p->size - 1; ++i) { - float t_i = logf(float(i + 2) / float(i + 1)); - float b_i = logf(cur_p->data[i].p / cur_p->data[i + 1].p); - sum_ti_bi += t_i * b_i; - sum_ti_sq += t_i * t_i; - } - s_hat = sum_ti_bi / sum_ti_sq; - - // Compute k from the estimated s_hat and target surprise value - float epsilon_hat = s_hat - 1; - float k = powf((epsilon_hat * powf(2, ctx->mu)) / (1 - powf(ctx->n_vocab, -epsilon_hat)), 1 / s_hat); - - llama_sampler_top_k_impl(cur_p, std::max(int(k), 1)); - llama_sampler_softmax_impl(cur_p); - - const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); - - cur_p->selected = idx; - - float observed_surprise = -log2f(cur_p->data[idx].p); - float e = observed_surprise - ctx->tau; - - // Update mu using the learning rate and error - ctx->mu = ctx->mu - ctx->eta * e; - }, - /* .reset = */ [](struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_mirostat *) smpl->ctx; - ctx->mu = 2.0f*ctx->tau; - ctx->rng = std::mt19937(ctx->seed); - }, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_mirostat *) smpl->ctx; - auto * result = llama_sampler_init_mirostat(ctx->n_vocab, ctx->seed, ctx->tau, ctx->eta, ctx->m); - - // copy the state - { - auto * result_ctx = (llama_sampler_mirostat *) smpl->ctx; - - result_ctx->mu = ctx->mu; - result_ctx->rng = ctx->rng; - } - - return result; - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_mirostat *) smpl->ctx; - }, + /* .apply = */ llama_sampler_mirostat_apply, + /* .reset = */ llama_sampler_mirostat_reset, + /* .clone = */ llama_sampler_mirostat_clone, + /* .free = */ llama_sampler_mirostat_free, }; struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) { @@ -1057,59 +1131,71 @@ struct llama_sampler_mirostat_v2 { std::vector probs; }; +static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) { + return "mirostat-v2"; +} + +static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + + // Truncate the words with surprise values greater than mu + cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) { + return -log2f(candidate.p) > ctx->mu; + })); + + if (cur_p->size == 0) { + cur_p->size = 1; + } + + // Normalize the probabilities of the remaining words + llama_sampler_softmax_impl(cur_p); + + const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + + cur_p->selected = idx; + + float observed_surprise = -log2f(cur_p->data[idx].p); + float e = observed_surprise - ctx->tau; + + // Update mu using the learning rate and error + ctx->mu = ctx->mu - ctx->eta * e; +} + +static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; + ctx->mu = 2.0f*ctx->tau; + ctx->rng = std::mt19937(ctx->seed); +} + +static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_mirostat_v2 *) smpl->ctx; + + auto * result = llama_sampler_init_mirostat_v2(ctx->seed, ctx->tau, ctx->eta); + + // copy the state + { + auto * result_ctx = (llama_sampler_mirostat_v2 *) result->ctx; + + result_ctx->mu = ctx->mu; + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_mirostat_v2_free(struct llama_sampler * smpl) { + delete (llama_sampler_mirostat_v2 *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_mirostat_v2_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "mirostat-v2"; }, + /* .name = */ llama_sampler_mirostat_v2_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; - - llama_sampler_softmax_impl(cur_p); - - // Truncate the words with surprise values greater than mu - cur_p->size = std::distance(cur_p->data, std::find_if(cur_p->data, cur_p->data + cur_p->size, [&](const llama_token_data & candidate) { - return -log2f(candidate.p) > ctx->mu; - })); - - if (cur_p->size == 0) { - cur_p->size = 1; - } - - // Normalize the probabilities of the remaining words - llama_sampler_softmax_impl(cur_p); - - const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); - - cur_p->selected = idx; - - float observed_surprise = -log2f(cur_p->data[idx].p); - float e = observed_surprise - ctx->tau; - - // Update mu using the learning rate and error - ctx->mu = ctx->mu - ctx->eta * e; - }, - /* .reset = */ [](struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; - ctx->mu = 2.0f*ctx->tau; - ctx->rng = std::mt19937(ctx->seed); - }, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_mirostat_v2 *) smpl->ctx; - - auto * result = llama_sampler_init_mirostat_v2(ctx->seed, ctx->tau, ctx->eta); - - // copy the state - { - auto * result_ctx = (llama_sampler_mirostat_v2 *) result->ctx; - - result_ctx->mu = ctx->mu; - result_ctx->rng = ctx->rng; - } - - return result; - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_mirostat_v2 *) smpl->ctx; - }, + /* .apply = */ llama_sampler_mirostat_v2_apply, + /* .reset = */ llama_sampler_mirostat_v2_reset, + /* .clone = */ llama_sampler_mirostat_v2_clone, + /* .free = */ llama_sampler_mirostat_v2_free, }; struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) { @@ -1137,59 +1223,73 @@ struct llama_sampler_grammar { struct llama_grammar * grammar; }; -static struct llama_sampler_i llama_sampler_grammar_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "grammar"; }, - /* .accept = */ [](struct llama_sampler * smpl, llama_token token) { - const auto * ctx = (llama_sampler_grammar *) smpl->ctx; - if (ctx->grammar) { - llama_grammar_accept_impl(*ctx->grammar, token); - } - }, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_grammar *) smpl->ctx; - if (ctx->grammar) { - llama_sampler_grammar_impl(cur_p, *ctx->grammar); - } - }, - /* .reset = */ [](struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_grammar *) smpl->ctx; - if (!ctx->grammar) { - return; - } +static const char * llama_sampler_grammar_name(const struct llama_sampler * /*smpl*/) { + return "grammar"; +} - auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str()); +static void llama_sampler_grammar_accept_impl(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_grammar *) smpl->ctx; + if (ctx->grammar) { + llama_grammar_accept_impl(*ctx->grammar, token); + } +} +static void llama_sampler_grammar_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_grammar *) smpl->ctx; + if (ctx->grammar) { + llama_grammar_apply_impl(*ctx->grammar, cur_p); + } +} + +static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_grammar *) smpl->ctx; + if (!ctx->grammar) { + return; + } + + auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str()); + + llama_grammar_free_impl(ctx->grammar); + ctx->grammar = grammar_new; +} + +static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_grammar *) smpl->ctx; + + auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr); + + // copy the state + { + auto * result_ctx = (llama_sampler_grammar *) result->ctx; + + if (ctx->grammar) { + result_ctx->grammar_str = ctx->grammar_str; + result_ctx->grammar_root = ctx->grammar_root; + + result_ctx->grammar = llama_grammar_clone_impl(*ctx->grammar); + } + } + + return result; +} + +static void llama_sampler_grammar_free(struct llama_sampler * smpl) { + const auto * ctx = (llama_sampler_grammar *) smpl->ctx; + + if (ctx->grammar) { llama_grammar_free_impl(ctx->grammar); - ctx->grammar = grammar_new; - }, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_grammar *) smpl->ctx; + } - auto * result = llama_sampler_init_grammar_impl(*ctx->vocab, nullptr, nullptr); + delete ctx; +} - // copy the state - { - auto * result_ctx = (llama_sampler_grammar *) result->ctx; - - if (ctx->grammar) { - result_ctx->grammar_str = ctx->grammar_str; - result_ctx->grammar_root = ctx->grammar_root; - - result_ctx->grammar = llama_grammar_clone_impl(*ctx->grammar); - } - } - - return result; - }, - /* .free = */ [](struct llama_sampler * smpl) { - const auto * ctx = (llama_sampler_grammar *) smpl->ctx; - - if (ctx->grammar) { - llama_grammar_free_impl(ctx->grammar); - } - - delete ctx; - }, +static struct llama_sampler_i llama_sampler_grammar_i = { + /* .name = */ llama_sampler_grammar_name, + /* .accept = */ llama_sampler_grammar_accept_impl, + /* .apply = */ llama_sampler_grammar_apply, + /* .reset = */ llama_sampler_grammar_reset, + /* .clone = */ llama_sampler_grammar_clone, + /* .free = */ llama_sampler_grammar_free, }; struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root) { @@ -1235,104 +1335,144 @@ struct llama_sampler_penalties { ring_buffer prev; }; +static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) { + return "penalties"; +} + +static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_token token) { + auto * ctx = (llama_sampler_penalties *) smpl->ctx; + if (ctx->penalty_last_n == 0) { + return; + } + + ctx->prev.push_back(token); +} + +static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_penalties *) smpl->ctx; + + if (ctx->ignore_eos) { + assert(ctx->special_eos_id >= 0); + + // optimistically check if the candidates are not yet sorted/shuffled/truncated + if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) { + cur_p->data[ctx->special_eos_id].logit = -INFINITY; + } else { + // else, search for the special EOS token + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].id == ctx->special_eos_id) { + cur_p->data[i].logit = -INFINITY; + break; + } + } + } + } + + if ((ctx->penalty_last_n == 0) || + (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) { + return; + } + + bool nl_found = false; + size_t nl_idx = 0; + float nl_logit = -INFINITY; + if (!ctx->penalize_nl) { + assert(ctx->linefeed_id >= 0); + + // optimistically check if the candidates are not yet sorted/shuffled/truncated + if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) { + nl_found = true; + nl_idx = ctx->linefeed_id; + nl_logit = cur_p->data[ctx->linefeed_id].logit; + } else { + // else, search for the linefeed token + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].id == ctx->linefeed_id) { + nl_found = true; + nl_idx = i; + nl_logit = cur_p->data[i].logit; + break; + } + } + } + } + + // Create a frequency map to count occurrences of each token in last_tokens + // TODO: optimize this by maintaining the token count in the sampler context + using llama_token_cnt = std::unordered_map; + llama_token_cnt token_count; + + for (int i = 0; i < std::min(ctx->penalty_last_n, ctx->prev.size()); ++i) { + token_count[ctx->prev.rat(i)]++; + } + + // Apply frequency and presence penalties to the cur_p + for (size_t i = 0; i < cur_p->size; ++i) { + const auto token_iter = token_count.find(cur_p->data[i].id); + if (token_iter == token_count.end()) { + continue; + } + + const int count = token_iter->second; + + // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. + // This is common fix for this problem, which is to multiply by the penalty instead of dividing. + if (cur_p->data[i].logit <= 0) { + cur_p->data[i].logit *= ctx->penalty_repeat; + } else { + cur_p->data[i].logit /= ctx->penalty_repeat; + } + + cur_p->data[i].logit -= float(count) * ctx->penalty_freq + float(count > 0) * ctx->penalty_present; + } + + cur_p->sorted = false; + + if (!ctx->penalize_nl && nl_found) { + // restore the logit of the newline token if it was penalized + cur_p->data[nl_idx].logit = nl_logit; + } +} + +static void llama_sampler_penalties_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_penalties *) smpl->ctx; + ctx->prev.clear(); +} + +static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_penalties *) smpl->ctx; + auto * result = llama_sampler_init_penalties( + ctx->n_vocab, + ctx->special_eos_id, + ctx->linefeed_id, + ctx->penalty_last_n, + ctx->penalty_repeat, + ctx->penalty_freq, + ctx->penalty_present, + ctx->penalize_nl, + ctx->ignore_eos); + + // copy the state + { + auto * result_ctx = (llama_sampler_penalties *) result->ctx; + + result_ctx->prev = ctx->prev; + } + + return result; +} + +static void llama_sampler_penalties_free(struct llama_sampler * smpl) { + delete (llama_sampler_penalties *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_penalties_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "penalties"; }, - /* .accept = */ [](struct llama_sampler * smpl, llama_token token) { - auto * ctx = (llama_sampler_penalties *) smpl->ctx; - ctx->prev.push_back(token); - }, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_penalties *) smpl->ctx; - - if (ctx->ignore_eos) { - assert(ctx->special_eos_id >= 0); - - // optimistically check if the candidates are not yet sorted/shuffled/truncated - if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) { - cur_p->data[ctx->special_eos_id].logit = -INFINITY; - } else { - // else, search for the special EOS token - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].id == ctx->special_eos_id) { - cur_p->data[i].logit = -INFINITY; - break; - } - } - } - } - - if ((ctx->penalty_last_n == 0) || - (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) { - return; - } - - bool nl_found = false; - size_t nl_idx = 0; - float nl_logit = -INFINITY; - if (!ctx->penalize_nl) { - assert(ctx->linefeed_id >= 0); - - // optimistically check if the candidates are not yet sorted/shuffled/truncated - if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) { - nl_found = true; - nl_idx = ctx->linefeed_id; - nl_logit = cur_p->data[ctx->linefeed_id].logit; - } else { - // else, search for the linefeed token - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].id == ctx->linefeed_id) { - nl_found = true; - nl_idx = i; - nl_logit = cur_p->data[i].logit; - break; - } - } - } - } - - // Create a frequency map to count occurrences of each token in last_tokens - // TODO: optimize this by maintaining the token count in the sampler context - llama_token_cnt token_count; - for (int i = 0; i < std::min(ctx->penalty_last_n, ctx->prev.size()); ++i) { - token_count[ctx->prev.rat(i)]++; - } - - llama_sampler_penalties_impl(cur_p, token_count, ctx->penalty_repeat, ctx->penalty_freq, ctx->penalty_present); - - if (!ctx->penalize_nl && nl_found) { - // restore the logit of the newline token if it was penalized - cur_p->data[nl_idx].logit = nl_logit; - } - }, - /* .reset = */ [](struct llama_sampler * smpl) { - auto * ctx = (llama_sampler_penalties *) smpl->ctx; - ctx->prev.clear(); - }, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_penalties *) smpl->ctx; - auto * result = llama_sampler_init_penalties( - ctx->n_vocab, - ctx->special_eos_id, - ctx->linefeed_id, - ctx->penalty_last_n, - ctx->penalty_repeat, - ctx->penalty_freq, - ctx->penalty_present, - ctx->penalize_nl, - ctx->ignore_eos); - - // copy the state - { - auto * result_ctx = (llama_sampler_penalties *) result->ctx; - - result_ctx->prev = ctx->prev; - } - - return result; - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_penalties *) smpl->ctx; - }, + /* .name = */ llama_sampler_penalties_name, + /* .accept = */ llama_sampler_penalties_accept, + /* .apply = */ llama_sampler_penalties_apply, + /* .reset = */ llama_sampler_penalties_reset, + /* .clone = */ llama_sampler_penalties_clone, + /* .free = */ llama_sampler_penalties_free, }; struct llama_sampler * llama_sampler_init_penalties( @@ -1346,11 +1486,11 @@ struct llama_sampler * llama_sampler_init_penalties( bool penalize_nl, bool ignore_eos) { if (linefeed_id == LLAMA_TOKEN_NULL) { - penalize_nl = false; + penalize_nl = true; } if (special_eos_id == LLAMA_TOKEN_NULL) { - ignore_eos = true; + ignore_eos = false; } return new llama_sampler { @@ -1380,41 +1520,50 @@ struct llama_sampler_logit_bias { std::vector to_search; }; +static const char * llama_sampler_logit_bias_name(const struct llama_sampler * /*smpl*/) { + return "logit-bias"; +} + +static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; + + ctx->to_search.clear(); + + // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id) + for (const auto & lb : ctx->logit_bias) { + if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) { + cur_p->data[lb.token].logit += lb.bias; + } else { + ctx->to_search.push_back(lb); + } + } + + // search for the remaining candidates that were not found in the previous step + for (size_t i = 0; i < cur_p->size; ++i) { + for (const auto & lb : ctx->to_search) { + if (cur_p->data[i].id == lb.token) { + cur_p->data[i].logit += lb.bias; + break; + } + } + } +} +static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx; + return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data()); +} + +static void llama_sampler_logit_bias_free(struct llama_sampler * smpl) { + delete (llama_sampler_logit_bias *) smpl->ctx; +} + static struct llama_sampler_i llama_sampler_logit_bias_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "logit-bias"; }, + /* .name = */ llama_sampler_logit_bias_name, /* .accept = */ nullptr, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; - - ctx->to_search.clear(); - - // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id) - for (const auto & lb : ctx->logit_bias) { - if (lb.token >= 0 && cur_p->size > (size_t) lb.token && cur_p->data[lb.token].id == lb.token) { - cur_p->data[lb.token].logit += lb.bias; - } else { - ctx->to_search.push_back(lb); - } - } - - // search for the remaining candidates that were not found in the previous step - for (size_t i = 0; i < cur_p->size; ++i) { - for (const auto & lb : ctx->to_search) { - if (cur_p->data[i].id == lb.token) { - cur_p->data[i].logit += lb.bias; - break; - } - } - } - }, + /* .apply = */ llama_sampler_logit_bias_apply, /* .reset = */ nullptr, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx; - return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data()); - }, - /* .free = */ [](struct llama_sampler * smpl) { - delete (llama_sampler_logit_bias *) smpl->ctx; - }, + /* .clone = */ llama_sampler_logit_bias_clone, + /* .free = */ llama_sampler_logit_bias_free, }; struct llama_sampler * llama_sampler_init_logit_bias( @@ -1429,4 +1578,4 @@ struct llama_sampler * llama_sampler_init_logit_bias( /* .to_search = */ {}, }, }; -} +} \ No newline at end of file diff --git a/src/llama-sampling.h b/src/llama-sampling.h index 137c0025c..d90b14713 100644 --- a/src/llama-sampling.h +++ b/src/llama-sampling.h @@ -23,16 +23,6 @@ struct llama_sampler_chain { mutable int32_t n_sample; }; -using llama_token_cnt = std::unordered_map; - -// TODO: tmp exposed until test-sampling is fixed -void llama_sampler_penalties_impl( - llama_token_data_array * cur_p, - const llama_token_cnt & token_count, - float penalty_repeat, - float penalty_freq, - float penalty_present); - struct llama_sampler * llama_sampler_init_grammar_impl( const struct llama_vocab & vocab, const char * grammar_str, diff --git a/src/llama.cpp b/src/llama.cpp index 3c26d1573..0687cf463 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6449,6 +6449,11 @@ static void llm_load_vocab( ) ) { vocab.special_eot_id = t.second; + if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { + LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t.first.c_str()); + vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL; + } break; } } @@ -6462,6 +6467,11 @@ static void llm_load_vocab( const auto & t = vocab.token_to_id.find("<|eom_id|>"); if (t != vocab.token_to_id.end()) { vocab.special_eom_id = t->second; + if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { + LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n", + __func__, t->first.c_str()); + vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL; + } } } } @@ -16143,6 +16153,13 @@ static int llama_decode_internal( return -1; } + for (uint32_t i = 0; i < n_tokens_all; ++i) { + if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); + return -1; + } + } + const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -16435,6 +16452,13 @@ static int llama_encode_internal( return -1; } + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); + return -1; + } + } + const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp new file mode 100644 index 000000000..9ad91acc0 --- /dev/null +++ b/tests/test-arg-parser.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include + +#undef NDEBUG +#include + +#include "common.h" + +int main(void) { + gpt_params params; + + printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); + for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) { + try { + auto options = gpt_params_parser_init(params, (enum llama_example)ex); + std::unordered_set seen_args; + std::unordered_set seen_env_vars; + for (const auto & opt : options) { + // check for args duplications + for (const auto & arg : opt.args) { + if (seen_args.find(arg) == seen_args.end()) { + seen_args.insert(arg); + } else { + fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg); + exit(1); + } + } + // check for env var duplications + if (opt.env) { + if (seen_env_vars.find(opt.env) == seen_env_vars.end()) { + seen_env_vars.insert(opt.env); + } else { + fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env); + exit(1); + } + } + } + } catch (std::exception & e) { + printf("%s\n", e.what()); + assert(false); + } + } + + auto list_str_to_char = [](std::vector & argv) -> std::vector { + std::vector res; + for (auto & arg : argv) { + res.push_back(const_cast(arg.data())); + } + return res; + }; + + std::vector argv; + auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); + + printf("test-arg-parser: test invalid usage\n\n"); + + argv = {"binary_name", "-m"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + argv = {"binary_name", "-ngl", "hello"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + argv = {"binary_name", "-sm", "hello"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + + printf("test-arg-parser: test valid usage\n\n"); + + argv = {"binary_name", "-m", "model_file.gguf"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "model_file.gguf"); + + argv = {"binary_name", "-t", "1234"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.cpuparams.n_threads == 1234); + + argv = {"binary_name", "--verbose"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.verbosity == 1); + + argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "abc.gguf"); + assert(params.n_predict == 6789); + assert(params.n_batch == 9090); + +// skip this part on windows, because setenv is not supported +#ifdef _WIN32 + printf("test-arg-parser: skip on windows build\n"); +#else + printf("test-arg-parser: test environment variables (valid + invalid usages)\n\n"); + + setenv("LLAMA_ARG_THREADS", "blah", true); + argv = {"binary_name"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "blah.gguf"); + assert(params.cpuparams.n_threads == 1010); + + + printf("test-arg-parser: test environment variables being overwritten\n\n"); + + setenv("LLAMA_ARG_MODEL", "blah.gguf", true); + setenv("LLAMA_ARG_THREADS", "1010", true); + argv = {"binary_name", "-m", "overwritten.gguf"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(params.model == "overwritten.gguf"); + assert(params.cpuparams.n_threads == 1010); +#endif // _WIN32 + + + printf("test-arg-parser: all tests OK\n\n"); +}