diff --git a/common/arg.cpp b/common/arg.cpp new file mode 100644 index 000000000..c5134be51 --- /dev/null +++ b/common/arg.cpp @@ -0,0 +1,1994 @@ +#include "arg.h" + +#include "sampling.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json-schema-to-grammar.h" + +using json = nlohmann::ordered_json; + +llama_arg & llama_arg::set_examples(std::initializer_list examples) { + this->examples = std::move(examples); + return *this; +} + +llama_arg & llama_arg::set_env(const char * env) { + help = help + "\n(env: " + env + ")"; + this->env = env; + return *this; +} + +llama_arg & llama_arg::set_sparam() { + is_sparam = true; + return *this; +} + +bool llama_arg::in_example(enum llama_example ex) { + return examples.find(ex) != examples.end(); +} + +bool llama_arg::get_value_from_env(std::string & output) { + if (env == nullptr) return false; + char * value = std::getenv(env); + if (value) { + output = value; + return true; + } + return false; +} + +bool llama_arg::has_value_from_env() { + return env != nullptr && std::getenv(env); +} + +static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string line; + auto add_line = [&](const std::string& l) { + if (l.length() <= max_char_per_line) { + result.push_back(l); + } else { + std::istringstream line_stream(l); + std::string word, current_line; + while (line_stream >> word) { + if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { + if (!current_line.empty()) result.push_back(current_line); + current_line = word; + } else { + current_line += (!current_line.empty() ? " " : "") + word; + } + } + if (!current_line.empty()) result.push_back(current_line); + } + }; + while (std::getline(iss, line)) { + add_line(line); + } + return result; +} + +std::string llama_arg::to_string() { + // params for printing to console + const static int n_leading_spaces = 40; + const static int n_char_per_line_help = 70; // TODO: detect this based on current console + std::string leading_spaces(n_leading_spaces, ' '); + + std::ostringstream ss; + for (const auto arg : args) { + if (arg == args.front()) { + if (args.size() == 1) { + ss << arg; + } else { + // first arg is usually abbreviation, we need padding to make it more beautiful + auto tmp = std::string(arg) + ", "; + auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' '); + ss << tmp << spaces; + } + } else { + ss << arg << (arg != args.back() ? ", " : ""); + } + } + if (value_hint) ss << " " << value_hint; + if (value_hint_2) ss << " " << value_hint_2; + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; + } + return ss.str(); +} + +// +// utils +// + +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +static std::string format(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +static void gpt_params_handle_model_default(gpt_params & params) { + if (!params.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (params.hf_file.empty()) { + if (params.model.empty()) { + throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + } + params.hf_file = params.model; + } else if (params.model.empty()) { + params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); + } + } else if (!params.model_url.empty()) { + if (params.model.empty()) { + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + params.model = fs_get_cache_file(string_split(f, '/').back()); + } + } else if (params.model.empty()) { + params.model = DEFAULT_MODEL_PATH; + } +} + +// +// CLI argument parsing functions +// + +static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) { + std::string arg; + const std::string arg_prefix = "--"; + gpt_params & params = ctx_arg.params; + gpt_sampler_params & sparams = params.sparams; + + std::unordered_map arg_to_options; + for (auto & opt : ctx_arg.options) { + for (const auto & arg : opt.args) { + arg_to_options[arg] = &opt; + } + } + + // handle environment variables + for (auto & opt : ctx_arg.options) { + std::string value; + if (opt.get_value_from_env(value)) { + try { + if (opt.handler_void && (value == "1" || value == "true")) { + opt.handler_void(params); + } + if (opt.handler_int) { + opt.handler_int(params, std::stoi(value)); + } + if (opt.handler_string) { + opt.handler_string(params, value); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); + } + } + } + + // handle command line arguments + auto check_arg = [&](int i) { + if (i+1 >= argc) { + throw std::invalid_argument("expected value for argument"); + } + }; + + for (int i = 1; i < argc; i++) { + const std::string arg_prefix = "--"; + + std::string arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); + } + auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + } + try { + if (opt.handler_void) { + opt.handler_void(params); + continue; + } + + // arg with single value + check_arg(i); + std::string val = argv[++i]; + if (opt.handler_int) { + opt.handler_int(params, std::stoi(val)); + continue; + } + if (opt.handler_string) { + opt.handler_string(params, val); + continue; + } + + // arg with 2 values + check_arg(i); + std::string val2 = argv[++i]; + if (opt.handler_str_str) { + opt.handler_str_str(params, val, val2); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); + } + } + + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); + } + + gpt_params_handle_model_default(params); + + if (params.escape) { + string_process_escapes(params.prompt); + string_process_escapes(params.input_prefix); + string_process_escapes(params.input_suffix); + for (auto & antiprompt : params.antiprompt) { + string_process_escapes(antiprompt); + } + } + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + if (sparams.seed == LLAMA_DEFAULT_SEED) { + sparams.seed = time(NULL); + } + + return true; +} + +static void gpt_params_print_usage(gpt_params_context & ctx_arg) { + auto print_options = [](std::vector & options) { + for (llama_arg * opt : options) { + printf("%s", opt->to_string().c_str()); + } + }; + + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; + for (auto & opt : ctx_arg.options) { + // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + if (opt.is_sparam) { + sparam_options.push_back(&opt); + } else if (opt.in_example(ctx_arg.ex)) { + specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); + } + } + printf("----- common params -----\n\n"); + print_options(common_options); + printf("\n\n----- sampling params -----\n\n"); + print_options(sparam_options); + // TODO: maybe convert enum llama_example to string + printf("\n\n----- example-specific params -----\n\n"); + print_options(specific_options); +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { + auto ctx_arg = gpt_params_parser_init(params, ex, print_usage); + const gpt_params params_org = ctx_arg.params; // the example can modify the default params + + try { + if (!gpt_params_parse_ex(argc, argv, ctx_arg)) { + ctx_arg.params = params_org; + return false; + } + if (ctx_arg.params.usage) { + gpt_params_print_usage(ctx_arg); + if (ctx_arg.print_usage) { + ctx_arg.print_usage(argc, argv); + } + exit(0); + } + } catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + ctx_arg.params = params_org; + return false; + } + + return true; +} + +gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { + gpt_params_context ctx_arg(params); + ctx_arg.print_usage = print_usage; + ctx_arg.ex = ex; + + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto & sampler : params.sparams.samplers) { + sampler_type_chars += gpt_sampler_type_to_chr(sampler); + sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; + } + sampler_type_names.pop_back(); + + + /** + * filter options by example + * rules: + * - all examples inherit options from LLAMA_EXAMPLE_COMMON + * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + */ + auto add_opt = [&](llama_arg arg) { + if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + ctx_arg.options.push_back(std::move(arg)); + } + }; + + + add_opt(llama_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [](gpt_params & params) { + params.usage = true; + } + )); + add_opt(llama_arg( + {"--version"}, + "show version and build info", + [](gpt_params &) { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + )); + add_opt(llama_arg( + {"-v", "--verbose"}, + "print verbose information", + [](gpt_params & params) { + params.verbosity = 1; + } + )); + add_opt(llama_arg( + {"--verbosity"}, "N", + format("set specific verbosity level (default: %d)", params.verbosity), + [](gpt_params & params, int value) { + params.verbosity = value; + } + )); + add_opt(llama_arg( + {"--verbose-prompt"}, + format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [](gpt_params & params) { + params.verbose_prompt = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--no-display-prompt"}, + format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + [](gpt_params & params) { + params.display_prompt = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-co", "--color"}, + format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [](gpt_params & params) { + params.use_color = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-t", "--threads"}, "N", + format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [](gpt_params & params, int value) { + params.cpuparams.n_threads = value; + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_env("LLAMA_ARG_THREADS")); + add_opt(llama_arg( + {"-tb", "--threads-batch"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads)", + [](gpt_params & params, int value) { + params.cpuparams_batch.n_threads = value; + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + )); + add_opt(llama_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [](gpt_params & params, int value) { + params.draft_cpuparams.n_threads = value; + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.n_threads = value; + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-C", "--cpu-mask"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", + [](gpt_params & params, const std::string & mask) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Cr", "--cpu-range"}, "lo-hi", + "range of CPUs for affinity. Complements --cpu-mask", + [](gpt_params & params, const std::string & range) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict"}, "<0|1>", + format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [](gpt_params & params, const std::string & value) { + params.cpuparams.strict_cpu = std::stoul(value); + } + )); + add_opt(llama_arg( + {"--prio"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll"}, "<0...100>", + format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [](gpt_params & params, const std::string & value) { + params.cpuparams.poll = std::stoul(value); + } + )); + add_opt(llama_arg( + {"-Cb", "--cpu-mask-batch"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Crb", "--cpu-range-batch"}, "lo-hi", + "ranges of CPUs for affinity. Complements --cpu-mask-batch", + [](gpt_params & params, const std::string & range) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict-batch"}, "<0|1>", + "use strict CPU placement (default: same as --cpu-strict)", + [](gpt_params & params, int value) { + params.cpuparams_batch.strict_cpu = value; + } + )); + add_opt(llama_arg( + {"--prio-batch"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll-batch"}, "<0|1>", + "use polling to wait for work (default: same as --poll)", + [](gpt_params & params, int value) { + params.cpuparams_batch.poll = value; + } + )); + add_opt(llama_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft", + [](gpt_params & params, const std::string & range) { + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: same as --cpu-strict)", + [](gpt_params & params, int value) { + params.draft_cpuparams.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--prio-draft"}, "N", + format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: same as --poll])", + [](gpt_params & params, int value) { + params.draft_cpuparams.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Cbd", "--cpu-mask-batch-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", + [](gpt_params & params, const std::string & range) { + params.draft_cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--cpu-strict-batch-draft"}, "<0|1>", + "Use strict CPU placement for draft model (default: --cpu-strict-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.strict_cpu = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--prio-batch-draft"}, "N", + format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--poll-batch-draft"}, "<0|1>", + "Use polling to wait for draft model work (default: --poll-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.poll = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--draft"}, "N", + format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + [](gpt_params & params, int value) { + params.n_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-ps", "--p-split"}, "N", + format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + [](gpt_params & params, const std::string & value) { + params.p_split = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-lcs", "--lookup-cache-static"}, "FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)", + [](gpt_params & params, const std::string & value) { + params.lookup_cache_static = value; + } + ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-lcd", "--lookup-cache-dynamic"}, "FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)", + [](gpt_params & params, const std::string & value) { + params.lookup_cache_dynamic = value; + } + ).set_examples({LLAMA_EXAMPLE_LOOKUP})); + add_opt(llama_arg( + {"-c", "--ctx-size"}, "N", + format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + [](gpt_params & params, int value) { + params.n_ctx = value; + } + ).set_env("LLAMA_ARG_CTX_SIZE")); + add_opt(llama_arg( + {"-n", "--predict", "--n-predict"}, "N", + format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + [](gpt_params & params, int value) { + params.n_predict = value; + } + ).set_env("LLAMA_ARG_N_PREDICT")); + add_opt(llama_arg( + {"-b", "--batch-size"}, "N", + format("logical maximum batch size (default: %d)", params.n_batch), + [](gpt_params & params, int value) { + params.n_batch = value; + } + ).set_env("LLAMA_ARG_BATCH")); + add_opt(llama_arg( + {"-ub", "--ubatch-size"}, "N", + format("physical maximum batch size (default: %d)", params.n_ubatch), + [](gpt_params & params, int value) { + params.n_ubatch = value; + } + ).set_env("LLAMA_ARG_UBATCH")); + add_opt(llama_arg( + {"--keep"}, "N", + format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), + [](gpt_params & params, int value) { + params.n_keep = value; + } + )); + add_opt(llama_arg( + {"--chunks"}, "N", + format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [](gpt_params & params, int value) { + params.n_chunks = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"-fa", "--flash-attn"}, + format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + [](gpt_params & params) { + params.flash_attn = true; + } + ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(llama_arg( + {"-p", "--prompt"}, "PROMPT", + ex == LLAMA_EXAMPLE_MAIN + ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" + : "prompt to start generation with", + [](gpt_params & params, const std::string & value) { + params.prompt = value; + } + )); + add_opt(llama_arg( + {"-f", "--file"}, "FNAME", + "a file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } + )); + add_opt(llama_arg( + {"--in-file"}, "FNAME", + "an input file (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.in_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-bf", "--binary-file"}, "FNAME", + "binary file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + } + )); + add_opt(llama_arg( + {"-e", "--escape"}, + format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + [](gpt_params & params) { + params.escape = true; + } + )); + add_opt(llama_arg( + {"--no-escape"}, + "do not process escape sequences", + [](gpt_params & params) { + params.escape = false; + } + )); + add_opt(llama_arg( + {"-ptc", "--print-token-count"}, "N", + format("print token count every N tokens (default: %d)", params.n_print), + [](gpt_params & params, int value) { + params.n_print = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache"}, "FNAME", + "file to cache prompt state for faster startup (default: none)", + [](gpt_params & params, const std::string & value) { + params.path_prompt_cache = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-all"}, + "if specified, saves user input and generations to cache as well\n", + [](gpt_params & params) { + params.prompt_cache_all = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-ro"}, + "if specified, uses the prompt cache but does not update it", + [](gpt_params & params) { + params.prompt_cache_ro = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-r", "--reverse-prompt"}, "PROMPT", + "halt generation at PROMPT, return control in interactive mode\n", + [](gpt_params & params, const std::string & value) { + params.antiprompt.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-sp", "--special"}, + format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + [](gpt_params & params) { + params.special = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-cnv", "--conversation"}, + format( + "run in conversation mode:\n" + "- does not print special tokens and suffix/prefix\n" + "- interactive mode is also enabled\n" + "(default: %s)", + params.conversation ? "true" : "false" + ), + [](gpt_params & params) { + params.conversation = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-i", "--interactive"}, + format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [](gpt_params & params) { + params.interactive = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-if", "--interactive-first"}, + format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), + [](gpt_params & params) { + params.interactive_first = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-mli", "--multiline-input"}, + "allows you to write or paste multiple lines without ending each in '\\'", + [](gpt_params & params) { + params.multiline_input = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix-bos"}, + "prefix BOS to user inputs, preceding the `--in-prefix` string", + [](gpt_params & params) { + params.input_prefix_bos = true; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix"}, "STRING", + "string to prefix user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_prefix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-suffix"}, "STRING", + "string to suffix after user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_suffix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--no-warmup"}, + "skip warming up the model with an empty run", + [](gpt_params & params) { + params.warmup = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--spm-infill"}, + format( + "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", + params.spm_infill ? "enabled" : "disabled" + ), + [](gpt_params & params) { + params.spm_infill = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--samplers"}, "SAMPLERS", + format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [](gpt_params & params, const std::string & value) { + const auto sampler_names = string_split(value, ';'); + params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); + } + ).set_sparam()); + add_opt(llama_arg( + {"-s", "--seed"}, "SEED", + format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed), + [](gpt_params & params, const std::string & value) { + params.sparams.seed = std::stoul(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--sampling-seq"}, "SEQUENCE", + format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.samplers = gpt_sampler_types_from_chars(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--ignore-eos"}, + "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", + [](gpt_params & params) { + params.sparams.ignore_eos = true; + } + ).set_sparam()); + add_opt(llama_arg( + {"--penalize-nl"}, + format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + [](gpt_params & params) { + params.sparams.penalize_nl = true; + } + ).set_sparam()); + add_opt(llama_arg( + {"--temp"}, "N", + format("temperature (default: %.1f)", (double)params.sparams.temp), + [](gpt_params & params, const std::string & value) { + params.sparams.temp = std::stof(value); + params.sparams.temp = std::max(params.sparams.temp, 0.0f); + } + ).set_sparam()); + add_opt(llama_arg( + {"--top-k"}, "N", + format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + [](gpt_params & params, int value) { + params.sparams.top_k = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--top-p"}, "N", + format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + [](gpt_params & params, const std::string & value) { + params.sparams.top_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--min-p"}, "N", + format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + [](gpt_params & params, const std::string & value) { + params.sparams.min_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--tfs"}, "N", + format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + [](gpt_params & params, const std::string & value) { + params.sparams.tfs_z = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--typical"}, "N", + format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + [](gpt_params & params, const std::string & value) { + params.sparams.typ_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--repeat-last-n"}, "N", + format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + [](gpt_params & params, int value) { + params.sparams.penalty_last_n = value; + params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); + } + ).set_sparam()); + add_opt(llama_arg( + {"--repeat-penalty"}, "N", + format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_repeat = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--presence-penalty"}, "N", + format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_present = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--frequency-penalty"}, "N", + format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_freq = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--dynatemp-range"}, "N", + format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_range = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--dynatemp-exp"}, "N", + format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_exponent = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat"}, "N", + format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), + [](gpt_params & params, int value) { + params.sparams.mirostat = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat-lr"}, "N", + format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_eta = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat-ent"}, "N", + format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + [](gpt_params & params, const std::string & value) { + params.sparams.mirostat_tau = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", + "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", + [](gpt_params & params, const std::string & value) { + std::stringstream ss(value); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + params.sparams.logit_bias.push_back({key, bias}); + } else { + throw std::invalid_argument("invalid input format"); + } + } catch (const std::exception&) { + throw std::invalid_argument("invalid input format"); + } + } + ).set_sparam()); + add_opt(llama_arg( + {"--grammar"}, "GRAMMAR", + format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--grammar-file"}, "FNAME", + "file to read grammar from", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(params.sparams.grammar) + ); + } + ).set_sparam()); + add_opt(llama_arg( + {"-j", "--json-schema"}, "SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", + [](gpt_params & params, const std::string & value) { + params.sparams.grammar = json_schema_to_grammar(json::parse(value)); + } + ).set_sparam()); + add_opt(llama_arg( + {"--pooling"}, "{none,mean,cls,last}", + "pooling type for embeddings, use model default if unspecified", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--attention"}, "{causal,non,causal}", + "attention type for embeddings, use model default if unspecified", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--rope-scaling"}, "{none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"--rope-scale"}, "N", + "RoPE context scaling factor, expands context by a factor of N", + [](gpt_params & params, const std::string & value) { + params.rope_freq_scale = 1.0f / std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-base"}, "N", + "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", + [](gpt_params & params, const std::string & value) { + params.rope_freq_base = std::stof(value); + } + )); + add_opt(llama_arg( + {"--rope-freq-scale"}, "N", + "RoPE frequency scaling factor, expands context by a factor of 1/N", + [](gpt_params & params, const std::string & value) { + params.rope_freq_scale = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-orig-ctx"}, "N", + format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + [](gpt_params & params, int value) { + params.yarn_orig_ctx = value; + } + )); + add_opt(llama_arg( + {"--yarn-ext-factor"}, "N", + format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + [](gpt_params & params, const std::string & value) { + params.yarn_ext_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-attn-factor"}, "N", + format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + [](gpt_params & params, const std::string & value) { + params.yarn_attn_factor = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-slow"}, "N", + format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + [](gpt_params & params, const std::string & value) { + params.yarn_beta_slow = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-fast"}, "N", + format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [](gpt_params & params, const std::string & value) { + params.yarn_beta_fast = std::stof(value); + } + )); + add_opt(llama_arg( + {"-gan", "--grp-attn-n"}, "N", + format("group-attention factor (default: %d)", params.grp_attn_n), + [](gpt_params & params, int value) { + params.grp_attn_n = value; + } + )); + add_opt(llama_arg( + {"-gaw", "--grp-attn-w"}, "N", + format("group-attention width (default: %.1f)", (double)params.grp_attn_w), + [](gpt_params & params, int value) { + params.grp_attn_w = value; + } + )); + add_opt(llama_arg( + {"-dkvc", "--dump-kv-cache"}, + "verbose print of the KV cache", + [](gpt_params & params) { + params.dump_kv_cache = true; + } + )); + add_opt(llama_arg( + {"-nkvo", "--no-kv-offload"}, + "disable KV offload", + [](gpt_params & params) { + params.no_kv_offload = true; + } + )); + add_opt(llama_arg( + {"-ctk", "--cache-type-k"}, "TYPE", + format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_k = value; + } + )); + add_opt(llama_arg( + {"-ctv", "--cache-type-v"}, "TYPE", + format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_v = value; + } + )); + add_opt(llama_arg( + {"--perplexity", "--all-logits"}, + format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + [](gpt_params & params) { + params.logits_all = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag"}, + "compute HellaSwag score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.hellaswag = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag-tasks"}, "N", + format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [](gpt_params & params, int value) { + params.hellaswag_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande"}, + "compute Winogrande score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.winogrande = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande-tasks"}, "N", + format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [](gpt_params & params, int value) { + params.winogrande_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice"}, + "compute multiple choice score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.multiple_choice = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice-tasks"}, "N", + format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + [](gpt_params & params, int value) { + params.multiple_choice_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--kl-divergence"}, + "computes KL-divergence to logits provided via --kl-divergence-base", + [](gpt_params & params) { + params.kl_divergence = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--save-all-logits", "--kl-divergence-base"}, "FNAME", + "set logits file", + [](gpt_params & params, const std::string & value) { + params.logits_file = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-stride"}, "N", + format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [](gpt_params & params, int value) { + params.ppl_stride = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-output-type"}, "<0|1>", + format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [](gpt_params & params, int value) { + params.ppl_output_type = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"-dt", "--defrag-thold"}, "N", + format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [](gpt_params & params, const std::string & value) { + params.defrag_thold = std::stof(value); + } + ).set_env("LLAMA_ARG_DEFRAG_THOLD")); + add_opt(llama_arg( + {"-np", "--parallel"}, "N", + format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](gpt_params & params, int value) { + params.n_parallel = value; + } + )); + add_opt(llama_arg( + {"-ns", "--sequences"}, "N", + format("number of sequences to decode (default: %d)", params.n_sequences), + [](gpt_params & params, int value) { + params.n_sequences = value; + } + ).set_examples({LLAMA_EXAMPLE_PARALLEL})); + add_opt(llama_arg( + {"-cb", "--cont-batching"}, + format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](gpt_params & params) { + params.cont_batching = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); + add_opt(llama_arg( + {"-nocb", "--no-cont-batching"}, + "disable continuous batching", + [](gpt_params & params) { + params.cont_batching = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); + add_opt(llama_arg( + {"--mmproj"}, "FILE", + "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + [](gpt_params & params, const std::string & value) { + params.mmproj = value; + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); + add_opt(llama_arg( + {"--image"}, "FILE", + "path to an image file. use with multimodal models. Specify multiple times for batching", + [](gpt_params & params, const std::string & value) { + params.image.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); +#ifdef GGML_USE_RPC + add_opt(llama_arg( + {"--rpc"}, "SERVERS", + "comma separated list of RPC servers", + [](gpt_params & params, const std::string & value) { + params.rpc_servers = value; + } + )); +#endif + add_opt(llama_arg( + {"--mlock"}, + "force system to keep model in RAM rather than swapping or compressing", + [](gpt_params & params) { + params.use_mlock = true; + } + )); + add_opt(llama_arg( + {"--no-mmap"}, + "do not memory-map model (slower load but may reduce pageouts if not using mlock)", + [](gpt_params & params) { + params.use_mmap = false; + } + )); + add_opt(llama_arg( + {"--numa"}, "TYPE", + "attempt optimizations that help on some NUMA systems\n" + "- distribute: spread execution evenly over all nodes\n" + "- isolate: only spawn threads on CPUs on the node that execution started on\n" + "- numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { throw std::invalid_argument("invalid value"); } + } + )); + add_opt(llama_arg( + {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", + "number of layers to store in VRAM", + [](gpt_params & params, int value) { + params.n_gpu_layers = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_env("LLAMA_ARG_N_GPU_LAYERS")); + add_opt(llama_arg( + {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", + "number of layers to store in VRAM for the draft model", + [](gpt_params & params, int value) { + params.n_gpu_layers_draft = value; + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-sm", "--split-mode"}, "{none,layer,row}", + "how to split the model across multiple GPUs, one of:\n" + "- none: use one GPU only\n" + "- layer (default): split layers and KV across GPUs\n" + "- row: split rows across GPUs", + [](gpt_params & params, const std::string & value) { + std::string arg_next = value; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + throw std::invalid_argument("invalid value"); + } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + } + )); + add_opt(llama_arg( + {"-ts", "--tensor-split"}, "N0,N1,N2,...", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", + [](gpt_params & params, const std::string & value) { + std::string arg_next = value; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + throw std::invalid_argument( + format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + ); + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } else { + params.tensor_split[i] = 0.0f; + } + } +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + } + )); + add_opt(llama_arg( + {"-mg", "--main-gpu"}, "INDEX", + format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + [](gpt_params & params, int value) { + params.main_gpu = value; +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN + } + )); + add_opt(llama_arg( + {"--check-tensors"}, + format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + [](gpt_params & params) { + params.check_tensors = true; + } + )); + add_opt(llama_arg( + {"--override-kv"}, "KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + [](gpt_params & params, const std::string & value) { + if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + } + } + )); + add_opt(llama_arg( + {"--lora"}, "FNAME", + "path to LoRA adapter (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & value) { + params.lora_adapters.push_back({ std::string(value), 1.0 }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--lora-scaled"}, "FNAME", "SCALE", + "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.lora_adapters.push_back({ fname, std::stof(scale) }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--control-vector"}, "FNAME", + "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + [](gpt_params & params, const std::string & value) { + params.control_vectors.push_back({ 1.0f, value, }); + } + )); + add_opt(llama_arg( + {"--control-vector-scaled"}, "FNAME", "SCALE", + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.control_vectors.push_back({ std::stof(scale), fname }); + } + )); + add_opt(llama_arg( + {"--control-vector-layer-range"}, "START", "END", + "layer range to apply the control vector(s) to, start and end inclusive", + [](gpt_params & params, const std::string & start, const std::string & end) { + params.control_vector_layer_start = std::stoi(start); + params.control_vector_layer_end = std::stoi(end); + } + )); + add_opt(llama_arg( + {"-a", "--alias"}, "STRING", + "set alias for model name (to be used by REST API)", + [](gpt_params & params, const std::string & value) { + params.model_alias = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-m", "--model"}, "FNAME", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? std::string("model path from which to load base model") + : format( + "model path (default: `models/$filename` with filename from `--hf-file` " + "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH + ), + [](gpt_params & params, const std::string & value) { + params.model = value; + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); + add_opt(llama_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-mu", "--model-url"}, "MODEL_URL", + "model download url (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_url = value; + } + ).set_env("LLAMA_ARG_MODEL_URL")); + add_opt(llama_arg( + {"-hfr", "--hf-repo"}, "REPO", + "Hugging Face model repository (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO")); + add_opt(llama_arg( + {"-hff", "--hf-file"}, "FILE", + "Hugging Face model file (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(llama_arg( + {"-hft", "--hf-token"}, "TOKEN", + "Hugging Face access token (default: value from HF_TOKEN environment variable)", + [](gpt_params & params, const std::string & value) { + params.hf_token = value; + } + ).set_env("HF_TOKEN")); + add_opt(llama_arg( + {"--context-file"}, "FNAME", + "file to load context from (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.context_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-size"}, "N", + format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + [](gpt_params & params, int value) { + params.chunk_size = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-separator"}, "STRING", + format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [](gpt_params & params, const std::string & value) { + params.chunk_separator = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--junk"}, "N", + format("number of times to repeat the junk text (default: %d)", params.n_junk), + [](gpt_params & params, int value) { + params.n_junk = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"--pos"}, "N", + format("position of the passkey in the junk text (default: %d)", params.i_pos), + [](gpt_params & params, int value) { + params.i_pos = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"-o", "--output", "--output-file"}, "FNAME", + format("output file (default: '%s')", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? params.lora_outfile.c_str() + : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR + ? params.cvector_outfile.c_str() + : params.out_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.out_file = value; + params.cvector_outfile = value; + params.lora_outfile = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"-ofreq", "--output-frequency"}, "N", + format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + [](gpt_params & params, int value) { + params.n_out_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--save-frequency"}, "N", + format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + [](gpt_params & params, int value) { + params.n_save_freq = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--process-output"}, + format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + [](gpt_params & params) { + params.process_output = true; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--no-ppl"}, + format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](gpt_params & params) { + params.compute_ppl = false; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"--chunk", "--from-chunk"}, "N", + format("start processing the input from chunk N (default: %d)", params.i_chunk), + [](gpt_params & params, int value) { + params.i_chunk = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-pps"}, + format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), + [](gpt_params & params) { + params.is_pp_shared = true; + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npp"}, "n0,n1,...", + "number of prompt tokens", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-ntg"}, "n0,n1,...", + "number of text generation tokens", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"-npl"}, "n0,n1,...", + "number of parallel prompts", + [](gpt_params & params, const std::string & value) { + auto p = string_split(value, ','); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); + add_opt(llama_arg( + {"--embd-normalize"}, "N", + format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + [](gpt_params & params, int value) { + params.embd_normalize = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-output-format"}, "FORMAT", + "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", + [](gpt_params & params, const std::string & value) { + params.embd_out = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--embd-separator"}, "STRING", + "separator of embendings (default \\n) for example \"<#sep#>\"", + [](gpt_params & params, const std::string & value) { + params.embd_sep = value; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); + add_opt(llama_arg( + {"--host"}, "HOST", + format("ip address to listen (default: %s)", params.hostname.c_str()), + [](gpt_params & params, const std::string & value) { + params.hostname = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); + add_opt(llama_arg( + {"--port"}, "PORT", + format("port to listen (default: %d)", params.port), + [](gpt_params & params, int value) { + params.port = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); + add_opt(llama_arg( + {"--path"}, "PATH", + format("path to serve static files from (default: %s)", params.public_path.c_str()), + [](gpt_params & params, const std::string & value) { + params.public_path = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--embedding", "--embeddings"}, + format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + [](gpt_params & params) { + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); + add_opt(llama_arg( + {"--api-key"}, "KEY", + "API key to use for authentication (default: none)", + [](gpt_params & params, const std::string & value) { + params.api_keys.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); + add_opt(llama_arg( + {"--api-key-file"}, "FNAME", + "path to file containing API keys (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream key_file(value); + if (!key_file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-key-file"}, "FNAME", + "path to file a PEM-encoded SSL private key", + [](gpt_params & params, const std::string & value) { + params.ssl_file_key = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--ssl-cert-file"}, "FNAME", + "path to file a PEM-encoded SSL certificate", + [](gpt_params & params, const std::string & value) { + params.ssl_file_cert = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-to", "--timeout"}, "N", + format("server read/write timeout in seconds (default: %d)", params.timeout_read), + [](gpt_params & params, int value) { + params.timeout_read = value; + params.timeout_write = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--threads-http"}, "N", + format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + [](gpt_params & params, int value) { + params.n_threads_http = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); + add_opt(llama_arg( + {"-spf", "--system-prompt-file"}, "FNAME", + "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--log-format"}, "{text, json}", + "log output format: json or text (default: json)", + [](gpt_params & params, const std::string & value) { + if (value == "json") { + params.log_json = true; + } else if (value == "text") { + params.log_json = false; + } else { + throw std::invalid_argument("invalid value"); + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--metrics"}, + format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_metrics = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + add_opt(llama_arg( + {"--no-slots"}, + format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_slots = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); + add_opt(llama_arg( + {"--slot-save-path"}, "PATH", + "path to save slot kv cache (default: disabled)", + [](gpt_params & params, const std::string & value) { + params.slot_save_path = value; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--chat-template"}, "JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + [](gpt_params & params, const std::string & value) { + if (!llama_chat_verify_template(value)) { + throw std::runtime_error(format( + "error: the supplied chat template is not supported: %s\n" + "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", + value.c_str() + )); + } + params.chat_template = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + add_opt(llama_arg( + {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", + format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [](gpt_params & params, const std::string & value) { + params.slot_prompt_similarity = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--lora-init-without-apply"}, + format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), + [](gpt_params & params) { + params.lora_init_without_apply = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--simple-io"}, + "use basic IO for better compatibility in subprocesses and limited consoles", + [](gpt_params & params) { + params.simple_io = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-ld", "--logdir"}, "LOGDIR", + "path under which to save YAML logs (no logging if unset)", + [](gpt_params & params, const std::string & value) { + params.logdir = value; + + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + } + )); + add_opt(llama_arg( + {"--positive-file"}, "FNAME", + format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_positive_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--negative-file"}, "FNAME", + format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_negative_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-batch"}, "N", + format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [](gpt_params & params, int value) { + params.n_pca_batch = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-iter"}, "N", + format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [](gpt_params & params, int value) { + params.n_pca_iterations = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--method"}, "{pca, mean}", + "dimensionality reduction method to be used (default: pca)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--output-format"}, "{md,jsonl}", + "output format for batched-bench results (default: md)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } + else if (value == "md") { params.batched_bench_output_jsonl = false; } + else { std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); +#ifndef LOG_DISABLE_LOGS + // TODO: make this looks less weird + add_opt(llama_arg( + {"--log-test"}, + "Log test", + [](gpt_params &) { log_param_single_parse("--log-test"); } + )); + add_opt(llama_arg( + {"--log-disable"}, + "Log disable", + [](gpt_params &) { log_param_single_parse("--log-disable"); } + )); + add_opt(llama_arg( + {"--log-enable"}, + "Log enable", + [](gpt_params &) { log_param_single_parse("--log-enable"); } + )); + add_opt(llama_arg( + {"--log-new"}, + "Log new", + [](gpt_params &) { log_param_single_parse("--log-new"); } + )); + add_opt(llama_arg( + {"--log-append"}, + "Log append", + [](gpt_params &) { log_param_single_parse("--log-append"); } + )); + add_opt(llama_arg( + {"--log-file"}, "FNAME", + "Log file", + [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } + )); +#endif // LOG_DISABLE_LOGS + + return ctx_arg; +} + diff --git a/common/arg.h b/common/arg.h new file mode 100644 index 000000000..413de2c88 --- /dev/null +++ b/common/arg.h @@ -0,0 +1,77 @@ +#pragma once + +#include "common.h" + +#include +#include +#include + +// +// CLI argument parsing +// + +struct llama_arg { + std::set examples = {LLAMA_EXAMPLE_COMMON}; + std::vector args; + const char * value_hint = nullptr; // help text or example for arg value + const char * value_hint_2 = nullptr; // for second arg value + const char * env = nullptr; + std::string help; + bool is_sparam = false; // is current arg a sampling param? + void (*handler_void) (gpt_params & params) = nullptr; + void (*handler_string) (gpt_params & params, const std::string &) = nullptr; + void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (gpt_params & params, int) = nullptr; + + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &) + ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, int) + ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + + llama_arg( + const std::initializer_list & args, + const std::string & help, + void (*handler)(gpt_params & params) + ) : args(args), help(help), handler_void(handler) {} + + // support 2 values for arg + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const char * value_hint_2, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &, const std::string &) + ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + + llama_arg & set_examples(std::initializer_list examples); + llama_arg & set_env(const char * env); + llama_arg & set_sparam(); + bool in_example(enum llama_example ex); + bool get_value_from_env(std::string & output); + bool has_value_from_env(); + std::string to_string(); +}; + +struct gpt_params_context { + enum llama_example ex = LLAMA_EXAMPLE_COMMON; + gpt_params & params; + std::vector options; + void(*print_usage)(int, char **) = nullptr; + gpt_params_context(gpt_params & params) : params(params) {} +}; + +// parse input arguments from CLI +// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); + +// function to be used by test-arg-parser +gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/common.cpp b/common/common.cpp index 166c7a1de..4a7f55a06 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -26,7 +26,6 @@ #include #include #include -#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -273,53 +272,6 @@ bool set_process_priority(enum ggml_sched_priority prio) { // CLI argument parsing // -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -static void gpt_params_handle_model_default(gpt_params & params) { - if (!params.hf_repo.empty()) { - // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { - throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); - } - params.hf_file = params.model; - } else if (params.model.empty()) { - params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); - } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); - f = string_split(f, '?').front(); - params.model = fs_get_cache_file(string_split(f, '/').back()); - } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; - } -} void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { int32_t n_set = 0; @@ -345,150 +297,6 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) } } -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options) { - std::string arg; - const std::string arg_prefix = "--"; - gpt_sampler_params & sparams = params.sparams; - - std::unordered_map arg_to_options; - for (auto & opt : options) { - for (const auto & arg : opt.args) { - arg_to_options[arg] = &opt; - } - } - - // handle environment variables - for (auto & opt : options) { - std::string value; - if (opt.get_value_from_env(value)) { - try { - if (opt.handler_void && (value == "1" || value == "true")) { - opt.handler_void(params); - } - if (opt.handler_int) { - opt.handler_int(params, std::stoi(value)); - } - if (opt.handler_string) { - opt.handler_string(params, value); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(format( - "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); - } - } - } - - // handle command line arguments - auto check_arg = [&](int i) { - if (i+1 >= argc) { - throw std::invalid_argument("expected value for argument"); - } - }; - - for (int i = 1; i < argc; i++) { - const std::string arg_prefix = "--"; - - std::string arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); - } - auto opt = *arg_to_options[arg]; - if (opt.has_value_from_env()) { - fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); - } - try { - if (opt.handler_void) { - opt.handler_void(params); - continue; - } - - // arg with single value - check_arg(i); - std::string val = argv[++i]; - if (opt.handler_int) { - opt.handler_int(params, std::stoi(val)); - continue; - } - if (opt.handler_string) { - opt.handler_string(params, val); - continue; - } - - // arg with 2 values - check_arg(i); - std::string val2 = argv[++i]; - if (opt.handler_str_str) { - opt.handler_str_str(params, val, val2); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(format( - "error while handling argument \"%s\": %s\n\n" - "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); - } - } - - postprocess_cpu_params(params.cpuparams, nullptr); - postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); - - if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { - throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); - } - - gpt_params_handle_model_default(params); - - if (params.escape) { - string_process_escapes(params.prompt); - string_process_escapes(params.input_prefix); - string_process_escapes(params.input_suffix); - for (auto & antiprompt : params.antiprompt) { - string_process_escapes(antiprompt); - } - } - - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - if (sparams.seed == LLAMA_DEFAULT_SEED) { - sparams.seed = time(NULL); - } - - return true; -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector & options) { - const auto params_org = params; // the example can modify the default params - - try { - if (!gpt_params_parse_ex(argc, argv, params, options)) { - params = params_org; - return false; - } - if (params.usage) { - gpt_params_print_usage(params, options); - if (params.print_usage) { - params.print_usage(argc, argv); - } - exit(0); - } - } catch (const std::invalid_argument & ex) { - fprintf(stderr, "%s\n", ex.what()); - params = params_org; - return false; - } - - return true; -} - bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { size_t dash_loc = range.find('-'); if (dash_loc == std::string::npos) { @@ -562,1743 +370,6 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { - std::vector result; - std::istringstream iss(input); - std::string line; - auto add_line = [&](const std::string& l) { - if (l.length() <= max_char_per_line) { - result.push_back(l); - } else { - std::istringstream line_stream(l); - std::string word, current_line; - while (line_stream >> word) { - if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { - if (!current_line.empty()) result.push_back(current_line); - current_line = word; - } else { - current_line += (!current_line.empty() ? " " : "") + word; - } - } - if (!current_line.empty()) result.push_back(current_line); - } - }; - while (std::getline(iss, line)) { - add_line(line); - } - return result; -} - -std::string llama_arg::to_string() { - // params for printing to console - const static int n_leading_spaces = 40; - const static int n_char_per_line_help = 70; // TODO: detect this based on current console - std::string leading_spaces(n_leading_spaces, ' '); - - std::ostringstream ss; - for (const auto arg : args) { - if (arg == args.front()) { - if (args.size() == 1) { - ss << arg; - } else { - // first arg is usually abbreviation, we need padding to make it more beautiful - auto tmp = std::string(arg) + ", "; - ss << format("%-7s", tmp.c_str()); - } - } else { - ss << arg << (arg != args.back() ? ", " : ""); - } - } - if (value_hint) ss << " " << value_hint; - if (value_hint_2) ss << " " << value_hint_2; - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); - } - const auto help_lines = break_str_into_lines(help, n_char_per_line_help); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; - } - return ss.str(); -} - -void gpt_params_print_usage(gpt_params & params, std::vector & options) { - auto print_options = [](std::vector & options) { - for (llama_arg * opt : options) { - printf("%s", opt->to_string().c_str()); - } - }; - - std::vector common_options; - std::vector specific_options; - for (auto & opt : options) { - // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example - if (opt.in_example(params.curr_ex)) { - specific_options.push_back(&opt); - } else { - common_options.push_back(&opt); - } - } - printf("----- common options -----\n\n"); - print_options(common_options); - // TODO: maybe convert enum llama_example to string - printf("\n\n----- example-specific options -----\n\n"); - print_options(specific_options); -} - -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex) { - return gpt_params_parser_init(params, ex, nullptr); -} - -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage) { - std::vector options; - params.print_usage = print_usage; - params.curr_ex = ex; - - std::string sampler_type_chars; - std::string sampler_type_names; - for (const auto & sampler : params.sparams.samplers) { - sampler_type_chars += gpt_sampler_type_to_chr(sampler); - sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; - } - sampler_type_names.pop_back(); - - - /** - * filter options by example - * rules: - * - all examples inherit options from LLAMA_EXAMPLE_COMMON - * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example - * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example - */ - auto add_opt = [&](llama_arg arg) { - if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { - options.push_back(std::move(arg)); - } - }; - - - add_opt(llama_arg( - {"-h", "--help", "--usage"}, - "print usage and exit", - [](gpt_params & params) { - params.usage = true; - } - )); - add_opt(llama_arg( - {"--version"}, - "show version and build info", - [](gpt_params &) { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - )); - add_opt(llama_arg( - {"-v", "--verbose"}, - "print verbose information", - [](gpt_params & params) { - params.verbosity = 1; - } - )); - add_opt(llama_arg( - {"--verbosity"}, "N", - format("set specific verbosity level (default: %d)", params.verbosity), - [](gpt_params & params, int value) { - params.verbosity = value; - } - )); - add_opt(llama_arg( - {"--verbose-prompt"}, - format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](gpt_params & params) { - params.verbose_prompt = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--no-display-prompt"}, - format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](gpt_params & params) { - params.display_prompt = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-co", "--color"}, - format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [](gpt_params & params) { - params.use_color = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"-s", "--seed"}, "SEED", - format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed), - [](gpt_params & params, const std::string & value) { - params.sparams.seed = std::stoul(value); - } - )); - add_opt(llama_arg( - {"-t", "--threads"}, "N", - format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](gpt_params & params, int value) { - params.cpuparams.n_threads = value; - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_env("LLAMA_ARG_THREADS")); - add_opt(llama_arg( - {"-tb", "--threads-batch"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads)", - [](gpt_params & params, int value) { - params.cpuparams_batch.n_threads = value; - if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - )); - add_opt(llama_arg( - {"-td", "--threads-draft"}, "N", - "number of threads to use during generation (default: same as --threads)", - [](gpt_params & params, int value) { - params.draft_cpuparams.n_threads = value; - if (params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-tbd", "--threads-batch-draft"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](gpt_params & params, int value) { - params.draft_cpuparams_batch.n_threads = value; - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-C", "--cpu-mask"}, "M", - "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](gpt_params & params, const std::string & mask) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - )); - add_opt(llama_arg( - {"-Cr", "--cpu-range"}, "lo-hi", - "range of CPUs for affinity. Complements --cpu-mask", - [](gpt_params & params, const std::string & range) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - )); - add_opt(llama_arg( - {"--cpu-strict"}, "<0|1>", - format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](gpt_params & params, const std::string & value) { - params.cpuparams.strict_cpu = std::stoul(value); - } - )); - add_opt(llama_arg( - {"--prio"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.cpuparams.priority = (enum ggml_sched_priority) prio; - } - )); - add_opt(llama_arg( - {"--poll"}, "<0...100>", - format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](gpt_params & params, const std::string & value) { - params.cpuparams.poll = std::stoul(value); - } - )); - add_opt(llama_arg( - {"-Cb", "--cpu-mask-batch"}, "M", - "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - )); - add_opt(llama_arg( - {"-Crb", "--cpu-range-batch"}, "lo-hi", - "ranges of CPUs for affinity. Complements --cpu-mask-batch", - [](gpt_params & params, const std::string & range) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - )); - add_opt(llama_arg( - {"--cpu-strict-batch"}, "<0|1>", - "use strict CPU placement (default: same as --cpu-strict)", - [](gpt_params & params, int value) { - params.cpuparams_batch.strict_cpu = value; - } - )); - add_opt(llama_arg( - {"--prio-batch"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - )); - add_opt(llama_arg( - {"--poll-batch"}, "<0|1>", - "use polling to wait for work (default: same as --poll)", - [](gpt_params & params, int value) { - params.cpuparams_batch.poll = value; - } - )); - add_opt(llama_arg( - {"-Cd", "--cpu-mask-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-Crd", "--cpu-range-draft"}, "lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft", - [](gpt_params & params, const std::string & range) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--cpu-strict-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](gpt_params & params, int value) { - params.draft_cpuparams.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--prio-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--poll-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: same as --poll])", - [](gpt_params & params, int value) { - params.draft_cpuparams.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-Cbd", "--cpu-mask-batch-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", - "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)", - [](gpt_params & params, const std::string & range) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--cpu-strict-batch-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](gpt_params & params, int value) { - params.draft_cpuparams_batch.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--prio-batch-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--poll-batch-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: --poll-draft)", - [](gpt_params & params, int value) { - params.draft_cpuparams_batch.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--draft"}, "N", - format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](gpt_params & params, int value) { - params.n_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-ps", "--p-split"}, "N", - format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](gpt_params & params, const std::string & value) { - params.p_split = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-lcs", "--lookup-cache-static"}, "FNAME", - "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](gpt_params & params, const std::string & value) { - params.lookup_cache_static = value; - } - )); - add_opt(llama_arg( - {"-lcd", "--lookup-cache-dynamic"}, "FNAME", - "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](gpt_params & params, const std::string & value) { - params.lookup_cache_dynamic = value; - } - )); - add_opt(llama_arg( - {"-c", "--ctx-size"}, "N", - format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](gpt_params & params, int value) { - params.n_ctx = value; - } - ).set_env("LLAMA_ARG_CTX_SIZE")); - add_opt(llama_arg( - {"-n", "--predict", "--n-predict"}, "N", - format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](gpt_params & params, int value) { - params.n_predict = value; - } - ).set_env("LLAMA_ARG_N_PREDICT")); - add_opt(llama_arg( - {"-b", "--batch-size"}, "N", - format("logical maximum batch size (default: %d)", params.n_batch), - [](gpt_params & params, int value) { - params.n_batch = value; - } - ).set_env("LLAMA_ARG_BATCH")); - add_opt(llama_arg( - {"-ub", "--ubatch-size"}, "N", - format("physical maximum batch size (default: %d)", params.n_ubatch), - [](gpt_params & params, int value) { - params.n_ubatch = value; - } - ).set_env("LLAMA_ARG_UBATCH")); - add_opt(llama_arg( - {"--keep"}, "N", - format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](gpt_params & params, int value) { - params.n_keep = value; - } - )); - add_opt(llama_arg( - {"--chunks"}, "N", - format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](gpt_params & params, int value) { - params.n_chunks = value; - } - )); - add_opt(llama_arg( - {"-fa", "--flash-attn"}, - format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](gpt_params & params) { - params.flash_attn = true; - } - ).set_env("LLAMA_ARG_FLASH_ATTN")); - add_opt(llama_arg( - {"-p", "--prompt"}, "PROMPT", - ex == LLAMA_EXAMPLE_MAIN - ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" - : "prompt to start generation with", - [](gpt_params & params, const std::string & value) { - params.prompt = value; - } - )); - add_opt(llama_arg( - {"-f", "--file"}, "FNAME", - "a file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - // store the external file name in params - params.prompt_file = value; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } - )); - add_opt(llama_arg( - {"--in-file"}, "FNAME", - "an input file (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - params.in_files.push_back(value); - } - )); - add_opt(llama_arg( - {"-bf", "--binary-file"}, "FNAME", - "binary file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - // store the external file name in params - params.prompt_file = value; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); - } - )); - add_opt(llama_arg( - {"-e", "--escape"}, - format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), - [](gpt_params & params) { - params.escape = true; - } - )); - add_opt(llama_arg( - {"--no-escape"}, - "do not process escape sequences", - [](gpt_params & params) { - params.escape = false; - } - )); - add_opt(llama_arg( - {"-ptc", "--print-token-count"}, "N", - format("print token count every N tokens (default: %d)", params.n_print), - [](gpt_params & params, int value) { - params.n_print = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--prompt-cache"}, "FNAME", - "file to cache prompt state for faster startup (default: none)", - [](gpt_params & params, const std::string & value) { - params.path_prompt_cache = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--prompt-cache-all"}, - "if specified, saves user input and generations to cache as well\n", - [](gpt_params & params) { - params.prompt_cache_all = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--prompt-cache-ro"}, - "if specified, uses the prompt cache but does not update it", - [](gpt_params & params) { - params.prompt_cache_ro = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-r", "--reverse-prompt"}, "PROMPT", - "halt generation at PROMPT, return control in interactive mode\n", - [](gpt_params & params, const std::string & value) { - params.antiprompt.emplace_back(value); - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-sp", "--special"}, - format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](gpt_params & params) { - params.special = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-cnv", "--conversation"}, - format( - "run in conversation mode:\n" - "- does not print special tokens and suffix/prefix\n" - "- interactive mode is also enabled\n" - "(default: %s)", - params.conversation ? "true" : "false" - ), - [](gpt_params & params) { - params.conversation = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-i", "--interactive"}, - format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](gpt_params & params) { - params.interactive = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-if", "--interactive-first"}, - format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), - [](gpt_params & params) { - params.interactive_first = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-mli", "--multiline-input"}, - "allows you to write or paste multiple lines without ending each in '\\'", - [](gpt_params & params) { - params.multiline_input = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--in-prefix-bos"}, - "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](gpt_params & params) { - params.input_prefix_bos = true; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--in-prefix"}, "STRING", - "string to prefix user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { - params.input_prefix = value; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--in-suffix"}, "STRING", - "string to suffix after user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { - params.input_suffix = value; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--no-warmup"}, - "skip warming up the model with an empty run", - [](gpt_params & params) { - params.warmup = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--spm-infill"}, - format( - "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", - params.spm_infill ? "enabled" : "disabled" - ), - [](gpt_params & params) { - params.spm_infill = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"--samplers"}, "SAMPLERS", - format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](gpt_params & params, const std::string & value) { - const auto sampler_names = string_split(value, ';'); - params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); - } - )); - add_opt(llama_arg( - {"--sampling-seq"}, "SEQUENCE", - format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](gpt_params & params, const std::string & value) { - params.sparams.samplers = gpt_sampler_types_from_chars(value); - } - )); - add_opt(llama_arg( - {"--ignore-eos"}, - "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](gpt_params & params) { - params.sparams.ignore_eos = true; - } - )); - add_opt(llama_arg( - {"--penalize-nl"}, - format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), - [](gpt_params & params) { - params.sparams.penalize_nl = true; - } - )); - add_opt(llama_arg( - {"--temp"}, "N", - format("temperature (default: %.1f)", (double)params.sparams.temp), - [](gpt_params & params, const std::string & value) { - params.sparams.temp = std::stof(value); - params.sparams.temp = std::max(params.sparams.temp, 0.0f); - } - )); - add_opt(llama_arg( - {"--top-k"}, "N", - format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), - [](gpt_params & params, int value) { - params.sparams.top_k = value; - } - )); - add_opt(llama_arg( - {"--top-p"}, "N", - format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), - [](gpt_params & params, const std::string & value) { - params.sparams.top_p = std::stof(value); - } - )); - add_opt(llama_arg( - {"--min-p"}, "N", - format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), - [](gpt_params & params, const std::string & value) { - params.sparams.min_p = std::stof(value); - } - )); - add_opt(llama_arg( - {"--tfs"}, "N", - format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), - [](gpt_params & params, const std::string & value) { - params.sparams.tfs_z = std::stof(value); - } - )); - add_opt(llama_arg( - {"--typical"}, "N", - format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), - [](gpt_params & params, const std::string & value) { - params.sparams.typ_p = std::stof(value); - } - )); - add_opt(llama_arg( - {"--repeat-last-n"}, "N", - format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), - [](gpt_params & params, int value) { - params.sparams.penalty_last_n = value; - params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); - } - )); - add_opt(llama_arg( - {"--repeat-penalty"}, "N", - format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), - [](gpt_params & params, const std::string & value) { - params.sparams.penalty_repeat = std::stof(value); - } - )); - add_opt(llama_arg( - {"--presence-penalty"}, "N", - format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), - [](gpt_params & params, const std::string & value) { - params.sparams.penalty_present = std::stof(value); - } - )); - add_opt(llama_arg( - {"--frequency-penalty"}, "N", - format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), - [](gpt_params & params, const std::string & value) { - params.sparams.penalty_freq = std::stof(value); - } - )); - add_opt(llama_arg( - {"--dynatemp-range"}, "N", - format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), - [](gpt_params & params, const std::string & value) { - params.sparams.dynatemp_range = std::stof(value); - } - )); - add_opt(llama_arg( - {"--dynatemp-exp"}, "N", - format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), - [](gpt_params & params, const std::string & value) { - params.sparams.dynatemp_exponent = std::stof(value); - } - )); - add_opt(llama_arg( - {"--mirostat"}, "N", - format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), - [](gpt_params & params, int value) { - params.sparams.mirostat = value; - } - )); - add_opt(llama_arg( - {"--mirostat-lr"}, "N", - format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), - [](gpt_params & params, const std::string & value) { - params.sparams.mirostat_eta = std::stof(value); - } - )); - add_opt(llama_arg( - {"--mirostat-ent"}, "N", - format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), - [](gpt_params & params, const std::string & value) { - params.sparams.mirostat_tau = std::stof(value); - } - )); - add_opt(llama_arg( - {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", - "modifies the likelihood of token appearing in the completion,\n" - "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" - "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](gpt_params & params, const std::string & value) { - std::stringstream ss(value); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - params.sparams.logit_bias.push_back({key, bias}); - } else { - throw std::invalid_argument("invalid input format"); - } - } catch (const std::exception&) { - throw std::invalid_argument("invalid input format"); - } - } - )); - add_opt(llama_arg( - {"--grammar"}, "GRAMMAR", - format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), - [](gpt_params & params, const std::string & value) { - params.sparams.grammar = value; - } - )); - add_opt(llama_arg( - {"--grammar-file"}, "FNAME", - "file to read grammar from", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(params.sparams.grammar) - ); - } - )); - add_opt(llama_arg( - {"-j", "--json-schema"}, "SCHEMA", - "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](gpt_params & params, const std::string & value) { - //params.sparams.grammar = json_schema_to_grammar(json::parse(value)); - } - )); - add_opt(llama_arg( - {"--pooling"}, "{none,mean,cls,last}", - "pooling type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--attention"}, "{causal,non,causal}", - "attention type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--rope-scaling"}, "{none,linear,yarn}", - "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { throw std::invalid_argument("invalid value"); } - } - )); - add_opt(llama_arg( - {"--rope-scale"}, "N", - "RoPE context scaling factor, expands context by a factor of N", - [](gpt_params & params, const std::string & value) { - params.rope_freq_scale = 1.0f / std::stof(value); - } - )); - add_opt(llama_arg( - {"--rope-freq-base"}, "N", - "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](gpt_params & params, const std::string & value) { - params.rope_freq_base = std::stof(value); - } - )); - add_opt(llama_arg( - {"--rope-freq-scale"}, "N", - "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](gpt_params & params, const std::string & value) { - params.rope_freq_scale = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-orig-ctx"}, "N", - format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](gpt_params & params, int value) { - params.yarn_orig_ctx = value; - } - )); - add_opt(llama_arg( - {"--yarn-ext-factor"}, "N", - format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](gpt_params & params, const std::string & value) { - params.yarn_ext_factor = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-attn-factor"}, "N", - format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](gpt_params & params, const std::string & value) { - params.yarn_attn_factor = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-beta-slow"}, "N", - format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](gpt_params & params, const std::string & value) { - params.yarn_beta_slow = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-beta-fast"}, "N", - format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](gpt_params & params, const std::string & value) { - params.yarn_beta_fast = std::stof(value); - } - )); - add_opt(llama_arg( - {"-gan", "--grp-attn-n"}, "N", - format("group-attention factor (default: %d)", params.grp_attn_n), - [](gpt_params & params, int value) { - params.grp_attn_n = value; - } - )); - add_opt(llama_arg( - {"-gaw", "--grp-attn-w"}, "N", - format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [](gpt_params & params, int value) { - params.grp_attn_w = value; - } - )); - add_opt(llama_arg( - {"-dkvc", "--dump-kv-cache"}, - "verbose print of the KV cache", - [](gpt_params & params) { - params.dump_kv_cache = true; - } - )); - add_opt(llama_arg( - {"-nkvo", "--no-kv-offload"}, - "disable KV offload", - [](gpt_params & params) { - params.no_kv_offload = true; - } - )); - add_opt(llama_arg( - {"-ctk", "--cache-type-k"}, "TYPE", - format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [](gpt_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_k = value; - } - )); - add_opt(llama_arg( - {"-ctv", "--cache-type-v"}, "TYPE", - format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [](gpt_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_v = value; - } - )); - add_opt(llama_arg( - {"--perplexity", "--all-logits"}, - format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](gpt_params & params) { - params.logits_all = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--hellaswag"}, - "compute HellaSwag score over random tasks from datafile supplied with -f", - [](gpt_params & params) { - params.hellaswag = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--hellaswag-tasks"}, "N", - format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](gpt_params & params, int value) { - params.hellaswag_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--winogrande"}, - "compute Winogrande score over random tasks from datafile supplied with -f", - [](gpt_params & params) { - params.winogrande = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--winogrande-tasks"}, "N", - format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](gpt_params & params, int value) { - params.winogrande_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--multiple-choice"}, - "compute multiple choice score over random tasks from datafile supplied with -f", - [](gpt_params & params) { - params.multiple_choice = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--multiple-choice-tasks"}, "N", - format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](gpt_params & params, int value) { - params.multiple_choice_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--kl-divergence"}, - "computes KL-divergence to logits provided via --kl-divergence-base", - [](gpt_params & params) { - params.kl_divergence = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--save-all-logits", "--kl-divergence-base"}, "FNAME", - "set logits file", - [](gpt_params & params, const std::string & value) { - params.logits_file = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--ppl-stride"}, "N", - format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](gpt_params & params, int value) { - params.ppl_stride = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--ppl-output-type"}, "<0|1>", - format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](gpt_params & params, int value) { - params.ppl_output_type = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"-dt", "--defrag-thold"}, "N", - format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](gpt_params & params, const std::string & value) { - params.defrag_thold = std::stof(value); - } - ).set_env("LLAMA_ARG_DEFRAG_THOLD")); - add_opt(llama_arg( - {"-np", "--parallel"}, "N", - format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](gpt_params & params, int value) { - params.n_parallel = value; - } - )); - add_opt(llama_arg( - {"-ns", "--sequences"}, "N", - format("number of sequences to decode (default: %d)", params.n_sequences), - [](gpt_params & params, int value) { - params.n_sequences = value; - } - )); - add_opt(llama_arg( - {"-cb", "--cont-batching"}, - format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](gpt_params & params) { - params.cont_batching = true; - } - ).set_env("LLAMA_ARG_CONT_BATCHING")); - add_opt(llama_arg( - {"-nocb", "--no-cont-batching"}, - "disable continuous batching", - [](gpt_params & params) { - params.cont_batching = false; - } - ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); - add_opt(llama_arg( - {"--mmproj"}, "FILE", - "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](gpt_params & params, const std::string & value) { - params.mmproj = value; - } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); - add_opt(llama_arg( - {"--image"}, "FILE", - "path to an image file. use with multimodal models. Specify multiple times for batching", - [](gpt_params & params, const std::string & value) { - params.image.emplace_back(value); - } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); -#ifdef GGML_USE_RPC - add_opt(llama_arg( - {"--rpc"}, "SERVERS", - "comma separated list of RPC servers", - [](gpt_params & params, const std::string & value) { - params.rpc_servers = value; - } - )); -#endif - add_opt(llama_arg( - {"--mlock"}, - "force system to keep model in RAM rather than swapping or compressing", - [](gpt_params & params) { - params.use_mlock = true; - } - )); - add_opt(llama_arg( - {"--no-mmap"}, - "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](gpt_params & params) { - params.use_mmap = false; - } - )); - add_opt(llama_arg( - {"--numa"}, "TYPE", - "attempt optimizations that help on some NUMA systems\n" - "- distribute: spread execution evenly over all nodes\n" - "- isolate: only spawn threads on CPUs on the node that execution started on\n" - "- numactl: use the CPU map provided by numactl\n" - "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { throw std::invalid_argument("invalid value"); } - } - )); - add_opt(llama_arg( - {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", - "number of layers to store in VRAM", - [](gpt_params & params, int value) { - params.n_gpu_layers = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - ).set_env("LLAMA_ARG_N_GPU_LAYERS")); - add_opt(llama_arg( - {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", - "number of layers to store in VRAM for the draft model", - [](gpt_params & params, int value) { - params.n_gpu_layers_draft = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-sm", "--split-mode"}, "{none,layer,row}", - "how to split the model across multiple GPUs, one of:\n" - "- none: use one GPU only\n" - "- layer (default): split layers and KV across GPUs\n" - "- row: split rows across GPUs", - [](gpt_params & params, const std::string & value) { - std::string arg_next = value; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } - else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } - else { - throw std::invalid_argument("invalid value"); - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - } - )); - add_opt(llama_arg( - {"-ts", "--tensor-split"}, "N0,N1,N2,...", - "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](gpt_params & params, const std::string & value) { - std::string arg_next = value; - - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { - throw std::invalid_argument( - format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) - ); - } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } else { - params.tensor_split[i] = 0.0f; - } - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - } - )); - add_opt(llama_arg( - {"-mg", "--main-gpu"}, "INDEX", - format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](gpt_params & params, int value) { - params.main_gpu = value; -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - } - )); - add_opt(llama_arg( - {"--check-tensors"}, - format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](gpt_params & params) { - params.check_tensors = true; - } - )); - add_opt(llama_arg( - {"--override-kv"}, "KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](gpt_params & params, const std::string & value) { - if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); - } - } - )); - add_opt(llama_arg( - {"--lora"}, "FNAME", - "path to LoRA adapter (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0 }); - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( - {"--lora-scaled"}, "FNAME", "SCALE", - "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale) }); - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( - {"--control-vector"}, "FNAME", - "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](gpt_params & params, const std::string & value) { - params.control_vectors.push_back({ 1.0f, value, }); - } - )); - add_opt(llama_arg( - {"--control-vector-scaled"}, "FNAME", "SCALE", - "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors", - [](gpt_params & params, const std::string & fname, const std::string & scale) { - params.control_vectors.push_back({ std::stof(scale), fname }); - } - )); - add_opt(llama_arg( - {"--control-vector-layer-range"}, "START", "END", - "layer range to apply the control vector(s) to, start and end inclusive", - [](gpt_params & params, const std::string & start, const std::string & end) { - params.control_vector_layer_start = std::stoi(start); - params.control_vector_layer_end = std::stoi(end); - } - )); - add_opt(llama_arg( - {"-a", "--alias"}, "STRING", - "set alias for model name (to be used by REST API)", - [](gpt_params & params, const std::string & value) { - params.model_alias = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"-m", "--model"}, "FNAME", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? std::string("model path from which to load base model") - : format( - "model path (default: `models/$filename` with filename from `--hf-file` " - "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH - ), - [](gpt_params & params, const std::string & value) { - params.model = value; - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); - add_opt(llama_arg( - {"-md", "--model-draft"}, "FNAME", - "draft model for speculative decoding (default: unused)", - [](gpt_params & params, const std::string & value) { - params.model_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-mu", "--model-url"}, "MODEL_URL", - "model download url (default: unused)", - [](gpt_params & params, const std::string & value) { - params.model_url = value; - } - ).set_env("LLAMA_ARG_MODEL_URL")); - add_opt(llama_arg( - {"-hfr", "--hf-repo"}, "REPO", - "Hugging Face model repository (default: unused)", - [](gpt_params & params, const std::string & value) { - params.hf_repo = value; - } - ).set_env("LLAMA_ARG_HF_REPO")); - add_opt(llama_arg( - {"-hff", "--hf-file"}, "FILE", - "Hugging Face model file (default: unused)", - [](gpt_params & params, const std::string & value) { - params.hf_file = value; - } - ).set_env("LLAMA_ARG_HF_FILE")); - add_opt(llama_arg( - {"-hft", "--hf-token"}, "TOKEN", - "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](gpt_params & params, const std::string & value) { - params.hf_token = value; - } - ).set_env("HF_TOKEN")); - add_opt(llama_arg( - {"--context-file"}, "FNAME", - "file to load context from (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - params.context_files.push_back(value); - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( - {"--chunk-size"}, "N", - format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](gpt_params & params, int value) { - params.chunk_size = value; - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( - {"--chunk-separator"}, "STRING", - format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](gpt_params & params, const std::string & value) { - params.chunk_separator = value; - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( - {"--junk"}, "N", - format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](gpt_params & params, int value) { - params.n_junk = value; - } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( - {"--pos"}, "N", - format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](gpt_params & params, int value) { - params.i_pos = value; - } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( - {"-o", "--output", "--output-file"}, "FNAME", - format("output file (default: '%s')", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? params.lora_outfile.c_str() - : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR - ? params.cvector_outfile.c_str() - : params.out_file.c_str()), - [](gpt_params & params, const std::string & value) { - params.out_file = value; - params.cvector_outfile = value; - params.lora_outfile = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( - {"-ofreq", "--output-frequency"}, "N", - format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](gpt_params & params, int value) { - params.n_out_freq = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--save-frequency"}, "N", - format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](gpt_params & params, int value) { - params.n_save_freq = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--process-output"}, - format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](gpt_params & params) { - params.process_output = true; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--no-ppl"}, - format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](gpt_params & params) { - params.compute_ppl = false; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--chunk", "--from-chunk"}, "N", - format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](gpt_params & params, int value) { - params.i_chunk = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"-pps"}, - format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), - [](gpt_params & params) { - params.is_pp_shared = true; - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"-npp"}, "n0,n1,...", - "number of prompt tokens", - [](gpt_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"-ntg"}, "n0,n1,...", - "number of text generation tokens", - [](gpt_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"-npl"}, "n0,n1,...", - "number of parallel prompts", - [](gpt_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"--embd-normalize"}, "N", - format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](gpt_params & params, int value) { - params.embd_normalize = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--embd-output-format"}, "FORMAT", - "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](gpt_params & params, const std::string & value) { - params.embd_out = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--embd-separator"}, "STRING", - "separator of embendings (default \\n) for example \"<#sep#>\"", - [](gpt_params & params, const std::string & value) { - params.embd_sep = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--host"}, "HOST", - format("ip address to listen (default: %s)", params.hostname.c_str()), - [](gpt_params & params, const std::string & value) { - params.hostname = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); - add_opt(llama_arg( - {"--port"}, "PORT", - format("port to listen (default: %d)", params.port), - [](gpt_params & params, int value) { - params.port = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); - add_opt(llama_arg( - {"--path"}, "PATH", - format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](gpt_params & params, const std::string & value) { - params.public_path = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--embedding", "--embeddings"}, - format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](gpt_params & params) { - params.embedding = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); - add_opt(llama_arg( - {"--api-key"}, "KEY", - "API key to use for authentication (default: none)", - [](gpt_params & params, const std::string & value) { - params.api_keys.push_back(value); - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); - add_opt(llama_arg( - {"--api-key-file"}, "FNAME", - "path to file containing API keys (default: none)", - [](gpt_params & params, const std::string & value) { - std::ifstream key_file(value); - if (!key_file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::string key; - while (std::getline(key_file, key)) { - if (!key.empty()) { - params.api_keys.push_back(key); - } - } - key_file.close(); - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--ssl-key-file"}, "FNAME", - "path to file a PEM-encoded SSL private key", - [](gpt_params & params, const std::string & value) { - params.ssl_file_key = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--ssl-cert-file"}, "FNAME", - "path to file a PEM-encoded SSL certificate", - [](gpt_params & params, const std::string & value) { - params.ssl_file_cert = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"-to", "--timeout"}, "N", - format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](gpt_params & params, int value) { - params.timeout_read = value; - params.timeout_write = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--threads-http"}, "N", - format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](gpt_params & params, int value) { - params.n_threads_http = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); - add_opt(llama_arg( - {"-spf", "--system-prompt-file"}, "FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--log-format"}, "{text, json}", - "log output format: json or text (default: json)", - [](gpt_params & params, const std::string & value) { - if (value == "json") { - params.log_json = true; - } else if (value == "text") { - params.log_json = false; - } else { - throw std::invalid_argument("invalid value"); - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--metrics"}, - format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](gpt_params & params) { - params.endpoint_metrics = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); - add_opt(llama_arg( - {"--no-slots"}, - format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](gpt_params & params) { - params.endpoint_slots = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); - add_opt(llama_arg( - {"--slot-save-path"}, "PATH", - "path to save slot kv cache (default: disabled)", - [](gpt_params & params, const std::string & value) { - params.slot_save_path = value; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--chat-template"}, "JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [](gpt_params & params, const std::string & value) { - if (!llama_chat_verify_template(value)) { - throw std::runtime_error(format( - "error: the supplied chat template is not supported: %s\n" - "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", - value.c_str() - )); - } - params.chat_template = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); - add_opt(llama_arg( - {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", - format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](gpt_params & params, const std::string & value) { - params.slot_prompt_similarity = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--lora-init-without-apply"}, - format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), - [](gpt_params & params) { - params.lora_init_without_apply = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--simple-io"}, - "use basic IO for better compatibility in subprocesses and limited consoles", - [](gpt_params & params) { - params.simple_io = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"-ld", "--logdir"}, "LOGDIR", - "path under which to save YAML logs (no logging if unset)", - [](gpt_params & params, const std::string & value) { - params.logdir = value; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } - } - )); - add_opt(llama_arg( - {"--positive-file"}, "FNAME", - format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](gpt_params & params, const std::string & value) { - params.cvector_positive_file = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--negative-file"}, "FNAME", - format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](gpt_params & params, const std::string & value) { - params.cvector_negative_file = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--pca-batch"}, "N", - format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](gpt_params & params, int value) { - params.n_pca_batch = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--pca-iter"}, "N", - format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](gpt_params & params, int value) { - params.n_pca_iterations = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--method"}, "{pca, mean}", - "dimensionality reduction method to be used (default: pca)", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--output-format"}, "{md,jsonl}", - "output format for batched-bench results (default: md)", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } - else if (value == "md") { params.batched_bench_output_jsonl = false; } - else { std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); -#ifndef LOG_DISABLE_LOGS - // TODO: make this looks less weird - add_opt(llama_arg( - {"--log-test"}, - "Log test", - [](gpt_params &) { log_param_single_parse("--log-test"); } - )); - add_opt(llama_arg( - {"--log-disable"}, - "Log disable", - [](gpt_params &) { log_param_single_parse("--log-disable"); } - )); - add_opt(llama_arg( - {"--log-enable"}, - "Log enable", - [](gpt_params &) { log_param_single_parse("--log-enable"); } - )); - add_opt(llama_arg( - {"--log-new"}, - "Log new", - [](gpt_params &) { log_param_single_parse("--log-new"); } - )); - add_opt(llama_arg( - {"--log-append"}, - "Log append", - [](gpt_params &) { log_param_single_parse("--log-append"); } - )); - add_opt(llama_arg( - {"--log-file"}, "FNAME", - "Log file", - [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } - )); -#endif // LOG_DISABLE_LOGS - - return options; -} - std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; diff --git a/common/common.h b/common/common.h index 480e597a8..4cb978a7c 100644 --- a/common/common.h +++ b/common/common.h @@ -4,20 +4,11 @@ #include "llama.h" -#include "sampling.h" - #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" -#include #include #include -#include -#include -#include -#include -#include -#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -52,11 +43,20 @@ struct llama_control_vector_load_info; // CPU utils // +struct cpu_params { + int n_threads = -1; + bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. + bool mask_valid = false; // Default: any CPU + enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) + bool strict_cpu = false; // Use strict CPU placement + uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) +}; + int32_t cpu_get_num_physical_cores(); int32_t cpu_get_num_math(); // -// CLI argument parsing +// Common params // enum llama_example { @@ -74,30 +74,71 @@ enum llama_example { LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_LLAVA, + LLAMA_EXAMPLE_LOOKUP, + LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_COUNT, }; +enum gpt_sampler_type { + GPT_SAMPLER_TYPE_NONE = 0, + GPT_SAMPLER_TYPE_TOP_K = 1, + GPT_SAMPLER_TYPE_TOP_P = 2, + GPT_SAMPLER_TYPE_MIN_P = 3, + GPT_SAMPLER_TYPE_TFS_Z = 4, + GPT_SAMPLER_TYPE_TYPICAL_P = 5, + GPT_SAMPLER_TYPE_TEMPERATURE = 6, +}; + // dimensionality reduction methods, used by cvector-generator enum dimre_method { DIMRE_METHOD_PCA, DIMRE_METHOD_MEAN, }; -struct cpu_params { - int n_threads = -1; - bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. - bool mask_valid = false; // Default: any CPU - enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) - bool strict_cpu = false; // Use strict CPU placement - uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) +// sampler parameters +struct gpt_sampler_params { + uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler + + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = false; // consider newlines as a repeatable token + bool ignore_eos = false; + + std::vector samplers = { + GPT_SAMPLER_TYPE_TOP_K, + GPT_SAMPLER_TYPE_TFS_Z, + GPT_SAMPLER_TYPE_TYPICAL_P, + GPT_SAMPLER_TYPE_TOP_P, + GPT_SAMPLER_TYPE_MIN_P, + GPT_SAMPLER_TYPE_TEMPERATURE + }; + + std::string grammar; // optional BNF-like grammar to constrain sampling + + std::vector logit_bias; // logit biases to apply + + // print the parameters into a string + std::string print() const; }; struct gpt_params { - enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON; - - uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -139,54 +180,25 @@ struct gpt_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - // sampling parameters - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.0f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled - float smoothing_factor = 0.00f; // 0.00 = disabled - float repeat_penalty = 1.10f; // 1.0 = disabled - int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float rep_pen_slope = 1.0f; - float frequency_penalty = 0.00f; // 0.0 = disabled - float presence_penalty = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - float dry_multiplier = 0.0f; // penalty multiplier, 0.0 = disabled - float dry_base = 1.75f; // exponential base - int32_t dry_allowed_length = 2; // repeated sequences longer than this are penalized - int32_t dry_penalty_last_n = 0; // how many tokens to scan for repetitions (0 = entire context) - std::vector dry_sequence_breakers; // DRY sequence breakers - float xtc_threshold = 0; - float xtc_probability = 0; - - // DynaTemp! - float dynatemp_range = 0.0f; // enables DynaTemp if greater than 0. dynatemp_min = temperature - dt_range, dynatemp_max = temperature + dt_range - float dynatemp_exponent = 1.0f; - - struct gpt_sampler_params sparams; - std::string model = ""; // model path - std::string model_draft = ""; // draft model for speculative decoding - std::string model_alias = "unknown"; // model alias - std::string model_url = ""; // model url to download - std::string hf_token = ""; // HF token - std::string hf_repo = ""; // HF repo - std::string hf_file = ""; // HF file - std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with - std::string logdir = ""; // directory in which to save YAML log files - std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding - std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding - std::string logits_file = ""; // file for saving *all* logits - std::string rpc_servers = ""; // comma separated list of RPC servers + std::string model = ""; // model path // NOLINT + std::string model_draft = ""; // draft model for speculative decoding // NOLINT + std::string model_alias = "unknown"; // model alias // NOLINT + std::string model_url = ""; // model url to download // NOLINT + std::string hf_token = ""; // HF token // NOLINT + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT + std::string prompt = ""; // NOLINT + std::string prompt_file = ""; // store the external prompt file name // NOLINT + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT + std::string input_prefix = ""; // string to prefix user inputs with // NOLINT + std::string input_suffix = ""; // string to suffix user inputs with // NOLINT + std::string logdir = ""; // directory in which to save YAML log files // NOLINT + std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT + std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT + std::string logits_file = ""; // file for saving *all* logits // NOLINT + std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) @@ -216,7 +228,6 @@ struct gpt_params { bool kl_divergence = false; // compute KL divergence - std::function print_usage = nullptr; // print example-specific usage and example bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs bool special = false; // enable special token output @@ -247,7 +258,7 @@ struct gpt_params { std::string cache_type_v = "f16"; // KV cache data type for the V // multimodal models (see examples/llava) - std::string mmproj = ""; // path to multimodal projector + std::string mmproj = ""; // path to multimodal projector // NOLINT std::vector image; // path to image file(s) // embedding @@ -263,15 +274,15 @@ struct gpt_params { int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) std::string hostname = "127.0.0.1"; - std::string public_path = ""; - std::string chat_template = ""; - std::string system_prompt = ""; + std::string public_path = ""; // NOLINT + std::string chat_template = ""; // NOLINT + std::string system_prompt = ""; // NOLINT bool enable_chat_template = true; std::vector api_keys; - std::string ssl_file_key = ""; - std::string ssl_file_cert = ""; + std::string ssl_file_key = ""; // NOLINT + std::string ssl_file_cert = ""; // NOLINT bool endpoint_slots = true; bool endpoint_metrics = false; @@ -326,92 +337,6 @@ struct gpt_params { bool batched_bench_output_jsonl = false; }; -struct llama_arg { - std::set examples = {LLAMA_EXAMPLE_COMMON}; - std::vector args; - const char * value_hint = nullptr; // help text or example for arg value - const char * value_hint_2 = nullptr; // for second arg value - const char * env = nullptr; - std::string help; - void (*handler_void) (gpt_params & params) = nullptr; - void (*handler_string) (gpt_params & params, const std::string &) = nullptr; - void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; - void (*handler_int) (gpt_params & params, int) = nullptr; - - llama_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, - void (*handler)(gpt_params & params, const std::string &) - ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - - llama_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, - void (*handler)(gpt_params & params, int) - ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - - llama_arg( - const std::initializer_list & args, - const std::string & help, - void (*handler)(gpt_params & params) - ) : args(args), help(help), handler_void(handler) {} - - // support 2 values for arg - llama_arg( - const std::initializer_list & args, - const char * value_hint, - const char * value_hint_2, - const std::string & help, - void (*handler)(gpt_params & params, const std::string &, const std::string &) - ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - - llama_arg & set_examples(std::initializer_list examples) { - this->examples = std::move(examples); - return *this; - } - - llama_arg & set_env(const char * env) { - help = help + "\n(env: " + env + ")"; - this->env = env; - return *this; - } - - bool in_example(enum llama_example ex) { - return examples.find(ex) != examples.end(); - } - - bool get_value_from_env(std::string & output) const { - if (env == nullptr) return false; - char * value = std::getenv(env); - if (value) { - output = value; - return true; - } - return false; - } - - bool has_value_from_env() const { - return env != nullptr && std::getenv(env); - } - - std::string to_string(); -}; - -// initialize list of options (arguments) that can be used by the current example -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); -// optionally, we can provide "print_usage" to print example usage -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage); - -// parse input arguments from CLI -// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) -bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options); - -// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set -void gpt_params_print_usage(gpt_params & params, std::vector & options); - std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); @@ -597,6 +522,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector +#include + // the ring buffer works similarly to std::deque, but with a fixed capacity // TODO: deduplicate with llama-impl.h template @@ -420,7 +423,7 @@ std::vector gpt_sampler_types_from_names(const std::vector gpt_sampler_types_from_chars(const std::string & chars) { - std::unordered_map sampler_name_map { + std::unordered_map sampler_name_map = { { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, diff --git a/common/sampling.h b/common/sampling.h index 654e0c513..0a4461fab 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -2,61 +2,11 @@ #include "llama.h" +#include "common.h" + #include #include -enum gpt_sampler_type { - GPT_SAMPLER_TYPE_NONE = 0, - GPT_SAMPLER_TYPE_TOP_K = 1, - GPT_SAMPLER_TYPE_TOP_P = 2, - GPT_SAMPLER_TYPE_MIN_P = 3, - GPT_SAMPLER_TYPE_TFS_Z = 4, - GPT_SAMPLER_TYPE_TYPICAL_P = 5, - GPT_SAMPLER_TYPE_TEMPERATURE = 6, -}; - -// sampling parameters -struct gpt_sampler_params { - uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = false; // consider newlines as a repeatable token - bool ignore_eos = false; - - std::vector samplers = { - GPT_SAMPLER_TYPE_TOP_K, - GPT_SAMPLER_TYPE_TFS_Z, - GPT_SAMPLER_TYPE_TYPICAL_P, - GPT_SAMPLER_TYPE_TOP_P, - GPT_SAMPLER_TYPE_MIN_P, - GPT_SAMPLER_TYPE_TEMPERATURE - }; - - std::string grammar; // optional BNF-like grammar to constrain sampling - - std::vector logit_bias; // logit biases to apply - - // print the parameters into a string - std::string print() const; -}; - // gpt_sampler extends llama_sampler with additional functionality: // // - grammar support diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0a9bbc829..ca473244e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -302,6 +302,8 @@ class Model: gguf.MODEL_TENSOR.TIME_MIX_FIRST, gguf.MODEL_TENSOR.TIME_MIX_W1, gguf.MODEL_TENSOR.TIME_MIX_W2, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, ) ) or not new_name.endswith(".weight") diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index f3b0c433b..a91e7f4bd 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -37,8 +38,7 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { return 1; } diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 4bc2bbf2c..9f7c49492 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -140,8 +140,6 @@ while n_cur <= n_len { let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) - llama_sampler_accept(smpl, new_token_id) - // is it an end of stream? -> mark the stream as finished if llama_token_is_eog(model, new_token_id) || n_cur == n_len { i_batch[i] = -1 diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index f5f309022..5d32153fe 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -18,8 +19,7 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } @@ -172,8 +172,6 @@ int main(int argc, char ** argv) { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); - llama_sampler_accept(smpl, new_token_id); - // is it an end of generation? -> mark the stream as finished if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 0795175a1..569b6c38f 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include "ggml.h" @@ -388,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { return 1; } diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 6ec3141af..05c66856c 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -12,12 +12,9 @@ #include #include +#include #include -#include #include -#include -#include -#include #define DEBUG_POS 5 diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp deleted file mode 100644 index 7a141f953..000000000 --- a/examples/embedding/embedding.cpp +++ /dev/null @@ -1,321 +0,0 @@ -#include "build-info.h" -#include "common.h" -#include "llama.h" - -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static std::vector split_lines(const std::string & s, const std::string & separator = "\n") { - std::vector lines; - size_t start = 0; - size_t end = s.find(separator); - - while (end != std::string::npos) { - lines.push_back(s.substr(start, end - start)); - start = end + separator.length(); - end = s.find(separator, start); - } - - lines.push_back(s.substr(start)); // Add the last part - - return lines; -} - -static void batch_add_seq(llama_batch & batch, const std::vector & tokens, llama_seq_id seq_id) { - size_t n_tokens = tokens.size(); - for (size_t i = 0; i < n_tokens; i++) { - llama_batch_add(batch, tokens[i], i, { seq_id }, true); - } -} - -static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) { - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - const struct llama_model * model = llama_get_model(ctx); - - // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); - - // run model - fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) { - // encoder-only model - if (llama_encode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to encode\n", __func__); - } - } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) { - // decoder-only model - if (llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to decode\n", __func__); - } - } - - for (int i = 0; i < batch.n_tokens; i++) { - if (!batch.logits[i]) { - continue; - } - - const float * embd = nullptr; - int embd_pos = 0; - - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - // try to get token embeddings - embd = llama_get_embeddings_ith(ctx, i); - embd_pos = i; - GGML_ASSERT(embd != NULL && "failed to get token embeddings"); - } else { - // try to get sequence embeddings - supported only when pooling_type is not NONE - embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - embd_pos = batch.seq_id[i][0]; - GGML_ASSERT(embd != NULL && "failed to get sequence embeddings"); - } - - float * out = output + embd_pos * n_embd; - llama_embd_normalize(embd, out, n_embd, embd_norm); - } -} - -int main(int argc, char ** argv) { - gpt_params params; - - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING); - if (!gpt_params_parse(argc, argv, params, options)) { - return 1; - } - - params.embedding = true; - // For non-causal models, batch size must be equal to ubatch size - params.n_ubatch = params.n_batch; - - print_build_info(); - - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - - llama_backend_init(); - llama_numa_init(params.numa); - - // load the model - llama_init_result llama_init = llama_init_from_gpt_params(params); - - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return 1; - } - - const int n_ctx_train = llama_n_ctx_train(model); - const int n_ctx = llama_n_ctx(ctx); - - const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); - - if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) { - fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__); - return 1; - } - - if (n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); - } - - // split the prompt into lines - std::vector prompts = split_lines(params.prompt, params.embd_sep); - - // max batch size - const uint64_t n_batch = params.n_batch; - GGML_ASSERT(params.n_batch >= params.n_ctx); - - // tokenize the prompts and trim - std::vector> inputs; - for (const auto & prompt : prompts) { - auto inp = ::llama_tokenize(ctx, prompt, true, false); - if (inp.size() > n_batch) { - fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", - __func__, (long long int) inp.size(), (long long int) n_batch); - return 1; - } - inputs.push_back(inp); - } - - // check if the last token is SEP - // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' - for (auto & inp : inputs) { - if (inp.empty() || inp.back() != llama_token_sep(model)) { - fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__); - fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); - } - } - - // tokenization stats - if (params.verbose_prompt) { - for (int i = 0; i < (int) inputs.size(); i++) { - fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); - for (int j = 0; j < (int) inputs[i].size(); j++) { - fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); - } - fprintf(stderr, "\n\n"); - } - } - - // initialize batch - const int n_prompts = prompts.size(); - struct llama_batch batch = llama_batch_init(n_batch, 0, 1); - - // count number of embeddings - int n_embd_count = 0; - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - for (int k = 0; k < n_prompts; k++) { - n_embd_count += inputs[k].size(); - } - } else { - n_embd_count = n_prompts; - } - - // allocate output - const int n_embd = llama_n_embd(model); - std::vector embeddings(n_embd_count * n_embd, 0); - float * emb = embeddings.data(); - - // break into batches - int e = 0; // number of embeddings already stored - int s = 0; // number of prompts in current batch - for (int k = 0; k < n_prompts; k++) { - // clamp to n_batch tokens - auto & inp = inputs[k]; - - const uint64_t n_toks = inp.size(); - - // encode if at capacity - if (batch.n_tokens + n_toks > n_batch) { - float * out = emb + e * n_embd; - batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s; - s = 0; - llama_batch_clear(batch); - } - - // add to batch - batch_add_seq(batch, inp, s); - s += 1; - } - - // final batch - float * out = emb + e * n_embd; - batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); - - if (params.embd_out.empty()) { - fprintf(stdout, "\n"); - - if (pooling_type == LLAMA_POOLING_TYPE_NONE) { - for (int j = 0; j < n_embd_count; j++) { - fprintf(stdout, "embedding %d: ", j); - for (int i = 0; i < std::min(3, n_embd); i++) { - if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); - } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); - } - } - fprintf(stdout, " ... "); - for (int i = n_embd - 3; i < n_embd; i++) { - if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); - } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); - } - } - fprintf(stdout, "\n"); - } - } else { - // print the first part of the embeddings or for a single prompt, the full embedding - for (int j = 0; j < n_prompts; j++) { - fprintf(stdout, "embedding %d: ", j); - for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { - if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); - } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); - } - } - fprintf(stdout, "\n"); - } - - // print cosine similarity matrix - if (n_prompts > 1) { - fprintf(stdout, "\n"); - printf("cosine similarity matrix:\n\n"); - for (int i = 0; i < n_prompts; i++) { - fprintf(stdout, "%6.6s ", prompts[i].c_str()); - } - fprintf(stdout, "\n"); - for (int i = 0; i < n_prompts; i++) { - for (int j = 0; j < n_prompts; j++) { - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f ", sim); - } - fprintf(stdout, "%1.10s", prompts[i].c_str()); - fprintf(stdout, "\n"); - } - } - } - } - - if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { - const bool notArray = params.embd_out != "array"; - - fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); - for (int j = 0;;) { // at least one iteration (one prompt) - if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); - fprintf(stdout, "["); - for (int i = 0;;) { // at least one iteration (n_embd > 0) - fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); - i++; - if (i < n_embd) fprintf(stdout, ","); else break; - } - fprintf(stdout, notArray ? "]\n }" : "]"); - j++; - if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break; - } - fprintf(stdout, notArray ? "\n ]" : "]\n"); - - if (params.embd_out == "json+" && n_prompts > 1) { - fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); - for (int i = 0;;) { // at least two iteration (n_embd_count > 1) - fprintf(stdout, " ["); - for (int j = 0;;) { // at least two iteration (n_embd_count > 1) - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f", sim); - j++; - if (j < n_embd_count) fprintf(stdout, ", "); else break; - } - fprintf(stdout, " ]"); - i++; - if (i < n_embd_count) fprintf(stdout, ",\n"); else break; - } - fprintf(stdout, "\n ]"); - } - - if (notArray) fprintf(stdout, "\n}\n"); - } - - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); - - // clean up - llama_batch_free(batch); - llama_free(ctx); - llama_free_model(model); - llama_backend_free(); - - return 0; -} diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 881111ffd..bc7203143 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include "ggml.h" @@ -144,8 +145,7 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 544e7fff6..ff324926a 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "ggml.h" #include "ggml-alloc.h" @@ -401,8 +402,7 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 8b1dafd63..b6d4725fd 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include @@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); gpt_params params; - auto options = gpt_params_parser_init(params, ex); + auto ctx_arg = gpt_params_parser_init(params, ex); file << "| Argument | Explanation |\n"; file << "| -------- | ----------- |\n"; - for (auto & opt : options) { + for (auto & opt : ctx_arg.options) { file << "| `"; // args for (const auto & arg : opt.args) { diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index e1efbf573..14c715202 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -121,7 +122,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_decode(ctx, bat); llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1); - llama_sampler_accept(smpl, token); if (token == eos_token) { break; @@ -154,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp deleted file mode 100644 index c310c229a..000000000 --- a/examples/imatrix/imatrix.cpp +++ /dev/null @@ -1,649 +0,0 @@ -#include "build-info.h" -#include "common.h" -#include "llama.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s \\\n" - " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" - " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" - " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); - LOG_TEE("\n"); -} - -struct Stats { - std::vector values; - std::vector counts; - int ncall = 0; -}; - -class IMatrixCollector { -public: - IMatrixCollector() = default; - void set_params(gpt_params params) { m_params = std::move(params); } - bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); - void save_imatrix(int ncall = -1) const; - bool load_imatrix(const char * file_name); -private: - std::unordered_map m_stats; - gpt_params m_params; - std::mutex m_mutex; - int m_last_call = 0; - std::vector m_src1_data; - std::vector m_ids; // the expert ids from ggml_mul_mat_id -}; - -// remove any prefix and suffixes from the name -// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight -static std::string filter_tensor_name(const char * name) { - std::string wname; - const char * p = strchr(name, '#'); - if (p != NULL) { - p = p + 1; - const char * q = strchr(p, '#'); - if (q != NULL) { - wname = std::string(p, q - p); - } else { - wname = p; - } - } else { - wname = name; - } - return wname; -} - -bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { - GGML_UNUSED(user_data); - - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - std::string wname = filter_tensor_name(src0->name); - - // when ask is true, the scheduler wants to know if we are interested in data from this tensor - // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection - if (ask) { - if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications - if (t->op != GGML_OP_MUL_MAT) return false; - // why are small batches ignored (<16 tokens)? - if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false; - return true; - } - - std::lock_guard lock(m_mutex); - - // copy the data from the GPU memory if needed - const bool is_host = ggml_backend_buffer_is_host(src1->buffer); - - if (!is_host) { - m_src1_data.resize(ggml_nelements(src1)); - ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1)); - } - - const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); - - // this has been adapted to the new format of storing merged experts in a single 3d tensor - // ref: https://github.com/ggerganov/llama.cpp/pull/6387 - if (t->op == GGML_OP_MUL_MAT_ID) { - // ids -> [n_experts_used, n_tokens] - // src1 -> [cols, n_expert_used, n_tokens] - const ggml_tensor * ids = t->src[2]; - const int n_as = src0->ne[2]; - const int n_ids = ids->ne[0]; - - // the top-k selected expert ids are stored in the ids tensor - // for simplicity, always copy ids to host, because it is small - // take into account that ids is not contiguous! - - GGML_ASSERT(ids->ne[1] == src1->ne[2]); - - m_ids.resize(ggml_nbytes(ids)); - ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); - - auto & e = m_stats[wname]; - - ++e.ncall; - - if (e.values.empty()) { - e.values.resize(src1->ne[0]*n_as, 0); - e.counts.resize(src1->ne[0]*n_as, 0); - } - else if (e.values.size() != (size_t)src1->ne[0]*n_as) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); - exit(1); //GGML_ABORT("fatal error"); - } - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); - } - // loop over all possible experts, regardless if they are used or not in the batch - for (int ex = 0; ex < n_as; ++ex) { - size_t e_start = ex*src1->ne[0]; - - for (int idx = 0; idx < n_ids; ++idx) { - for (int row = 0; row < (int)src1->ne[2]; ++row) { - const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]); - - GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check - - if (excur != ex) continue; - - const int64_t i11 = idx % src1->ne[1]; - const int64_t i12 = row; - const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]); - - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[e_start + j] += x[j]*x[j]; - e.counts[e_start + j]++; - if (!std::isfinite(e.values[e_start + j])) { - fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str()); - exit(1); - } - } - } - } - if (e.ncall > m_last_call) { - m_last_call = e.ncall; - if (m_last_call % m_params.n_out_freq == 0) { - save_imatrix(); - } - if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { - save_imatrix(m_last_call); - } - } - } - } else { - auto & e = m_stats[wname]; - if (e.values.empty()) { - e.values.resize(src1->ne[0], 0); - e.counts.resize(src1->ne[0], 0); - } - else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ABORT("fatal error"); - } - ++e.ncall; - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); - } - for (int row = 0; row < (int)src1->ne[1]; ++row) { - const float * x = data + row * src1->ne[0]; - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[j] += x[j]*x[j]; - e.counts[j]++; - if (!std::isfinite(e.values[j])) { - fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str()); - exit(1); - } - } - } - if (e.ncall > m_last_call) { - m_last_call = e.ncall; - if (m_last_call % m_params.n_out_freq == 0) { - save_imatrix(); - } - if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { - save_imatrix(m_last_call); - } - } - } - - return true; -} - -void IMatrixCollector::save_imatrix(int ncall) const { - auto fname = m_params.out_file; - if (fname.empty()) { - fname = "imatrix.dat"; - } - - if (ncall > 0) { - fname += ".at_"; - fname += std::to_string(ncall); - } - - // avoid writing imatrix entries that do not have full data - // this can happen with MoE models where some of the experts end up not being exercised by the provided training data - - int n_entries = 0; - std::vector to_store; - - bool is_first = true; // for printing - for (const auto & kv : m_stats) { - const int n_all = kv.second.counts.size(); - - if (n_all == 0) { - continue; - } - - int n_zeros = 0; - for (const int c : kv.second.counts) { - if (c == 0) { - n_zeros++; - } - } - - if (n_zeros != 0 && is_first) { - fprintf(stderr, "\n"); - is_first = false; - } - - if (n_zeros == n_all) { - fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); - continue; - } - - if (n_zeros > 0) { - fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); - continue; - } - - n_entries++; - to_store.push_back(kv.first); - } - - if (to_store.size() < m_stats.size()) { - fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); - } - - std::ofstream out(fname, std::ios::binary); - out.write((const char *) &n_entries, sizeof(n_entries)); - for (const auto & name : to_store) { - const auto & stat = m_stats.at(name); - int len = name.size(); - out.write((const char *) &len, sizeof(len)); - out.write(name.c_str(), len); - out.write((const char *) &stat.ncall, sizeof(stat.ncall)); - int nval = stat.values.size(); - out.write((const char *) &nval, sizeof(nval)); - if (nval > 0) { - std::vector tmp(nval); - for (int i = 0; i < nval; i++) { - tmp[i] = (stat.values[i] / static_cast(stat.counts[i])) * static_cast(stat.ncall); - } - out.write((const char*)tmp.data(), nval*sizeof(float)); - } - } - - // Write the number of call the matrix was computed with - out.write((const char *) &m_last_call, sizeof(m_last_call)); - - // Write the input filename at the end of the file to later on specify it in quantize - { - int len = m_params.prompt_file.size(); - out.write((const char *) &len, sizeof(len)); - out.write(m_params.prompt_file.c_str(), len); - } - - if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); - } -} - -bool IMatrixCollector::load_imatrix(const char * fname) { - std::ifstream in(fname, std::ios::binary); - if (!in) { - printf("%s: failed to open %s\n",__func__, fname); - return false; - } - int n_entries; - in.read((char*)&n_entries, sizeof(n_entries)); - if (in.fail() || n_entries < 1) { - printf("%s: no data in file %s\n", __func__, fname); - return false; - } - for (int i = 0; i < n_entries; ++i) { - int len; in.read((char *)&len, sizeof(len)); - std::vector name_as_vec(len+1); - in.read((char *)name_as_vec.data(), len); - if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); - return false; - } - name_as_vec[len] = 0; - std::string name{name_as_vec.data()}; - auto & e = m_stats[std::move(name)]; - int ncall; - in.read((char*)&ncall, sizeof(ncall)); - int nval; - in.read((char *)&nval, sizeof(nval)); - if (in.fail() || nval < 1) { - printf("%s: failed reading number of values for entry %d\n",__func__,i); - m_stats = {}; - return false; - } - - if (e.values.empty()) { - e.values.resize(nval, 0); - e.counts.resize(nval, 0); - } - - std::vector tmp(nval); - in.read((char*)tmp.data(), nval*sizeof(float)); - if (in.fail()) { - printf("%s: failed reading data for entry %d\n",__func__,i); - m_stats = {}; - return false; - } - - // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. - for (int i = 0; i < nval; i++) { - e.values[i] += tmp[i]; - e.counts[i] += ncall; - } - e.ncall += ncall; - - } - return true; -} - -static IMatrixCollector g_collector; - -static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { - return g_collector.collect_imatrix(t, ask, user_data); -} - - -struct results_log_softmax { - double log_softmax; - float logit; - float prob; -}; - -static std::vector softmax(const std::vector & logits) { - std::vector probs(logits.size()); - float max_logit = logits[0]; - for (float v : logits) { - max_logit = std::max(max_logit, v); - } - double sum_exp = 0.0; - for (size_t i = 0; i < logits.size(); i++) { - // Subtract the maximum logit value from the current logit value for numerical stability - const float logit = logits[i] - max_logit; - const float exp_logit = expf(logit); - sum_exp += exp_logit; - probs[i] = exp_logit; - } - for (size_t i = 0; i < probs.size(); i++) { - probs[i] /= sum_exp; - } - return probs; -} - -static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { - float max_logit = logits[0]; - for (int i = 1; i < n_vocab; ++i) { - max_logit = std::max(max_logit, logits[i]); - } - double sum_exp = 0.0; - for (int i = 0; i < n_vocab; ++i) { - sum_exp += expf(logits[i] - max_logit); - } - return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; -} - -static void process_logits( - int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, - double & nll, double & nll2, float * logit_history, float * prob_history) { - std::mutex mutex; - int counter = 0; - auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { - double local_nll = 0; - double local_nll2 = 0; - while (true) { - std::unique_lock lock(mutex); - int i = counter++; - if (i >= n_token) { - nll += local_nll; nll2 += local_nll2; - break; - } - lock.unlock(); - const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); - const double v = -results.log_softmax; - local_nll += v; - local_nll2 += v*v; - - logit_history[i] = results.logit; - prob_history[i] = results.prob; - } - }; - for (auto & w : workers) { - w = std::thread(compute); - } - compute(); - for (auto & w : workers) { - w.join(); - } -} - -static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); - const int n_ctx = llama_n_ctx(ctx); - - auto tim1 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - - std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); - - auto tim2 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); - - if (params.i_chunk > 0) { - if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { - fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); - return false; - } - fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); - tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); - } - - if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, - n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); - return false; - } - - std::vector logit_history; - std::vector prob_history; - - if (params.compute_ppl) { - logit_history.resize(tokens.size()); - prob_history.resize(tokens.size()); - } - - const int n_chunk_max = tokens.size() / n_ctx; - - const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - const int n_batch = params.n_batch; - - int count = 0; - double nll = 0.0; - double nll2 = 0.0; - - fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); - - std::vector workers(std::thread::hardware_concurrency() - 1); - - const int num_batches = (n_ctx + n_batch - 1) / n_batch; - - std::vector logits; - if (params.compute_ppl && num_batches > 1) { - logits.reserve((size_t)n_ctx * n_vocab); - } - - for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; - const int end = start + n_ctx; - - std::vector logits; - - const auto t_start = std::chrono::high_resolution_clock::now(); - - // clear the KV cache - llama_kv_cache_clear(ctx); - - for (int j = 0; j < num_batches; ++j) { - const int batch_start = start + j * n_batch; - const int batch_size = std::min(end - batch_start, n_batch); - - // save original token and restore it after eval - const auto token_org = tokens[batch_start]; - - // add BOS token for the first batch of each chunk - if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); - } - - // TODO: use batch.logits to save computations instead of relying on logits_all == true - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return false; - } - - // restore the original token in case it was set to BOS - tokens[batch_start] = token_org; - - if (params.compute_ppl && num_batches > 1) { - const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); - } - } - - const auto t_end = std::chrono::high_resolution_clock::now(); - - if (i == 0) { - const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); - int total_seconds = (int)(t_total * n_chunk); - if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); - total_seconds = total_seconds % (60*60); - } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); - } - - if (params.compute_ppl) { - const int first = n_ctx/2; - const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); - count += n_ctx - first - 1; - - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - fflush(stdout); - - logits.clear(); - } - } - printf("\n"); - - if (params.compute_ppl) { - nll2 /= count; - nll /= count; - const double ppl = exp(nll); - nll2 -= nll * nll; - if (nll2 > 0) { - nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); - } else { - printf("Unexpected negative standard deviation of log(prob)\n"); - } - } - - return true; -} - -int main(int argc, char ** argv) { - gpt_params params; - - params.n_ctx = 512; - params.logits_all = true; - params.verbosity = 1; - - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { - return 1; - } - - params.n_batch = std::min(params.n_batch, params.n_ctx); - - g_collector.set_params(params); - - for (const auto & in_file : params.in_files) { - printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); - if (!g_collector.load_imatrix(in_file.c_str())) { - fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); - return 1; - } - } - - if (params.in_files.size() > 1) { - printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); - g_collector.save_imatrix(); - } - - llama_backend_init(); - llama_numa_init(params.numa); - - // pass the callback to the backend scheduler - // it will be executed for each node during the graph computation - params.cb_eval = ik_collect_imatrix; - params.cb_eval_user_data = NULL; - params.warmup = false; - - // init - llama_init_result llama_init = llama_init_from_gpt_params(params); - - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; - } - - const int n_ctx_train = llama_n_ctx_train(model); - if (params.n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, params.n_ctx); - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); - } - - if (!compute_imatrix(ctx, params)) { - return 1; - } - - g_collector.save_imatrix(); - - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); - - llama_free(ctx); - llama_free_model(model); - - llama_backend_free(); - - return 0; -} diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 06ec160c2..f611809c6 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -414,8 +414,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( // sample the most likely token const auto new_token_id = llama_sampler_sample(sampler, context, -1); - llama_sampler_accept(sampler, new_token_id); - const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { return nullptr; diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 92f61fe83..dcd9803a2 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -152,8 +152,6 @@ actor LlamaContext { new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1) - llama_sampler_accept(sampling, new_token_id) - if llama_token_is_eog(model, new_token_id) || n_cur == n_len { print("\n") is_done = true diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 5845d0106..e9108a9bd 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -1,11 +1,12 @@ -#include "ggml.h" +#include "arg.h" +#include "base64.hpp" #include "log.h" #include "common.h" +#include "sampling.h" #include "clip.h" #include "llava.h" #include "llama.h" - -#include "base64.hpp" +#include "ggml.h" #include #include @@ -278,8 +279,7 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { return 1; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 57e7d42c5..3475bbce5 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -1,9 +1,11 @@ -#include "ggml.h" +#include "arg.h" #include "log.h" #include "common.h" +#include "sampling.h" #include "clip.h" #include "llava.h" #include "llama.h" +#include "ggml.h" #include #include @@ -253,8 +255,7 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) { return 1; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 5027a483a..de8b792f2 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -1,4 +1,6 @@ +#include "arg.h" #include "common.h" +#include "sampling.h" #include "llama.h" #include @@ -36,8 +38,7 @@ struct ngram_container { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 795b06c88..33287c02c 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -1,7 +1,8 @@ -#include "ggml.h" -#include "llama.h" +#include "arg.h" #include "common.h" #include "ngram-cache.h" +#include "ggml.h" +#include "llama.h" #include #include @@ -13,8 +14,7 @@ int main(int argc, char ** argv){ gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } @@ -40,4 +40,6 @@ int main(int argc, char ** argv){ fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); + + return 0; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 93299ef8b..f299d68a9 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -1,8 +1,9 @@ -#include "ggml.h" +#include "arg.h" #include "common.h" -#include "llama.h" #include "log.h" #include "ngram-cache.h" +#include "llama.h" +#include "ggml.h" #include #include @@ -15,8 +16,7 @@ int main(int argc, char ** argv){ gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 9ac7f6b47..fff44a499 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -1,7 +1,9 @@ +#include "arg.h" #include "ggml.h" -#include "llama.h" #include "common.h" #include "ngram-cache.h" +#include "sampling.h" +#include "llama.h" #include #include @@ -12,8 +14,7 @@ int main(int argc, char ** argv){ gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index f443360eb..be04899ad 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,6 +1,7 @@ +#include "arg.h" #include "common.h" - #include "console.h" +#include "sampling.h" #include "llama.h" #include "build-info.h" @@ -139,9 +140,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector -#include -#include -#include -#include - -// trim whitespace from the beginning and end of a string -static std::string trim(const std::string & str) { - size_t start = 0; - size_t end = str.size(); - - while (start < end && isspace(str[start])) { - start += 1; - } - - while (end > start && isspace(str[end - 1])) { - end -= 1; - } - - return str.substr(start, end - start); -} - -static std::string k_system = -R"(Transcript of a never ending dialog, where the User interacts with an Assistant. -The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision. - -User: Recommend a nice restaurant in the area. -Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays. -User: Who is Richard Feynman? -Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?". -User:)"; - -static std::vector k_prompts = { - "What is the meaning of life?", - "Tell me an interesting fact about llamas.", - "What is the best way to cook a steak?", - "Are you familiar with the Special Theory of Relativity and can you explain it to me?", - "Recommend some interesting books to read.", - "What is the best way to learn a new language?", - "How to get a job at Google?", - "If you could have any superpower, what would it be?", - "I want to learn how to play the piano.", -}; - -struct client { - ~client() { - if (smpl) { - gpt_sampler_free(smpl); - } - } - - int32_t id = 0; - - llama_seq_id seq_id = -1; - - llama_token sampled; - - int64_t t_start_prompt; - int64_t t_start_gen; - - int32_t n_prompt = 0; - int32_t n_decoded = 0; - int32_t i_batch = -1; - - std::string input; - std::string prompt; - std::string response; - - struct gpt_sampler * smpl = nullptr; -}; - -static void print_date_time() { - std::time_t current_time = std::time(nullptr); - std::tm* local_time = std::localtime(¤t_time); - char buffer[80]; - strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time); - - printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer); -} - -// Define a split string function to ... -static std::vector split_string(const std::string& input, char delimiter) { - std::vector tokens; - std::istringstream stream(input); - std::string token; - while (std::getline(stream, token, delimiter)) { - tokens.push_back(token); - } - return tokens; -} - -int main(int argc, char ** argv) { - srand(1234); - - gpt_params params; - - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { - return 1; - } - - // number of simultaneous "clients" to simulate - const int32_t n_clients = params.n_parallel; - - // dedicate one sequence to the system prompt - params.n_parallel += 1; - - // requests to simulate - const int32_t n_seq = params.n_sequences; - - // insert new requests as soon as the previous one is done - const bool cont_batching = params.cont_batching; - - const bool dump_kv_cache = params.dump_kv_cache; - -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("parallel", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - - // init llama.cpp - llama_backend_init(); - llama_numa_init(params.numa); - - // load the target model - llama_init_result llama_init = llama_init_from_gpt_params(params); - - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; - - // load the prompts from an external file if there are any - if (params.prompt.empty()) { - printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); - } else { - // Output each line of the input params.prompts vector and copy to k_prompts - int index = 0; - printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); - - std::vector prompts = split_string(params.prompt, '\n'); - for (const auto& prompt : prompts) { - k_prompts.resize(index + 1); - k_prompts[index] = prompt; - index++; - printf("%3d prompt: %s\n", index, prompt.c_str()); - } - } - - fprintf(stderr, "\n\n"); - fflush(stderr); - - const int n_ctx = llama_n_ctx(ctx); - - std::vector clients(n_clients); - for (size_t i = 0; i < clients.size(); ++i) { - auto & client = clients[i]; - client.id = i; - client.smpl = gpt_sampler_init(model, params.sparams); - } - - std::vector tokens_system; - tokens_system = ::llama_tokenize(ctx, k_system, true); - const int32_t n_tokens_system = tokens_system.size(); - - llama_seq_id g_seq_id = 0; - - // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple - // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time - llama_batch batch = llama_batch_init(n_ctx, 0, 1); - - int32_t n_total_prompt = 0; - int32_t n_total_gen = 0; - int32_t n_cache_miss = 0; - - struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients); - - const auto t_main_start = ggml_time_us(); - - LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); - LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); - LOG_TEE("\n"); - - { - LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); - - for (int32_t i = 0; i < n_tokens_system; ++i) { - llama_batch_add(batch, tokens_system[i], i, { 0 }, false); - } - - if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); - return 1; - } - - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); - } - - LOG_TEE("\n"); - } - - LOG_TEE("Processing requests ...\n\n"); - - while (true) { - if (dump_kv_cache) { - llama_kv_cache_view_update(ctx, &kvc_view); - llama_kv_cache_dump_view_seqs(kvc_view, 40); - } - - llama_batch_clear(batch); - - // decode any currently ongoing sequences - for (auto & client : clients) { - if (client.seq_id == -1) { - continue; - } - - client.i_batch = batch.n_tokens; - - llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); - - client.n_decoded += 1; - } - - if (batch.n_tokens == 0) { - // all sequences have ended - clear the entire KV cache - for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(ctx, i, -1, -1); - // but keep the system prompt - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); - } - - LOG_TEE("%s: clearing the KV cache\n", __func__); - } - - // insert new sequences for decoding - if (cont_batching || batch.n_tokens == 0) { - for (auto & client : clients) { - if (client.seq_id == -1 && g_seq_id < n_seq) { - client.seq_id = g_seq_id; - - client.t_start_prompt = ggml_time_us(); - client.t_start_gen = 0; - - client.input = k_prompts[rand() % k_prompts.size()]; - client.prompt = client.input + "\nAssistant:"; - client.response = ""; - - gpt_sampler_reset(client.smpl); - - // do not prepend BOS because we have a system prompt! - std::vector tokens_prompt; - tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); - - for (size_t i = 0; i < tokens_prompt.size(); ++i) { - llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); - } - - // extract the logits only for the last token - if (batch.n_tokens > 0) { - batch.logits[batch.n_tokens - 1] = true; - } - - client.n_prompt = tokens_prompt.size(); - client.n_decoded = 0; - client.i_batch = batch.n_tokens - 1; - - LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); - - g_seq_id += 1; - - // insert new requests one-by-one - //if (cont_batching) { - // break; - //} - } - } - } - - if (batch.n_tokens == 0) { - break; - } - - // process in chunks of params.n_batch - int32_t n_batch = params.n_batch; - - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { - // experiment: process in powers of 2 - //if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) { - // n_batch /= 2; - // i -= n_batch; - // continue; - //} - - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); - - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, 0, 0, // unused - }; - - const int ret = llama_decode(ctx, batch_view); - if (ret != 0) { - if (n_batch == 1 || ret < 0) { - // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); - return 1; - } - - LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); - - n_cache_miss += 1; - - // retry with half the batch size to try to find a free slot in the KV cache - n_batch /= 2; - i -= n_batch; - - continue; - } - - LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens); - - for (auto & client : clients) { - if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { - continue; - } - - //printf("client %d, seq %d, token %d, pos %d, batch %d\n", - // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - - const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i); - - gpt_sampler_accept(client.smpl, id, true); - - if (client.n_decoded == 1) { - // start measuring generation time after the first token to make sure all concurrent clients - // have their prompt already processed - client.t_start_gen = ggml_time_us(); - } - - const std::string token_str = llama_token_to_piece(ctx, id); - - client.response += token_str; - client.sampled = id; - - //printf("client %d, seq %d, token %d, pos %d, batch %d: %s\n", - // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); - - if (client.n_decoded > 2 && - (llama_token_is_eog(model, id) || - (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) || - client.response.find("User:") != std::string::npos || - client.response.find('\n') != std::string::npos)) { - // basic reverse prompt - const size_t pos = client.response.find("User:"); - if (pos != std::string::npos) { - client.response = client.response.substr(0, pos); - } - - // delete only the generated part of the sequence, i.e. keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); - - const auto t_main_end = ggml_time_us(); - - LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n", - client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded, - (t_main_end - client.t_start_prompt) / 1e6, - (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6, - n_cache_miss, - ::trim(client.input).c_str(), - ::trim(client.response).c_str()); - - n_total_prompt += client.n_prompt; - n_total_gen += client.n_decoded; - - client.seq_id = -1; - } - - client.i_batch = -1; - } - } - } - - const auto t_main_end = ggml_time_us(); - - print_date_time(); - - LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); - if (params.prompt_file.empty()) { - params.prompt_file = "used built-in defaults"; - } - LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); - LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); - - LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Cache misses: %6d\n", n_cache_miss); - - LOG_TEE("\n"); - - // TODO: print sampling/grammar timings for all clients - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); - - llama_batch_free(batch); - - llama_free(ctx); - llama_free_model(model); - - llama_backend_free(); - - fprintf(stderr, "\n\n"); - - return 0; -} diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 76d235c2c..d3d5ab46f 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -19,8 +20,7 @@ int main(int argc, char ** argv) { params.n_keep = 32; params.i_pos = -1; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { return 1; } @@ -220,8 +220,6 @@ int main(int argc, char ** argv) { { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); - llama_sampler_accept(smpl, new_token_id); - // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { LOG_TEE("\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp deleted file mode 100644 index 420dbb813..000000000 --- a/examples/perplexity/perplexity.cpp +++ /dev/null @@ -1,2062 +0,0 @@ -#include "build-info.h" -#include "common.h" -#include "llama.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -struct results_perplexity { - std::vector tokens; - double ppl_value; - std::vector logits; - std::vector probs; -}; - -struct results_log_softmax { - double log_softmax; - float logit; - float prob; -}; - -static void write_logfile( - const llama_context * ctx, const gpt_params & params, const llama_model * model, - const struct results_perplexity & results -) { - if (params.logdir.empty()) { - return; - } - - if (params.hellaswag) { - fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__); - return; - } - - const std::string timestamp = string_get_sortable_timestamp(); - - const bool success = fs_create_directory_with_parents(params.logdir); - if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); - return; - } - - const std::string logfile_path = params.logdir + timestamp + ".yml"; - FILE * logfile = fopen(logfile_path.c_str(), "w"); - - if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); - return; - } - - fprintf(logfile, "binary: main\n"); - char model_desc[128]; - llama_model_desc(model, model_desc, sizeof(model_desc)); - yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc); - - fprintf(logfile, "\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "# Perplexity Results #\n"); - fprintf(logfile, "######################\n"); - fprintf(logfile, "\n"); - - yaml_dump_vector_float(logfile, "logits", results.logits); - fprintf(logfile, "ppl_value: %f\n", results.ppl_value); - yaml_dump_vector_float(logfile, "probs", results.probs); - - llama_perf_dump_yaml(logfile, ctx); - fclose(logfile); -} - -static std::vector softmax(const std::vector& logits) { - std::vector probs(logits.size()); - float max_logit = logits[0]; - for (float v : logits) { - max_logit = std::max(max_logit, v); - } - double sum_exp = 0.0; - for (size_t i = 0; i < logits.size(); i++) { - // Subtract the maximum logit value from the current logit value for numerical stability - const float logit = logits[i] - max_logit; - const float exp_logit = expf(logit); - sum_exp += exp_logit; - probs[i] = exp_logit; - } - for (size_t i = 0; i < probs.size(); i++) { - probs[i] /= sum_exp; - } - return probs; -} - -static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { - float max_logit = logits[0]; - for (int i = 1; i < n_vocab; ++i) { - max_logit = std::max(max_logit, logits[i]); - } - double sum_exp = 0.0; - for (int i = 0; i < n_vocab; ++i) { - sum_exp += expf(logits[i] - max_logit); - } - return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; -} - -static inline int nearest_int(float fval) { - //assert(fval <= 4194303.f); - float val = fval + 12582912.f; - int i; memcpy(&i, &val, sizeof(int)); - return (i & 0x007fffff) - 0x00400000; -} - -static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) { - float max_logit = logits[0]; - float min_logit = logits[0]; - for (int i = 1; i < n_vocab; ++i) { - max_logit = std::max(max_logit, logits[i]); - min_logit = std::min(min_logit, logits[i]); - } - min_logit = std::max(min_logit, max_logit - 16); - double sum_exp = 0.0; - for (int i = 0; i < n_vocab; ++i) { - sum_exp += expf(logits[i] - max_logit); - } - const float log_sum_exp = log(sum_exp); - const float min_log_prob = min_logit - max_logit - log_sum_exp; - const float scale = (max_logit - min_logit)/65535.f; - float * d = (float *)log_prob; - d[0] = scale; - d[1] = min_log_prob; - log_prob += 4; - if (scale) { - const float inv_scale = 1/scale; - for (int i = 0; i < n_vocab; ++i) { - log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0; - } - } else { - std::memset(log_prob, 0, n_vocab*sizeof(uint16_t)); - } - return max_logit + log_sum_exp - logits[tok]; -} - -static void process_logits( - int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, - double & nll, double & nll2, float * logit_history, float * prob_history -) { - std::mutex mutex; - int counter = 0; - auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { - double local_nll = 0; - double local_nll2 = 0; - while (true) { - std::unique_lock lock(mutex); - int i = counter++; - if (i >= n_token) { - nll += local_nll; nll2 += local_nll2; - break; - } - lock.unlock(); - const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); - const double v = -results.log_softmax; - local_nll += v; - local_nll2 += v*v; - - logit_history[i] = results.logit; - prob_history[i] = results.prob; - } - }; - for (auto & w : workers) { - w = std::thread(compute); - } - compute(); - for (auto & w : workers) { - w.join(); - } -} - -static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token, - std::vector & workers, std::vector & log_probs, double & nll, double & nll2) { - std::mutex mutex; - const int nv = 2*((n_vocab + 1)/2) + 4; - int counter = 0; - auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () { - double local_nll = 0; - double local_nll2 = 0; - while (true) { - std::unique_lock lock(mutex); - int i = counter++; - if (i >= n_token) { - nll += local_nll; nll2 += local_nll2; - break; - } - lock.unlock(); - const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]); - local_nll += v; - local_nll2 += v*v; - } - }; - for (auto & w : workers) { - w = std::thread(compute); - } - compute(); - for (auto & w : workers) { - w.join(); - } - out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t)); -} - -struct kl_divergence_result { - double sum_nll = 0.0; - double sum_nll2 = 0.0; - double sum_nll_base = 0.0; - double sum_nll_base2 = 0.0; - double sum_nll_nll_base = 0.0; - double sum_kld = 0.0; - double sum_kld2 = 0.0; - double sum_p_diff = 0.0; - double sum_p_diff2 = 0.0; - double sum_p_diff4 = 0.0; - float max_p_diff = 0.0f; - size_t n_same_top = 0.0; - size_t count = 0.0; -}; - -static std::pair log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) { - float max_logit = logits[0]; - int imax = 0; - for (int i = 1; i < n_vocab; ++i) { - if (logits[i] > max_logit) { - max_logit = logits[i]; - imax = i; - } - } - double sum_exp = 0.0; - for (int i = 0; i < n_vocab; ++i) { - sum_exp += expf(logits[i] - max_logit); - } - const float log_sum_exp = log(sum_exp); - const float * d = (const float *)base_log_prob; - const float scale = d[0]; - const float min_log_prob = d[1]; - base_log_prob += 4; - - const float nll = max_logit + log_sum_exp - logits[tok]; - kld.sum_nll += nll; - kld.sum_nll2 += nll*nll; - - const float nll_base = -(scale*base_log_prob[tok] + min_log_prob); - kld.sum_nll_base += nll_base; - kld.sum_nll_base2 += nll_base*nll_base; - - kld.sum_nll_nll_base += nll*nll_base; - - max_logit += log_sum_exp; - double sum = 0; - int imax_base = -1; - float p_log_base_max = 0; - for (int i = 0; i < n_vocab; ++i) { - const float p_log_base = scale*base_log_prob[i] + min_log_prob; - if (i == 0 || p_log_base > p_log_base_max) { - p_log_base_max = p_log_base; - imax_base = i; - } - if (p_log_base > -16.f) { - const float p_base = expf(p_log_base); - sum += p_base * (p_log_base - logits[i] + max_logit); - } - } - kld.sum_kld += sum; - kld.sum_kld2 += sum*sum; - ++kld.count; - if (imax == imax_base) ++kld.n_same_top; - - const float p_base = expf(-nll_base); - const float p = expf(-nll); - const float p_diff = p - p_base; - kld.sum_p_diff += p_diff; - const double p_diff2 = p_diff*p_diff; - kld.sum_p_diff2 += p_diff2; - kld.sum_p_diff4 += p_diff2*p_diff2; - kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff)); - - return std::make_pair(sum, p_diff); -} - -static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, - std::vector & workers, const std::vector & base_log_probs, kl_divergence_result & kld, - float * kld_values, float * p_diff_values) { - std::mutex mutex; - const int nv = 2*((n_vocab + 1)/2) + 4; - int counter = 0; - auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () { - kl_divergence_result local_kld; - while (true) { - std::unique_lock lock(mutex); - int i = counter++; - if (i >= n_token) { - kld.sum_nll += local_kld.sum_nll; - kld.sum_nll2 += local_kld.sum_nll2; - kld.sum_nll_base += local_kld.sum_nll_base; - kld.sum_nll_base2 += local_kld.sum_nll_base2; - kld.sum_nll_nll_base += local_kld.sum_nll_nll_base; - kld.sum_kld += local_kld.sum_kld; - kld.sum_kld2 += local_kld.sum_kld2; - kld.sum_p_diff += local_kld.sum_p_diff; - kld.sum_p_diff2 += local_kld.sum_p_diff2; - kld.sum_p_diff4 += local_kld.sum_p_diff4; - kld.n_same_top += local_kld.n_same_top; - kld.max_p_diff = std::max(kld.max_p_diff, local_kld.max_p_diff); - kld.count += local_kld.count; - break; - } - lock.unlock(); - std::pair v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); - kld_values[i] = (float)v.first; - p_diff_values[i] = v.second; - } - }; - for (auto & w : workers) { - w = std::thread(compute); - } - compute(); - for (auto & w : workers) { - w.join(); - } -} - -static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) { - // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip - // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` - // Output: `perplexity: 13.5106 [114/114]` - // BOS tokens will be added for each chunk before eval - - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); - - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - - std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); - - const int n_ctx = llama_n_ctx(ctx); - - if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, - n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); - return {std::move(tokens), 0., {}, {}}; - } - - std::vector logit_history; - std::vector prob_history; - - logit_history.resize(tokens.size()); - prob_history.resize(tokens.size()); - - if (params.ppl_stride <= 0) { - fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); - return {tokens, -1, logit_history, prob_history}; - } - - const int calc_chunk = n_ctx; - - fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); - - if (int(tokens.size()) <= calc_chunk) { - fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, - tokens.size(), n_ctx, params.ppl_stride); - return {tokens, -1, logit_history, prob_history}; - } - - const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride; - - const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - const int n_batch = params.n_batch; - - int count = 0; - double nll = 0.0; - - fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); - - for (int i = 0; i < n_chunk; ++i) { - const int start = i * params.ppl_stride; - const int end = start + calc_chunk; - - const int num_batches = (calc_chunk + n_batch - 1) / n_batch; - //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); - - std::vector logits; - - const auto t_start = std::chrono::high_resolution_clock::now(); - - // clear the KV cache - llama_kv_cache_clear(ctx); - - for (int j = 0; j < num_batches; ++j) { - const int batch_start = start + j * n_batch; - const int batch_size = std::min(end - batch_start, n_batch); - - //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); - // TODO: use llama_batch.logits instead of relying on logits_all == true - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - //fprintf(stderr, "%s : failed to eval\n", __func__); - return {tokens, -1, logit_history, prob_history}; - } - - // save original token and restore it after eval - const auto token_org = tokens[batch_start]; - - // add BOS token for the first batch of each chunk - if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); - } - - const auto batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); - - if (j == 0) { - tokens[batch_start] = token_org; - } - } - - const auto t_end = std::chrono::high_resolution_clock::now(); - - if (i == 0) { - const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); - int total_seconds = (int)(t_total * n_chunk); - if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); - total_seconds = total_seconds % (60*60); - } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); - } - - //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); - for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { - - // Calculate probability of next token, given the previous ones. - const std::vector tok_logits( - logits.begin() + (j + 0) * n_vocab, - logits.begin() + (j + 1) * n_vocab); - - const float prob = softmax(tok_logits)[tokens[start + j + 1]]; - logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]]; - prob_history[start + j + 1] = prob; - - nll += -std::log(prob); - ++count; - } - // perplexity is e^(average negative log-likelihood) - if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - } else { - printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); - } - fflush(stdout); - } - printf("\n"); - - return {tokens, std::exp(nll / count), logit_history, prob_history}; -} - -static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) { - if (params.ppl_stride > 0) { - return perplexity_v2(ctx, params); - } - - // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip - // Run `./llama-perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` - // Output: `perplexity: 13.5106 [114/114]` - // BOS tokens will be added for each chunk before eval - - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); - - std::ofstream logits_stream; - if (!params.logits_file.empty()) { - logits_stream.open(params.logits_file.c_str(), std::ios::binary); - if (!logits_stream.is_open()) { - fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); - return {}; - } - fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); - logits_stream.write("_logits_", 8); - logits_stream.write(reinterpret_cast(&n_ctx), sizeof(n_ctx)); - } - - auto tim1 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - - std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); - - auto tim2 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); - - if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, - n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); - return {std::move(tokens), 0., {}, {}}; - } - - std::vector logit_history; - logit_history.resize(tokens.size()); - - std::vector prob_history; - prob_history.resize(tokens.size()); - - const int n_chunk_max = tokens.size() / n_ctx; - - const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - const int n_batch = params.n_batch; - - int count = 0; - double nll = 0.0; - double nll2 = 0.0; - - const int num_batches = (n_ctx + n_batch - 1) / n_batch; - const int n_seq = std::max(1, n_batch / n_ctx); - - GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0); - GGML_ASSERT(params.n_ctx == n_seq * n_ctx); - - llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1); - - std::vector logits; - if (num_batches > 1) { - logits.reserve((size_t)n_ctx * n_vocab); - } - - fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); - - std::vector workers(std::thread::hardware_concurrency() - 1); - - std::vector log_probs; - if (!params.logits_file.empty()) { - logits_stream.write((const char *)&n_vocab, sizeof(n_vocab)); - logits_stream.write((const char *)&n_chunk, sizeof(n_chunk)); - logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0])); - const int nv = 2*((n_vocab + 1)/2) + 4; - log_probs.resize(n_ctx * nv); - } - - // We get the logits for all the tokens in the context window (params.n_ctx) - // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, - // calculate the perplexity over the last half of the window (so the model always has - // some context to predict the token). - // - // We rely on the fact that attention in the forward pass only looks at previous - // tokens here, so the logits returned for each token are an accurate representation - // of what the model would have predicted at that point. - // - // Example, we have a context window of 512, we will compute perplexity for each of the - // last 256 tokens. Then, we split the input up into context window size chunks to - // process the entire prompt. - const int first = n_ctx/2; - - for (int i = 0; i < n_chunk; i += n_seq) { - const int start = i * n_ctx; - const int end = start + n_ctx; - - const int n_seq_batch = std::min(n_seq, n_chunk - i); - - const auto t_start = std::chrono::high_resolution_clock::now(); - - // clear the KV cache - llama_kv_cache_clear(ctx); - - for (int j = 0; j < num_batches; ++j) { - const int batch_start = start + j * n_batch; - const int batch_size = std::min(end - batch_start, n_batch); - - int n_outputs = 0; - - batch.n_tokens = 0; - for (int seq = 0; seq < n_seq_batch; seq++) { - int seq_start = batch_start + seq*n_ctx; - - // save original token and restore it after eval - const auto token_org = tokens[seq_start]; - - // add BOS token for the first batch of each chunk - if (add_bos && j == 0) { - tokens[seq_start] = llama_token_bos(llama_get_model(ctx)); - } - - for (int k = 0; k < batch_size; ++k) { - const int idx = seq*n_ctx + k; - batch.token [idx] = tokens[seq_start + k]; - batch.pos [idx] = j*n_batch + k; - batch.n_seq_id[idx] = 1; - batch.seq_id [idx][0] = seq; - batch.logits [idx] = batch.pos[idx] >= first ? 1 : 0; - - n_outputs += batch.logits[idx] != 0; - } - batch.n_tokens += batch_size; - - // restore the original token in case it was set to BOS - tokens[seq_start] = token_org; - } - - if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return {tokens, -1, logit_history, prob_history}; - } - - if (num_batches > 1 && n_outputs > 0) { - const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab); - } - } - - - if (i == 0) { - llama_synchronize(ctx); - const auto t_end = std::chrono::high_resolution_clock::now(); - const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); - int total_seconds = (int)(t_total*n_chunk/n_seq); - if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); - total_seconds = total_seconds % (60*60); - } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); - } - - for (int seq = 0; seq < n_seq_batch; seq++) { - const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first); - - llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first; - if (!params.logits_file.empty()) { - process_logits(logits_stream, n_vocab, all_logits, - tokens_data, n_ctx - 1 - first, - workers, log_probs, nll, nll2); - } else { - process_logits(n_vocab, all_logits, - tokens_data, n_ctx - 1 - first, - workers, nll, nll2, - logit_history.data() + start + seq*n_ctx + first, - prob_history.data() + start + seq*n_ctx + first); - } - count += n_ctx - first - 1; - - // perplexity is e^(average negative log-likelihood) - if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); - } else { - double av = nll/count; - double av2 = nll2/count - av*av; - if (av2 > 0) av2 = sqrt(av2/(count-1)); - printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); - } - } - fflush(stdout); - - logits.clear(); - } - printf("\n"); - - nll2 /= count; - nll /= count; - const double ppl = exp(nll); - nll2 -= nll * nll; - if (nll2 > 0) { - nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); - } else { - printf("Unexpected negative standard deviation of log(prob)\n"); - } - - llama_batch_free(batch); - - return {tokens, ppl, logit_history, prob_history}; -} - -static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int32_t n_batch, int32_t n_vocab) { - int prev_outputs = 0; - for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); - - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, 0, 0, // unused - }; - - const int ret = llama_decode(ctx, batch_view); - if (ret != 0) { - LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); - return false; - } - - int n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - n_outputs += batch_view.logits[i] != 0; - } - - memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float)); - - prev_outputs += n_outputs; - } - - return true; -} - -#define K_TOKEN_CHUNK 4 - -static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector& workers, - const std::vector>& eval_pairs, std::vector& eval_results) { - if (eval_results.size() != eval_pairs.size()) { - eval_results.resize(eval_pairs.size()); - } - if (eval_pairs.empty()) return; - - size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size()); - - std::atomic counter(0); - auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () { - float local_logprobs[K_TOKEN_CHUNK]; - while (true) { - size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed); - if (first >= eval_results.size()) break; - size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size()); - for (size_t i = first; i < last; ++i) { - auto logits = batch_logits + eval_pairs[i].first * n_vocab; - float max_logit = logits[0]; - for (int j = 1; j < n_vocab; ++j) { - max_logit = std::max(max_logit, logits[j]); - } - float sum_p = 0.f; - for (int j = 0; j < n_vocab; ++j) { - sum_p += expf(logits[j] - max_logit); - } - local_logprobs[i - first] = logits[eval_pairs[i].second] - max_logit - std::log(sum_p); - } - std::memcpy(eval_results.data() + first, local_logprobs, (last - first)*sizeof(float)); - } - }; - - for (size_t it = 0; it < max_threads; ++it) { - workers[it] = std::thread(compute); - } - for (size_t it = 0; it < max_threads; ++it) { - workers[it].join(); - } -} - -static void hellaswag_score(llama_context * ctx, const gpt_params & params) { - // Calculates hellaswag score (acc_norm) from prompt - // - // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl - // All used data fields are preprocessed as in https://github.com/EleutherAI/lm-evaluation-harness/blob/df3da98c5405deafd519c2ddca52bb7c3fe36bef/lm_eval/tasks/hellaswag.py#L62-L68 - // - // All 10042 tasks should be extracted to keep the results standardized like other implementations. - // - // Datafile layout: - // ['??'] denotes json fields - // 6 lines per task: - // ['activity_label'] + ": " +['ctx'] - The first part of the query, the context - // ['label'] - The index the best common sense ending aka gold ending - // ['endings'][0] - Endings added to the first part of the query - // ['endings'][1] - // ['endings'][2] - // ['endings'][3] - - std::vector prompt_lines; - std::istringstream strstream(params.prompt); - std::string line; - - while (std::getline(strstream,line,'\n')) { - prompt_lines.push_back(line); - } - - if (prompt_lines.size() % 6 != 0) { - fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__); - return; - } - - size_t hs_task_count = prompt_lines.size()/6; - fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); - - const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; - fprintf(stderr, "================================= is_spm = %d\n", is_spm); - - // The tasks should be randomized so the score stabilizes quickly. - bool randomize_tasks = true; - - // Number of tasks to use when computing the score - if (params.hellaswag_tasks < hs_task_count) { - hs_task_count = params.hellaswag_tasks; - } - - // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now - std::mt19937 rng(1); - - // Dataholder for hellaswag tasks - struct hs_data_t { - std::string context; - size_t gold_ending_idx; - std::string ending[4]; - size_t ending_logprob_count[4]; - double ending_logprob[4]; - - size_t i_logits; // starting index of logits in the llama_batch - size_t common_prefix; // max number of initial tokens that are the same in all sentences - size_t required_tokens; // needed number of tokens to evaluate all 4 endings - std::vector seq_tokens[4]; - }; - - fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); - - // Select and read data from prompt lines - std::vector hs_data(hs_task_count); - for (size_t i = 0; i < hs_task_count; i++) { - size_t idx = i; - - auto & hs_cur = hs_data[i]; - - // Select a random example of those left in the prompt - if (randomize_tasks) { - std::uniform_int_distribution dist(0, prompt_lines.size()/6-1 ) ; - idx = dist(rng); - } - - hs_cur.context = prompt_lines[idx*6]; - hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); - for (size_t j = 0; j < 4; j++) { - hs_cur.ending[j] = prompt_lines[idx*6+2+j]; - hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true); - } - - // determine the common prefix of the endings - hs_cur.common_prefix = 0; - for (size_t k = 0; k < hs_cur.seq_tokens[0].size(); k++) { - if (hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[1][k] || - hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[2][k] || - hs_cur.seq_tokens[0][k] != hs_cur.seq_tokens[3][k]) { - break; - } - hs_cur.common_prefix++; - } - hs_cur.required_tokens = hs_cur.common_prefix + - hs_cur.seq_tokens[0].size() - hs_cur.common_prefix + - hs_cur.seq_tokens[1].size() - hs_cur.common_prefix + - hs_cur.seq_tokens[2].size() - hs_cur.common_prefix + - hs_cur.seq_tokens[3].size() - hs_cur.common_prefix; - - //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size()); - - // Delete the selected random example from the prompt - if (randomize_tasks) { - prompt_lines.erase( std::next(prompt_lines.begin(),idx*6) , std::next(prompt_lines.begin(),idx*6+6) ); - } - } - - fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__); - - printf("\ntask\tacc_norm\n"); - - double acc = 0.0f; - - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - const int n_ctx = llama_n_ctx(ctx); - const int n_batch = params.n_batch; - - const int max_tasks_per_batch = 32; - const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); - - llama_batch batch = llama_batch_init(n_ctx, 0, 4); - - std::vector tok_logits(n_vocab); - // TODO: this could be made smaller; it's currently the worst-case size - std::vector batch_logits(n_vocab*n_ctx); - - std::vector> eval_pairs; - std::vector eval_results; - std::vector workers(std::thread::hardware_concurrency()); - - for (size_t i0 = 0; i0 < hs_task_count; i0++) { - int n_cur = 0; - - size_t i1 = i0; - size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch - - llama_batch_clear(batch); - - // batch as much tasks as possible into the available context - // each task has 4 unique sequence ids - one for each ending - // the common prefix is shared among the 4 sequences to save tokens - // we extract logits only from the last common token and from all ending tokens of each sequence - while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) { - auto & hs_cur = hs_data[i1]; - int n_logits = 0; - - const int s0 = 4*(i1 - i0); - if (s0 + 4 > max_seq) { - break; - } - - for (size_t i = 0; i < hs_cur.common_prefix; ++i) { - llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); - } - batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix - n_logits += 1; - - for (int s = 0; s < 4; ++s) { - const size_t seq_tokens_size = hs_cur.seq_tokens[s].size(); - // TODO: don't evaluate the last token of each sequence - for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) { - const bool needs_logits = i < seq_tokens_size - 1; - llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); - n_logits += needs_logits; - } - } - - hs_cur.i_logits = i_logits; - i_logits += n_logits; - - n_cur += hs_data[i1].required_tokens; - if (++i1 == hs_task_count) { - break; - } - } - - if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); - return; - } - - llama_kv_cache_clear(ctx); - - // decode all tasks [i0, i1) - if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); - return; - } - - // Compute log-probs in parallel - // First we collect all tasks - eval_pairs.clear(); - for (size_t i = i0; i < i1; ++i) { - auto & hs_cur = hs_data[i]; - size_t li = 1; // skip the last logit of the common prefix (computed separately below) - for (int s = 0; s < 4; ++s) { - for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) { - eval_pairs.emplace_back(hs_cur.i_logits + li++, hs_cur.seq_tokens[s][j + 1]); - } - } - } - // Then we do the actual calculation - compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results); - - size_t ir = 0; - - // compute the logprobs for each ending of the decoded tasks - for (size_t i = i0; i < i1; ++i) { - auto & hs_cur = hs_data[i]; - - // get the logits of the last token of the common prefix - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float)); - - const auto first_probs = softmax(tok_logits); - - for (int s = 0; s < 4; ++s) { - hs_cur.ending_logprob_count[s] = 1; - hs_cur.ending_logprob[s] = std::log(first_probs[hs_cur.seq_tokens[s][hs_cur.common_prefix]]); - for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) { - hs_cur.ending_logprob[s] += eval_results[ir++]; - hs_cur.ending_logprob_count[s]++; - } - hs_cur.ending_logprob[s] /= hs_cur.ending_logprob_count[s]; - } - - // Find the ending with maximum logprob - size_t ending_logprob_max_idx = 0; - double ending_logprob_max_val = hs_cur.ending_logprob[0]; - for (size_t s = 1; s < 4; s++) { - if (hs_cur.ending_logprob[s] > ending_logprob_max_val) { - ending_logprob_max_idx = s; - ending_logprob_max_val = hs_cur.ending_logprob[s]; - } - } - - //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); - - // If the gold ending got the maximum logprobe add one accuracy point - if (ending_logprob_max_idx == hs_cur.gold_ending_idx) { - acc += 1.0; - } - - // Print the accumulated accuracy mean x 100 - printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); - fflush(stdout); - } - - i0 = i1 - 1; - } - - llama_batch_free(batch); - - printf("\n"); -} - -struct winogrande_entry { - std::string first; - std::string second; - std::array choices; - int answer; - - size_t i_logits; - size_t common_prefix; - size_t required_tokens; - size_t n_base1; // number of tokens for context + choice 1 - size_t n_base2; // number of tokens for context + choice 2 - std::vector seq_tokens[2]; -}; - -static std::vector load_winogrande_from_csv(const std::string & prompt) { - std::vector result; - std::istringstream in(prompt); - std::string line; - std::array comma_pos; - while (true) { - std::getline(in, line); - if (in.fail() || in.eof()) break; - int ipos = 0; - bool quote_open = false; - for (int i = 0; i < int(line.size()); ++i) { - if (!quote_open) { - if (line[i] == ',') { - comma_pos[ipos++] = i; - if (ipos == 4) break; - } - else if (line[i] == '"') { - quote_open = true; - } - } - else { - if (line[i] == '"') { - quote_open = false; - } - } - } - if (ipos != 4) { - printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); - continue; - } - auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3) - : line.substr(comma_pos[0]+1, comma_pos[1] - comma_pos[0] - 1); - auto choice1 = line.substr(comma_pos[1]+1, comma_pos[2] - comma_pos[1] - 1); - auto choice2 = line.substr(comma_pos[2]+1, comma_pos[3] - comma_pos[2] - 1); - auto answer = line.substr(comma_pos[3]+1, line.size() - comma_pos[3] - 1); - auto index = line.substr(0, comma_pos[0]); - int where = 0; - for ( ; where < int(sentence.size()); ++where) { - if (sentence[where] == '_') break; - } - if (where == int(sentence.size())) { - printf("%s: no _ in <%s>\n", __func__, sentence.c_str()); - continue; - } - std::istringstream stream(answer.c_str()); - int i_answer; stream >> i_answer; - if (stream.fail() || i_answer < 1 || i_answer > 2) { - printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); - continue; - } - result.emplace_back(); - auto& wg = result.back(); - wg.first = sentence.substr(0, where); - wg.second = sentence.substr(where + 1, sentence.size() - where - 1); - wg.choices[0] = std::move(choice1); - wg.choices[1] = std::move(choice2); - wg.answer = i_answer; - } - return result; -} - -/* - * Evaluates the Winogrande score. - * Uses a CSV containing task index, dentence, choice 1, choice 2, answer (1 or 2) - * You can get one such dataset from e.g. https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp - * As an example, the 1st row in the above dataset is - * - * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2 - * - */ -static void winogrande_score(llama_context * ctx, const gpt_params & params) { - - constexpr int k_min_trailing_ctx = 3; - - auto data = load_winogrande_from_csv(params.prompt); - if (data.empty()) { - fprintf(stderr, "%s: no tasks\n", __func__); - return; - } - - fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size()); - - if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) { - fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); - std::mt19937 rng(1); - std::vector aux(data.size()); - for (int i = 0; i < int(data.size()); ++i) { - aux[i] = i; - } - float scale = 1/(1.f + (float)rng.max()); - std::vector selected; - selected.resize(params.winogrande_tasks); - for (int i = 0; i < int(params.winogrande_tasks); ++i) { - int j = int(scale*rng()*aux.size()); - selected[i] = std::move(data[aux[j]]); - aux[j] = aux.back(); - aux.pop_back(); - } - data = std::move(selected); - } - - fprintf(stderr, "%s : tokenizing selected tasks\n", __func__); - - for (auto & task : data) { - task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); - task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true); - - task.common_prefix = 0; - for (size_t k = 0; k < task.seq_tokens[0].size(); k++) { - if (task.seq_tokens[0][k] != task.seq_tokens[1][k]) { - break; - } - task.common_prefix++; - } - - // TODO: the last token of each of the sequences don't need to be evaluated - task.required_tokens = task.common_prefix + - task.seq_tokens[0].size() - task.common_prefix + - task.seq_tokens[1].size() - task.common_prefix; - - task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size(); - task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); - } - - fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__); - - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - const int n_ctx = llama_n_ctx(ctx); - const int n_batch = params.n_batch; - - const int max_tasks_per_batch = 128; - const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); - - llama_batch batch = llama_batch_init(n_ctx, 0, 2); - - std::vector tok_logits(n_vocab); - // TODO: this could be made smaller; it's currently the worst-case size - std::vector batch_logits(n_vocab*n_ctx); - - std::vector> eval_pairs; - std::vector eval_results; - std::vector workers(std::thread::hardware_concurrency()); - - int n_correct = 0; - int n_done = 0; - - for (size_t i0 = 0; i0 < data.size(); i0++) { - int n_cur = 0; - - size_t i1 = i0; - size_t i_logits = 0; - - llama_batch_clear(batch); - - while (n_cur + (int) data[i1].required_tokens <= n_ctx) { - int n_logits = 0; - const int s0 = 2*(i1 - i0); - if (s0 + 2 > max_seq) { - break; - } - - for (size_t i = 0; i < data[i1].common_prefix; ++i) { - llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); - } - batch.logits[batch.n_tokens - 1] = true; - n_logits += 1; - - for (int s = 0; s < 2; ++s) { - // TODO: end before the last token, no need to predict past the end of the sequences - for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) { - llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); - n_logits += 1; - } - } - - data[i1].i_logits = i_logits; - i_logits += n_logits; - - n_cur += data[i1].required_tokens; - if (++i1 == data.size()) { - break; - } - } - - if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); - return; - } - - llama_kv_cache_clear(ctx); - - // decode all tasks [i0, i1) - if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); - return; - } - - eval_pairs.clear(); - for (size_t i = i0; i < i1; ++i) { - auto & task = data[i]; - - const bool skip_choice = - task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx && - task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx; - - const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix; - const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0; - size_t li = n_base1 - task.common_prefix; - for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) { - eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[0][j+1]); - } - const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix; - const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0; - // FIXME: this uses the wrong first logits when not skipping the choice word - li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - task.common_prefix; - for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) { - eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[1][j+1]); - } - } - compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results); - - size_t ir = 0; - for (size_t i = i0; i < i1; ++i) { - auto & task = data[i]; - - const bool skip_choice = - task.seq_tokens[0].size() - task.common_prefix > k_min_trailing_ctx && - task.seq_tokens[1].size() - task.common_prefix > k_min_trailing_ctx; - - float score_1st = 0; - const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix; - const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0; - for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) { - score_1st += eval_results[ir++]; - } - score_1st /= (task.seq_tokens[0].size() - n_base1 - last_1st); - - float score_2nd = 0; - const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix; - const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0; - for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) { - score_2nd += eval_results[ir++]; - } - score_2nd /= (task.seq_tokens[1].size() - n_base2 - last_2nd); - - int result = score_1st > score_2nd ? 1 : 2; - - if (result == task.answer) { - ++n_correct; - } - ++n_done; - - // print the accumulated accuracy mean x 100 - printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); - fflush(stdout); - } - - i0 = i1 - 1; - } - - printf("\n"); - - if (n_done < 100) return; - - const float p = 1.f*n_correct/n_done; - const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1)); - printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); -} - -static bool deserialize_string(std::istream & in, std::string & str) { - uint32_t size; - if (!in.read((char *)&size, sizeof(size)).fail()) { - str.resize(size); - if (!in.read((char *)&str[0], size).fail()) return true; - } - return false; -} - -struct multiple_choice_answers { - std::vector answers; - std::vector labels; - bool deserialize(std::istream& in) { - uint32_t n; - in.read((char *)&n, sizeof(n)); - if (in.fail() || n > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose - answers.resize(n); - labels.resize(n); - for (auto& a : answers) { - if (!deserialize_string(in, a)) return false; - } - in.read((char *)labels.data(), n*sizeof(int)); - return !in.fail(); - } -}; - -struct multiple_choice_task { - std::string question; // the question (or context that needs to be continued) - multiple_choice_answers mc1; // possible answers (continuations) with a single correct answer - multiple_choice_answers mc2; // possible answers (continuations) with multiple correct answers - not handled yet - bool deserialize(std::istream& in) { - if (!deserialize_string(in, question)) return false; - return mc1.deserialize(in) && mc2.deserialize(in); - } - - // For evaluation - size_t i_logits; // starting index of logits in the llama_batch - size_t common_prefix; // max number of initial tokens that are the same in all sentences - size_t required_tokens; // needed number of tokens to evaluate all answers - std::vector> seq_tokens; - std::vector log_probs; -}; - -static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) { - if (task.question.empty() || task.mc1.answers.empty()) { - if (log_error) { - printf("%s: found bad task with empty question and/or answers\n", __func__); - } - return false; - } - task.seq_tokens.reserve(task.mc1.answers.size()); - for (auto& answer : task.mc1.answers) { - if (answer.empty()) { - if (log_error) { - printf("%s: found empty answer\n", __func__); - } - return false; - } - task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true)); - } - auto min_len = task.seq_tokens.front().size(); - for (auto& seq : task.seq_tokens) { - min_len = std::min(min_len, seq.size()); - } - task.common_prefix = 0; - for (size_t k = 0; k < min_len; ++k) { - auto token = task.seq_tokens[0][k]; - bool all_same = true; - for (size_t i = 1; i < task.seq_tokens.size(); ++i) { - if (task.seq_tokens[i][k] != token) { - all_same = false; - break; - } - } - if (!all_same) { - break; - } - ++task.common_prefix; - } - task.required_tokens = task.common_prefix; - for (auto& seq : task.seq_tokens) { - task.required_tokens += seq.size() - task.common_prefix; - } - return true; -} - -// -// Calculates score for multiple choice tasks with single correct answer from prompt. -// Commonly used LLM evaluation metrics of this type are -// * ARC -// * HellaSwag -// * MMLU -// * TruthfulQA -// -// Validation datasets for these 4 tests can be found at -// https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp -// The data for these datasets was extracted from -// git@hf.co:datasets/allenai/ai2_arc -// https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl -// git@hf.co:datasets/Stevross/mmlu -// https://huggingface.co/datasets/truthful_qa -// -static void multiple_choice_score(llama_context * ctx, const gpt_params & params) { - - std::istringstream strstream(params.prompt); - uint32_t n_task; - strstream.read((char *)&n_task, sizeof(n_task)); - if (strstream.fail() || n_task == 0) { - printf("%s: no tasks\n", __func__); - return; - } - printf("%s: there are %u tasks in prompt\n", __func__, n_task); - std::vector task_pos(n_task); - strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t)); - if (strstream.fail()) { - printf("%s: failed to read task positions from prompt\n", __func__); - return; - } - - std::vector tasks; - if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) { - // Use all tasks - tasks.resize(n_task); - printf("%s: reading tasks", __func__); - int n_dot = std::max((int) n_task/100, 1); - int i = 0; - for (auto& task : tasks) { - ++i; - if (!task.deserialize(strstream)) { - printf("%s: failed to read task %d of %u\n", __func__, i, n_task); - return; - } - if (i%n_dot == 0) printf("."); - } - printf("done\n"); - } - else { - printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); - std::mt19937 rng(1); - std::vector aux(n_task); - for (uint32_t i = 0; i < n_task; ++i) aux[i] = i; - float scale = 1.f/(1.f + (float)std::mt19937::max()); - tasks.resize(params.multiple_choice_tasks); - for (auto& task : tasks) { - int j = (int)(scale * rng() * aux.size()); - int idx = aux[j]; - aux[j] = aux.back(); - aux.pop_back(); - strstream.seekg(task_pos[idx], std::ios::beg); - if (!task.deserialize(strstream)) { - printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); - return; - } - } - n_task = params.multiple_choice_tasks; - } - - printf("%s: preparing task data", __func__); - fflush(stdout); - if (n_task > 500) { - printf("..."); - fflush(stdout); - std::atomic counter(0); - std::atomic n_bad(0); - auto prepare = [&counter, &n_bad, &tasks, ctx] () { - int num_tasks = tasks.size(); - int n_bad_local = 0; - while (true) { - int first = counter.fetch_add(K_TOKEN_CHUNK); - if (first >= num_tasks) { - if (n_bad_local > 0) n_bad += n_bad_local; - break; - } - int last = std::min(first + K_TOKEN_CHUNK, num_tasks); - for (int i = first; i < last; ++i) { - if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local; - } - } - }; - size_t max_thread = std::thread::hardware_concurrency(); - max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK); - std::vector workers(max_thread-1); - for (auto& w : workers) w = std::thread(prepare); - prepare(); - for (auto& w : workers) w.join(); - printf("done\n"); - fflush(stdout); - int nbad = n_bad; - if (nbad > 0) { - printf("%s: found %d malformed tasks\n", __func__, nbad); - return; - } - } else { - int n_dot = std::max((int) n_task/100, 1); - int i_task = 0; - for (auto& task : tasks) { - ++i_task; - if (!multiple_choice_prepare_one_task(ctx, task, true)) { - return; - } - if (i_task%n_dot == 0) { - printf("."); - fflush(stdout); - } - } - printf("done\n"); - } - - printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); - - printf("\ntask\tacc_norm\n"); - - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - const int n_ctx = llama_n_ctx(ctx); - const int n_batch = params.n_batch; - - const int max_tasks_per_batch = 32; - const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); - - llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); - - std::vector tok_logits(n_vocab); - std::vector batch_logits(n_vocab*n_ctx); - - std::vector> eval_pairs; - std::vector eval_results; - std::vector workers(std::thread::hardware_concurrency()); - std::vector batch_indeces; - - int n_done = 0; - int n_correct = 0; - int n_tot_answers = 0; - - for (size_t i0 = 0; i0 < tasks.size(); i0++) { - int n_cur = 0; - - size_t i1 = i0; - size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch - - llama_batch_clear(batch); - - // batch as much tasks as possible into the available context - // each task has 4 unique sequence ids - one for each ending - // the common prefix is shared among the 4 sequences to save tokens - // we extract logits only from the last common token and from all ending tokens of each sequence - int s0 = 0; - while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) { - auto& cur_task = tasks[i1]; - int n_logits = 0; - - int num_answers = cur_task.seq_tokens.size(); - if (s0 + num_answers > max_seq) { - break; - } - - if (int(batch_indeces.size()) != num_answers) { - batch_indeces.resize(num_answers); - } - for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s; - - for (size_t i = 0; i < cur_task.common_prefix; ++i) { - //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); - llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); - } - batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix - n_logits += 1; - - for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { - const size_t seq_tokens_size = cur_task.seq_tokens[s].size(); - // TODO: don't evaluate the last token of each sequence - for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) { - const bool needs_logits = i < seq_tokens_size - 1; - llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); - n_logits += needs_logits; - } - } - - s0 += num_answers; - - cur_task.i_logits = i_logits; - i_logits += n_logits; - - n_cur += cur_task.required_tokens; - if (++i1 == tasks.size()) { - break; - } - } - - if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); - return; - } - - llama_kv_cache_clear(ctx); - - // decode all tasks [i0, i1) - if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); - return; - } - - // Compute log-probs in parallel - // First we collect all tasks - eval_pairs.clear(); - for (size_t i = i0; i < i1; ++i) { - auto& cur_task = tasks[i]; - size_t li = 1; // skip the last logit of the common prefix (computed separately below) - for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { - for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { - eval_pairs.emplace_back(cur_task.i_logits + li++, cur_task.seq_tokens[s][j + 1]); - } - } - } - // Then we do the actual calculation - compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results); - - size_t ir = 0; - - // compute the logprobs for each ending of the decoded tasks - for (size_t i = i0; i < i1; ++i) { - auto & cur_task = tasks[i]; - //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); - //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) { - // if (cur_task.mc1.labels[j] == 1) { - // printf("%d", j+1); - // } - //} - //printf("\n common_prefix: %zu\n", cur_task.common_prefix); - - // get the logits of the last token of the common prefix - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); - - const auto first_probs = softmax(tok_logits); - - cur_task.log_probs.resize(cur_task.seq_tokens.size()); - for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { - size_t count = 1; - float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); - for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { - //printf(" %zu %g\n", ir, eval_results[ir]); - ++count; - log_prob += eval_results[ir++]; - } - cur_task.log_probs[s] = log_prob / count; - //printf(" Final: %g\n", log_prob / count); - //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); - } - - // Find the ending with maximum logprob - size_t logprob_max_idx = 0; - float logprob_max_val = cur_task.log_probs[0]; - for (size_t s = 1; s < cur_task.log_probs.size(); s++) { - if (cur_task.log_probs[s] > logprob_max_val) { - logprob_max_val = cur_task.log_probs[s]; - logprob_max_idx = s; - } - } - - n_tot_answers += cur_task.log_probs.size(); - if (cur_task.mc1.labels[logprob_max_idx] == 1) { - ++n_correct; - } - ++n_done; - - // Print the accumulated accuracy mean x 100 - printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); - fflush(stdout); - } - - i0 = i1 - 1; - } - - llama_batch_free(batch); - - if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return; - - float p = 1.f*n_correct/n_done; - float sigma = sqrt(p*(1-p)/(n_done-1)); - printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); - p = 1.f*n_done/n_tot_answers; - sigma = sqrt(p*(1-p)/(n_done-1)); - printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); - - printf("\n"); -} - -static void kl_divergence(llama_context * ctx, const gpt_params & params) { - if (params.logits_file.empty()) { - fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); - return; - } - std::ifstream in(params.logits_file.c_str(), std::ios::binary); - if (!in) { - fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str()); - return; - } - { - char check[9]; check[8] = 0; - in.read(check, 8); - if (in.fail() || strncmp("_logits_", check, 8) != 0) { - fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); - return; - } - } - - uint32_t n_ctx; - in.read((char *)&n_ctx, sizeof(n_ctx)); - if (n_ctx > llama_n_ctx(ctx)) { - fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", - __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); - } - - int n_vocab, n_chunk; - in.read((char *)&n_vocab, sizeof(n_vocab)); - in.read((char *)&n_chunk, sizeof(n_chunk)); - if (in.fail()) { - fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); - return; - } - if (n_vocab != llama_n_vocab(llama_get_model(ctx))) { - fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); - } - - std::vector tokens(n_ctx * n_chunk); - if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { - fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); - return; - } - - const int n_batch = params.n_batch; - const int num_batches = (n_ctx + n_batch - 1)/n_batch; - const int nv = 2*((n_vocab + 1)/2) + 4; - const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); - GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); - - std::vector log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); - std::vector kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); - std::vector p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); - std::vector logits; - if (num_batches > 1) { - logits.reserve(n_ctx * n_vocab); - } - - std::vector workers(std::thread::hardware_concurrency() - 1); - - auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) { - if (count < 1) { - return std::make_pair(0., 0.); - } - double f = sum/count; - double df = sum2/count - f*f; - df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.; - return std::make_pair(f, df); - }; - auto covariance = [] (double suma, double sumb, double sumab, size_t count) { - if (count < 10) { - return 0.0; - } - double var = sumab/count - (suma/count)*(sumb/count); - var /= count - 1; - return var; - }; - - kl_divergence_result kld; - auto kld_ptr = kld_values.data(); - auto p_diff_ptr = p_diff_values.data(); - - for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; - const int end = start + n_ctx; - - const auto t_start = std::chrono::high_resolution_clock::now(); - - if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { - fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i); - return; - } - - // clear the KV cache - llama_kv_cache_clear(ctx); - - for (int j = 0; j < num_batches; ++j) { - const int batch_start = start + j * n_batch; - const int batch_size = std::min(end - batch_start, n_batch); - - // save original token and restore it after eval - const auto token_org = tokens[batch_start]; - - // add BOS token for the first batch of each chunk - if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); - } - - // TODO: use llama_batch.logits instead of relying on logits_all == true - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return; - } - - // restore the original token in case it was set to BOS - tokens[batch_start] = token_org; - - if (num_batches > 1) { - const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); - } - } - - const auto t_end = std::chrono::high_resolution_clock::now(); - - if (i == 0) { - const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); - int total_seconds = (int)(t_total * n_chunk); - if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); - total_seconds = total_seconds % (60*60); - } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); - - printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); - } - - const int first = n_ctx/2; - const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr); - p_diff_ptr += n_ctx - 1 - first; - kld_ptr += n_ctx - 1 - first; - - printf("%4d", i+1); - - auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); - const double ppl_val = exp(log_ppl.first); - const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); - - auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); - const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); - const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; - const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); - - auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); - - auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); - const double p_diff_rms_val = sqrt(p_diff_mse.first); - const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); - - double p_top_val = 1.*kld.n_same_top/kld.count; - double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1)); - printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); - - printf("\n"); - - fflush(stdout); - - logits.clear(); - } - printf("\n"); - - if (kld.count < 100) return; // we do not wish to do statistics on so few values - - std::sort(kld_values.begin(), kld_values.end()); - std::sort(p_diff_values.begin(), p_diff_values.end()); - - printf("====== Perplexity statistics ======\n"); - - auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); - const double ppl_val = exp(log_ppl.first); - const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); - - auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); - const double ppl_base_val = exp(log_ppl_base.first); - const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 ) - printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); - - const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); - // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); - const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second); - printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); - - const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; - const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); - - const double ppl_ratio_val = exp(log_ppl_ratio_val); - const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 ) - printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); - - const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov; - const double ppl_diff_val = ppl_val - ppl_base_val; - const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov); - printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); - - printf("\n"); - - printf("====== KL divergence statistics ======\n"); - auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); - auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) - : kld_values[kld_values.size()/2]; - - auto percentile = [] (std::vector values, float fraction) { - if (fraction <= 0) return values.front(); - if (fraction >= 1) return values.back(); - float p = fraction*(values.size() - 1); - size_t ip = size_t(p); p -= ip; - return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)]; - }; - - printf("Maximum KLD: %10.6f\n", kld_values.back()); - printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); - printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); - printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); - printf("Median KLD: %10.6f\n", kld_median); - printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); - printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); - printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f)); - printf("Minimum KLD: %10.6f\n", kld_values.front()); - - printf("\n"); - - printf("====== Token probability statistics ======\n"); - - auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count); - printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second); - - auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1]) - : p_diff_values[p_diff_values.size()/2]; - - printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); - printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); - printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); - printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); - printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); - printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); - printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); - printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); - printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); - printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); - printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); - printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); - printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); - - auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); - // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); - - const double p_diff_rms_val = sqrt(p_diff_mse.first); - const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); - - const double same_top_p = 1.0*kld.n_same_top/kld.count; - printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); - -} - -int main(int argc, char ** argv) { - gpt_params params; - - params.n_ctx = 512; - params.logits_all = true; - - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY); - if (!gpt_params_parse(argc, argv, params, options)) { - return 1; - } - - const int32_t n_ctx = params.n_ctx; - - if (n_ctx <= 0) { - fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__); - return 1; - } - - const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence; - - if (ppl) { - const int32_t n_seq = std::max(1, params.n_batch / n_ctx); - const int32_t n_kv = n_seq * n_ctx; - - params.n_parallel = n_seq; - params.n_ctx = n_kv; - - params.n_batch = std::min(params.n_batch, n_kv); - } else { - params.n_batch = std::min(params.n_batch, params.n_ctx); - if (params.kl_divergence) { - params.n_parallel = 1; - } else { - // ensure there's at least enough seq_ids for HellaSwag - params.n_parallel = std::max(4, params.n_parallel); - } - } - - if (params.ppl_stride > 0) { - fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", - params.n_ctx, params.n_ctx + params.ppl_stride/2); - params.n_ctx += params.ppl_stride/2; - } - - print_build_info(); - - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - - llama_backend_init(); - llama_numa_init(params.numa); - - // load the model and apply lora adapter, if any - llama_init_result llama_init = llama_init_from_gpt_params(params); - - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return 1; - } - - const int n_ctx_train = llama_n_ctx_train(model); - - if (params.n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, params.n_ctx); - } - - // print system information - { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); - } - - struct results_perplexity results; - if (params.hellaswag) { - hellaswag_score(ctx, params); - } else if (params.winogrande) { - winogrande_score(ctx, params); - } else if (params.multiple_choice) { - multiple_choice_score(ctx, params); - } else if (params.kl_divergence) { - kl_divergence(ctx, params); - } else { - results = perplexity(ctx, params, n_ctx); - } - - LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); - write_logfile(ctx, params, model, results); - - llama_free(ctx); - llama_free_model(model); - - llama_backend_free(); - - return 0; -} diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index dd8a82e6e..7a360b731 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -111,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { return 1; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp deleted file mode 100644 index 9e3fad60a..000000000 --- a/examples/save-load-state/save-load-state.cpp +++ /dev/null @@ -1,257 +0,0 @@ -#include "build-info.h" -#include "common.h" -#include "llama.h" - -#include -#include - -int main(int argc, char ** argv) { - gpt_params params; - - params.prompt = "The quick brown fox"; - params.sparams.seed = 1234; - - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { - return 1; - } - - print_build_info(); - - if (params.n_predict < 0) { - params.n_predict = 16; - } - - auto n_past = 0; - - std::string result0; - std::string result1; - std::string result2; - - // init - llama_init_result llama_init = llama_init_from_gpt_params(params); - - llama_model * model = llama_init.model; - llama_context * ctx = llama_init.context; - - if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); - return 1; - } - - auto sparams = llama_sampler_chain_default_params(); - - llama_sampler * smpl = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl, llama_sampler_init_softmax()); - llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed)); - - // tokenize prompt - auto tokens = llama_tokenize(ctx, params.prompt, true); - - // evaluate prompt - llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0)); - n_past += tokens.size(); - - // save state (rng, logits, embedding and kv_cache) to file - { - std::vector state_mem(llama_state_get_size(ctx)); - const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size()); - - FILE *fp_write = fopen("dump_state.bin", "wb"); - fwrite(state_mem.data(), 1, written, fp_write); - fclose(fp_write); - - fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size()); - } - - // save state (last tokens) - const auto n_past_saved = n_past; - - // first run - printf("\nfirst run: %s", params.prompt.c_str()); - - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl, ctx, -1); - auto next_token_str = llama_token_to_piece(ctx, next_token); - - llama_sampler_accept(smpl, next_token); - - printf("%s", next_token_str.c_str()); - result0 += next_token_str; - - if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_free(ctx); - llama_free_model(model); - return 1; - } - n_past += 1; - } - - printf("\n\n"); - - // free old context - llama_free(ctx); - - // make new context - auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); - - llama_sampler * smpl2 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl2, llama_sampler_init_softmax()); - llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed)); - - printf("\nsecond run: %s", params.prompt.c_str()); - - // load state (rng, logits, embedding and kv_cache) from file - { - std::vector state_mem; - - FILE * fp_read = fopen("dump_state.bin", "rb"); - fseek(fp_read, 0, SEEK_END); - state_mem.resize(ftell(fp_read)); - fseek(fp_read, 0, SEEK_SET); - const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); - fclose(fp_read); - - if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) { - fprintf(stderr, "\n%s : failed to read state\n", __func__); - llama_free(ctx2); - llama_free_model(model); - return 1; - } - - fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size()); - } - - // restore state (last tokens) - n_past = n_past_saved; - - // second run - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl2, ctx2, -1); - auto next_token_str = llama_token_to_piece(ctx2, next_token); - - llama_sampler_accept(smpl2, next_token); - - printf("%s", next_token_str.c_str()); - result1 += next_token_str; - - if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_free(ctx2); - llama_free_model(model); - return 1; - } - n_past += 1; - } - - printf("\n\n"); - - llama_free(ctx2); - - if (result0 != result1) { - fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); - return 1; - } - - // make new context - auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); - - llama_sampler * smpl3 = llama_sampler_chain_init(sparams); - - llama_sampler_chain_add(smpl3, llama_sampler_init_softmax()); - llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed)); - - printf("\nsingle seq run: %s", params.prompt.c_str()); - - // load state (rng, logits, embedding and kv_cache) from file - { - std::vector state_mem; - - FILE * fp_read = fopen("dump_state.bin", "rb"); - fseek(fp_read, 0, SEEK_END); - state_mem.resize(ftell(fp_read)); - fseek(fp_read, 0, SEEK_SET); - const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); - fclose(fp_read); - - if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) { - fprintf(stderr, "\n%s : failed to read state\n", __func__); - llama_free(ctx3); - llama_free_model(model); - return 1; - } - - fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size()); - } - - // restore state (last tokens) - n_past = n_past_saved; - - // save seq 0 and load into seq 1 - { - // save kv of seq 0 - std::vector seq_store(llama_state_seq_get_size(ctx3, 0)); - const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0); - if (ncopy != seq_store.size()) { - fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); - llama_free(ctx3); - llama_free_model(model); - return 1; - } - fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); - - // erase whole kv - llama_kv_cache_clear(ctx3); - fprintf(stderr, "%s : kv cache cleared\n", __func__); - - // restore kv into seq 1 - const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1); - if (nset != seq_store.size()) { - fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); - llama_free(ctx3); - llama_free_model(model); - return 1; - } - fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); - } - - // third run with seq 1 instead of 0 - for (auto i = 0; i < params.n_predict; i++) { - auto next_token = llama_sampler_sample(smpl3, ctx3, -1); - auto next_token_str = llama_token_to_piece(ctx3, next_token); - - llama_sampler_accept(smpl3, next_token); - - printf("%s", next_token_str.c_str()); - result2 += next_token_str; - - if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) { - fprintf(stderr, "\n%s : failed to evaluate\n", __func__); - llama_free(ctx3); - llama_free_model(model); - return 1; - } - n_past += 1; - } - - printf("\n"); - - llama_sampler_free(smpl); - llama_sampler_free(smpl2); - llama_sampler_free(smpl3); - - llama_free(ctx3); - llama_free_model(model); - - if (result0 != result2) { - fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); - return 1; - } - - fprintf(stderr, "\n%s : success\n", __func__); - - return 0; -} diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d3dce1294..77ee4d4b9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,6 +1,8 @@ #include "utils.hpp" +#include "arg.h" #include "common.h" +#include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" #include "build-info.h" @@ -614,7 +616,7 @@ struct server_context { gpt_params params; - llama_batch batch; + llama_batch batch = {}; bool clean_kv_cache = true; bool add_bos_token = true; @@ -2424,8 +2426,7 @@ int main(int argc, char ** argv) { // own arguments required by this example gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index a53cef547..3fdc04394 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -18,8 +19,7 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } @@ -118,8 +118,6 @@ int main(int argc, char ** argv) { { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); - llama_sampler_accept(smpl, new_token_id); - // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { LOG_TEE("\n"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp deleted file mode 100644 index 8df587fee..000000000 --- a/examples/speculative/speculative.cpp +++ /dev/null @@ -1,644 +0,0 @@ -#include "build-info.h" - -#include "common.h" -#include "llama.h" - -#include -#include -#include -#include -#include - -#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 -#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 - -struct seq_draft { - bool active = false; - bool drafting = false; - bool skip = false; - - int i_batch_dft = 0; - std::vector i_batch_tgt; - - std::vector tokens; - std::vector> dists; - - struct gpt_sampler * smpl = nullptr; -}; - -int main(int argc, char ** argv) { - gpt_params params; - - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE); - if (!gpt_params_parse(argc, argv, params, options)) { - return 1; - } - - if (params.model_draft.empty()) { - fprintf(stderr, "%s: error: --model-draft is required\n", __func__); - return 1; - } - - // max number of parallel drafting sequences (i.e. tree branches) - const int n_seq_dft = params.n_parallel; - - // probability threshold for splitting a draft branch (only for n_seq_dft > 1) - const float p_split = params.p_split; - - std::default_random_engine rng(params.sparams.seed); - std::uniform_real_distribution<> u_dist; - -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("speculative", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - - // init llama.cpp - llama_backend_init(); - llama_numa_init(params.numa); - - llama_model * model_tgt = NULL; - llama_model * model_dft = NULL; - - llama_context * ctx_tgt = NULL; - llama_context * ctx_dft = NULL; - - // load the target model - llama_init_result llama_init_tgt = llama_init_from_gpt_params(params); - model_tgt = llama_init_tgt.model; - ctx_tgt = llama_init_tgt.context; - - // load the draft model - params.model = params.model_draft; - params.n_gpu_layers = params.n_gpu_layers_draft; - if (params.draft_cpuparams.n_threads > 0) { - params.cpuparams.n_threads = params.draft_cpuparams.n_threads; - } - - params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads; - llama_init_result llama_init_dft = llama_init_from_gpt_params(params); - model_dft = llama_init_dft.model; - ctx_dft = llama_init_dft.context; - - const bool vocab_type_tgt = llama_vocab_type(model_tgt); - LOG("vocab_type tgt: %d\n", vocab_type_tgt); - - const bool vocab_type_dft = llama_vocab_type(model_dft); - LOG("vocab_type dft: %d\n", vocab_type_dft); - - if (vocab_type_tgt != vocab_type_dft) { - fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__); - fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); - return 1; - } - - if ( - llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || - llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || - llama_token_bos(model_tgt) != llama_token_bos(model_dft) || - llama_token_eos(model_tgt) != llama_token_eos(model_dft) - ) { - fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__); - return 1; - } - - { - const int n_vocab_tgt = llama_n_vocab(model_tgt); - const int n_vocab_dft = llama_n_vocab(model_dft); - const int vocab_diff = n_vocab_tgt > n_vocab_dft - ? n_vocab_tgt - n_vocab_dft - : n_vocab_dft - n_vocab_tgt; - - if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { - fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__); - fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", - n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); - return 1; - } - - for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { - const char * token_text_tgt = llama_token_get_text(model_tgt, i); - const char * token_text_dft = llama_token_get_text(model_dft, i); - if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); - fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, - llama_token_to_piece(ctx_tgt, i).c_str(), - llama_token_to_piece(ctx_dft, i).c_str()); - return 1; - } - } - } - - - // Tokenize the prompt - std::vector inp; - inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true); - - const int max_context_size = llama_n_ctx(ctx_tgt); - const int max_tokens_list_size = max_context_size - 4; - - if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); - return 1; - } - - fprintf(stderr, "\n\n"); - - for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str()); - } - - fflush(stderr); - - const int n_input = inp.size(); - - const auto t_enc_start = ggml_time_us(); - - // eval the prompt with both models - llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0)); - llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0)); - llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0)); - - const auto t_enc_end = ggml_time_us(); - - // the 2 models should have the same vocab - //GGML_ASSERT(n_vocab == llama_n_vocab(model_dft)); - - // how many tokens to draft each time - int n_draft = params.n_draft; - - int n_predict = 0; - int n_drafted = 0; - int n_accept = 0; - - int n_past_tgt = inp.size(); - int n_past_dft = inp.size(); - - // used to determine end of generation - bool has_eos = false; - - // target model sampling context (reuse the llama_context's sampling instance) - struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams); - - struct llama_sampler * softmax = llama_sampler_init_softmax(); - - // draft sequence data - std::vector drafts(n_seq_dft); - - for (int s = 0; s < n_seq_dft; ++s) { - // allocate gpt_sampler for each draft sequence - drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams); - } - - llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1); - llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft); - - const auto t_dec_start = ggml_time_us(); - - // sample from the last token of the prompt - drafts[0].i_batch_tgt.resize(1); - drafts[0].i_batch_tgt[0] = 0; - - while (true) { - std::set active_seqs = {}; - - // print current draft sequences - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - - active_seqs.insert(s); - const auto & tokens = drafts[s].tokens; - - LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); - } - - int i_dft = 0; - int s_keep = 0; - - llama_token token_id; - std::string token_str; - - // loop until we fail to accept a drafted token or we run out of drafted tokens - while (true) { - - // check if the target token matches any of the drafts - // for stochastic sampling, attempt to match the token with the drafted tokens - { - bool accept = false; - if (params.sparams.temp > 0) { - // stochastic verification - gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true); - - auto & dist_tgt = *gpt_sampler_get_candidates(smpl); - - float p_tgt = 0.0f; - float p_dft = 0.0f; - - while (active_seqs.size() > 0) { - // randomly select a sequence to verify from active sequences - std::uniform_int_distribution u_int_dist(0, active_seqs.size() - 1); - int s = *std::next(active_seqs.begin(), u_int_dist(rng)); - if (i_dft >= (int) drafts[s].tokens.size()) { - drafts[s].active = false; - active_seqs.erase(s); - continue; - } - if (accept) { - // if we already accepted a token, we can skip the rest - if (drafts[s].tokens[i_dft] != drafts[s_keep].tokens[i_dft]) { - drafts[s].active = false; - active_seqs.erase(s); - } - continue; - } - - LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); - float r = u_dist(rng); - llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true }; - - //GGML_ASSERT(dist_tgt.size <= dist_dft.size); - - // acquire the token probabilities assigned by the draft and target models - for (size_t i = 0; i < dist_tgt.size; i++) { - if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) { - p_tgt = dist_tgt.data[i].p; - } - if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) { - p_dft = dist_dft.data[i].p; - } - if (p_tgt && p_dft) { - break; - } - } - LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt); - if (r <= p_tgt / p_dft) { - s_keep = s; - accept = true; - token_id = drafts[s].tokens[i_dft]; - token_str = llama_token_to_piece(ctx_tgt, token_id); - gpt_sampler_accept(smpl, token_id, true); - - LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); - break; - } else { - LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); - drafts[s].active = false; - - // calculate residual probability - GGML_ASSERT(dist_tgt.sorted); - GGML_ASSERT(dist_dft.sorted); - - // sort dist by id - std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) { - return a.id < b.id; - }); - std::sort(dist_dft.data, dist_dft.data + dist_dft.size, [](const llama_token_data &a, const llama_token_data &b) { - return a.id < b.id; - }); - - float sum_probs = 0.0f; - - for (size_t i = 0; i < dist_tgt.size; i++) { - if (i < dist_dft.size) { - dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p); - } else { - dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p); - } - - sum_probs += dist_tgt.data[i].p; - } - - for (size_t i = 0; i < dist_tgt.size; i++) { - dist_tgt.data[i].p /= sum_probs; - } - - // sort dist_tgt by p desc - std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) { - return a.p > b.p; - }); - } - - active_seqs.erase(s); - for(int i = 0; i < n_seq_dft; i++) { - if (i == s) { - continue; - } - if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) { - // synchronize active status for sequences with the same drafted token - drafts[i].active = drafts[i].active && accept; - if (!drafts[i].active) { - active_seqs.erase(s); - } - } - } - } - - if (!accept) { - // all drafted tokens were rejected - // sample from the target model - LOG("all drafted tokens were rejected, sampling from residual distribution\n"); - std::vector probs(dist_tgt.size); - for (size_t i = 0; i < dist_tgt.size; ++i) { - probs[i] = dist_tgt.data[i].p; - } - - std::discrete_distribution<> dist(probs.begin(), probs.end()); - - const int idx = dist(rng); - - token_id = dist_tgt.data[idx].id; - gpt_sampler_accept(smpl, token_id, true); - token_str = llama_token_to_piece(ctx_tgt, token_id); - } - } else { - // greedy verification - - // sample from the target model - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); - token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); - - gpt_sampler_accept(smpl, token_id, true); - - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str()); - - token_str = llama_token_to_piece(ctx_tgt, token_id); - - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - - if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str()); - - s_keep = s; - accept = true; - } else { - drafts[s].active = false; - } - } - } - - if (llama_token_is_eog(model_tgt, token_id)) { - has_eos = true; - } - ++n_predict; - - if (accept) { - ++n_accept; - ++n_past_tgt; - ++n_past_dft; - ++i_dft; - if (params.use_color) { - // Color token according to its origin sequence - printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); - } else { - printf("%s", token_str.c_str()); - } - fflush(stdout); - continue; - } else { - printf("%s", token_str.c_str()); - fflush(stdout); - break; - } - } - } - - { - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str()); - - // TODO: simplify - { - LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); - - llama_kv_cache_seq_keep(ctx_dft, s_keep); - llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_dft, 0); - - llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1); - llama_kv_cache_seq_keep(ctx_tgt, s_keep); - llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1); - llama_kv_cache_seq_keep(ctx_tgt, 0); - } - - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].active = false; - drafts[s].tokens.clear(); - drafts[s].i_batch_tgt.clear(); - drafts[s].dists.clear(); - } - // note: will be erased after the speculation phase - drafts[0].tokens.push_back(token_id); - drafts[0].dists.push_back(std::vector()); - drafts[0].i_batch_tgt.push_back(0); - - llama_batch_clear(batch_dft); - llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); - - llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); - llama_decode(ctx_dft, batch_dft); - - ++n_past_dft; - } - - if (n_predict > params.n_predict || has_eos) { - break; - } - - if (drafts[0].smpl) { - gpt_sampler_free(drafts[0].smpl); - } - drafts[0].smpl = gpt_sampler_clone(smpl); - - int n_seq_cur = 1; - int n_past_cur = n_past_dft; - - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].active = false; - drafts[s].drafting = false; - } - drafts[0].active = true; - drafts[0].drafting = true; - drafts[0].i_batch_dft = 0; - - llama_batch_clear(batch_tgt); - llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true); - - // sample n_draft tokens from the draft model using tree-based sampling - for (int i = 0; i < n_draft; ++i) { - batch_dft.n_tokens = 0; - - for (int s = 0; s < n_seq_dft; ++s) { - drafts[s].skip = false; - } - - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].drafting || drafts[s].skip) { - continue; - } - - gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true); - - const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl); - - for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) { - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", - k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); - } - - std::vector sa(1, s); - - // attempt to split the branch if the probability is high enough - for (int f = 1; f < 8; ++f) { - if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) { - LOG("splitting seq %3d into %3d\n", s, n_seq_cur); - - llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); - llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); - - // all previous tokens from this branch are now also part of the new branch - for (int t = 0; t < batch_tgt.n_tokens; ++t) { - for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) { - if (batch_tgt.seq_id[t][p] == s) { - batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur; - batch_tgt.n_seq_id[t]++; - break; - } - } - } - - // copy the draft state - drafts[n_seq_cur].active = true; - drafts[n_seq_cur].drafting = true; - drafts[n_seq_cur].skip = true; - - drafts[n_seq_cur].tokens = drafts[s].tokens; - drafts[n_seq_cur].dists = drafts[s].dists; - drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft; - drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt; - - if (drafts[n_seq_cur].smpl) { - gpt_sampler_free(drafts[n_seq_cur].smpl); - } - drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl); - - sa.push_back(n_seq_cur); - - n_seq_cur++; - } else { - break; - } - } - - // add drafted token for each sequence - for (int is = 0; is < (int) sa.size(); ++is) { - const llama_token id = cur_p->data[is].id; - - const int s = sa[is]; - - gpt_sampler_accept(drafts[s].smpl, id, true); - - drafts[s].tokens.push_back(id); - // save cur_p.data into drafts[s].dists - drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size}); - - // add unique drafted tokens to the target batch - drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); - - llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); - - // add the token to the batch for batched decoding with the draft model - drafts[s].i_batch_dft = batch_dft.n_tokens; - - llama_batch_add(batch_dft, id, n_past_cur, { s }, true); - - if (batch_tgt.n_tokens > n_draft) { - drafts[s].drafting = false; - } - } - } - - // no sequence is drafting anymore - if (batch_dft.n_tokens == 0) { - break; - } - - // evaluate the drafted tokens on the draft model - llama_decode(ctx_dft, batch_dft); - ++n_past_cur; - ++n_drafted; - - if (batch_tgt.n_tokens > n_draft) { - break; - } - } - - // evaluate the target model on the drafted tokens - { - llama_kv_cache_seq_keep(ctx_tgt, 0); - for (int s = 1; s < n_seq_dft; ++s) { - llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); - } - - // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); - llama_decode(ctx_tgt, batch_tgt); - ++n_past_tgt; - } - - // the first token is always proposed by the target model before the speculation loop so we erase it here - for (int s = 0; s < n_seq_dft; ++s) { - if (!drafts[s].active) { - continue; - } - - drafts[s].tokens.erase(drafts[s].tokens.begin()); - drafts[s].dists.erase(drafts[s].dists.begin()); - } - } - - auto t_dec_end = ggml_time_us(); - - LOG_TEE("\n\n"); - - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - - LOG_TEE("\ndraft:\n\n"); - // TODO: print sampling/grammar timings for all drafts - llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT); - - LOG_TEE("\ntarget:\n\n"); - gpt_perf_print(ctx_tgt, smpl); - - gpt_sampler_free(smpl); - for (int s = 0; s < n_seq_dft; ++s) { - gpt_sampler_free(drafts[s].smpl); - } - - llama_sampler_free(softmax); - llama_batch_free(batch_dft); - - llama_free(ctx_tgt); - llama_free_model(model_tgt); - - llama_free(ctx_dft); - llama_free_model(model_dft); - - llama_backend_free(); - - fprintf(stderr, "\n\n"); - - return 0; -} diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index bdd98487f..b1423b0fd 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2556,7 +2556,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + + if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu index 0d5e953ee..21da63509 100644 --- a/ggml/src/ggml-cuda/sum.cu +++ b/ggml/src/ggml-cuda/sum.cu @@ -1,13 +1,15 @@ +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh. +// For this reason CUB must be included BEFORE anything else. +#include +using namespace cub; +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + #include "sumrows.cuh" #include "sum.cuh" #include -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) -#include -using namespace cub; -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) - void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) { #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) size_t tmp_size = 0; diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index fcf08fbdc..f8283056f 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -17,8 +17,8 @@ #define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_ERROR(...) #else -#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #endif @@ -3039,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute( if (status != MTLCommandBufferStatusCompleted) { GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - NSString * error_code = [command_buffer error].localizedDescription; - GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]); + GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index abae1fdc0..c757a08b6 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4004,42 +4004,141 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (ggml_sve_cnt_b == QK8_0) { - const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); - const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + const int vector_length = ggml_sve_cnt_b*8; - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + // VLA Implementation using switch case + switch (vector_length) { + case 128: + { + // predicate for activating higher lanes for 4 float32 elements + const svbool_t ph4 = svptrue_pat_b32(SV_VL4); - // load x - const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - // sub 8 - const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + // 4-bit -> 8-bit + const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F)); + const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04)); + const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F)); + const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04)); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + // sub 8 + const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8); + const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8); + const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8); + const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8); - // dot product - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + // load y + const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16); + const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs); + const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16); - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + // dot product + sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx0ls, qy0l), + svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx1ls, qy1l), + svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 256: + { + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements + const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating higher lanes for 32 int8 elements + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes + const svbool_t pl16 = svnot_b_z(ph32, ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); + const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(ph32, y0->qs); + const svint8_t qy1 = svld1_s8(ph32, y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); + } break; + default: + assert(false && "Unsupported vector length"); + break; } + #elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -5489,29 +5588,124 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (ggml_sve_cnt_b == QK8_0) { - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const int vector_length = ggml_sve_cnt_b*8; - // load x - const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); - const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + //VLA Implemenation for SVE + switch (vector_length) { + case 128: + { + // predicate for activating lanes for 16 Int8 elements + const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); + const svbool_t pl16 = svptrue_pat_b32(SV_VL4); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + // load x + const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); + const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16); + const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); + const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + // load y + const svint8_t qy0_0 = svld1_s8(ph16, y0->qs); + const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); + const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); + const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); + + sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), + svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), + svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); + } break; + case 256: + { + //printf("sve256"); + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating high 256 bit + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + // predicate for activating low 256 bit + const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); + + // predicate for activating high lanes for 8 float32 elements + const svbool_t ph8 = svptrue_pat_b32(SV_VL8); + // predicate for activating low lanes for 8 float32 elements + const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); + + svfloat32_t sumv00 = svdup_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits + // and add them to make one 64 element vector + // load x + const svint8_t qx_32 = svld1_s8(ph32, x0->qs); + svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); + + qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); + + // load y + const svint8_t qy_32 = svld1_s8(ph32, y0->qs); + svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); + + qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); + + // scale creation + const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); + const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); + + // duplicate deq1 in first half of vector and deq2 in second half of vector + const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); + + const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); + + sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); + } + + sumf = svaddv_f32(svptrue_b32(), sumv00); + break; + } + default: + assert(false && "Unsupported vector length"); + break; } #elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 8f9d0a460..9c600c7ca 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp } result->buffer = reinterpret_cast(tensor->buffer); if (result->buffer && buffers.find(result->buffer) == buffers.end()) { - return nullptr; + result->buffer = nullptr; } - // require that the tensor data does not go beyond the buffer end - uint64_t tensor_size = (uint64_t) ggml_nbytes(result); - uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); - uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); - GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow - GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + if (result->buffer) { + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + } result->op = (ggml_op) tensor->op; for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { @@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector & input, std::vector ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + size_needed, ctx->mem_size); + __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; } diff --git a/include/llama.h b/include/llama.h index e6d65f753..8bc8516ba 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1129,15 +1129,16 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); - // Shorthand for: + /// @details Sample and accept a token from the idx-th output of the last evaluation // + // Shorthand for: // const auto * logits = llama_get_logits_ith(ctx, idx); // llama_token_data_array cur_p = { ... init from logits ... }; // llama_sampler_apply(smpl, &cur_p); - // return cur_p.data[cur_p.selected].id; - // - // At this point, this is mostly a convenience function. - // + // auto token = cur_p.data[cur_p.selected].id; + // llama_sampler_accept(smpl, token); + // return token; + // Returns the sampled token LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx); // TODO: extend in the future diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 64d4cc995..832fb5259 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -8,49 +8,44 @@ #include #include #include +#include #include #include #include -static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng, std::vector & probs) { -#if 1 - probs.resize(cur_p->size); - for (size_t i = 0; i < cur_p->size; ++i) { - probs[i] = cur_p->data[i].p; - } - - std::discrete_distribution dist(probs.begin(), probs.end()); -#else - // avoid the copy with a custom iterator +static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { + // iterator for the probabilities +#ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#endif struct probs_iterator { typedef std::input_iterator_tag iterator_category; typedef float value_type; typedef float * pointer; typedef float & reference; - typedef size_t difference_type; + typedef ptrdiff_t difference_type; - const llama_token_data_array * data; - size_t i; + const llama_token_data * data; - bool operator==(const probs_iterator & other) const { return data + i == other.data + other.i; } - bool operator!=(const probs_iterator & other) const { return data + i != other.data + other.i; } - float operator*() const { return data->data[i].p; } - probs_iterator & operator++() { ++i; return *this; } - probs_iterator operator++(int) { probs_iterator tmp = *this; ++i; return tmp; } + bool operator==(const probs_iterator & other) const { return data == other.data; } + bool operator!=(const probs_iterator & other) const { return data != other.data; } + const float & operator*() const { return data->p; } + probs_iterator & operator++() { ++data; return *this; } + probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; } }; + +#ifdef __GNUC__ #pragma GCC diagnostic pop - - std::discrete_distribution dist(probs_iterator{cur_p, 0}, probs_iterator{cur_p, cur_p->size}); - - GGML_UNUSED(probs); #endif + std::discrete_distribution dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size}); + return dist(rng); } +/* static void llama_log_softmax(float * array, size_t size) { float max_l = *std::max_element(array, array + size); float sum = 0.f; @@ -64,6 +59,7 @@ static void llama_log_softmax(float * array, size_t size) { array[i] = logf(array[i] / sum); } } +*/ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) { GGML_ASSERT(cur_p->size > 0); @@ -231,67 +227,92 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; } - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + llama_token_data_array cur_p = { + /* .data = */ cur.data(), + /* .size = */ cur.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; llama_sampler_apply(smpl, &cur_p); - return cur_p.data[cur_p.selected].id; + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); + + auto token = cur_p.data[cur_p.selected].id; + + llama_sampler_accept(smpl, token); + + return token; } // sampler chain +static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) { + return "chain"; +} + +static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + for (auto * smpl : chain->samplers) { + llama_sampler_accept(smpl, token); + } + + chain->n_sample++; +} + +static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + for (auto * smpl : chain->samplers) { + llama_sampler_apply(smpl, cur_p); + } +} + +static void llama_sampler_chain_reset(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto * smpl : chain->samplers) { + llama_sampler_reset(smpl); + } + + chain->t_sample_us = 0; + chain->n_sample = 0; +} + +static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) { + const auto * chain_src = (const llama_sampler_chain *) smpl->ctx; + + auto * result = llama_sampler_chain_init(chain_src->params); + + for (auto * smpl : chain_src->samplers) { + llama_sampler_chain_add(result, llama_sampler_clone(smpl)); + } + + return result; +} + +static void llama_sampler_chain_free(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto * smpl : chain->samplers) { + llama_sampler_free(smpl); + } + + delete chain; +} + static struct llama_sampler_i llama_sampler_chain_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "chain"; }, - /* .accept = */ [](struct llama_sampler * smpl, llama_token token) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - time_meas tm(chain->t_sample_us, chain->params.no_perf); - - for (auto * smpl : chain->samplers) { - llama_sampler_accept(smpl, token); - } - - chain->n_sample++; - }, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - time_meas tm(chain->t_sample_us, chain->params.no_perf); - - for (auto * smpl : chain->samplers) { - llama_sampler_apply(smpl, cur_p); - } - }, - /* .reset = */ [](struct llama_sampler * smpl) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto * smpl : chain->samplers) { - llama_sampler_reset(smpl); - } - - chain->t_sample_us = 0; - chain->n_sample = 0; - }, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * chain_src = (const llama_sampler_chain *) smpl->ctx; - - auto * result = llama_sampler_chain_init(chain_src->params); - - for (auto * smpl : chain_src->samplers) { - llama_sampler_chain_add(result, llama_sampler_clone(smpl)); - } - - return result; - }, - /* .free = */ [](struct llama_sampler * smpl) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto * smpl : chain->samplers) { - llama_sampler_free(smpl); - } - - delete chain; - }, + /* .name = */ llama_sampler_chain_name, + /* .accept = */ llama_sampler_chain_accept, + /* .apply = */ llama_sampler_chain_apply, + /* .reset = */ llama_sampler_chain_reset, + /* .clone = */ llama_sampler_chain_clone, + /* .free = */ llama_sampler_chain_free, }; struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) { @@ -368,8 +389,6 @@ struct llama_sampler_dist { const uint32_t seed; std::mt19937 rng; - - std::vector probs; // work array }; static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) { @@ -378,7 +397,7 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl* static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_dist *) smpl->ctx; - cur_p->selected = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + cur_p->selected = llama_sample_dist(cur_p, ctx->rng); } static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { @@ -419,7 +438,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { /* .ctx = */ new llama_sampler_dist { /* .seed = */ seed, /* .rng = */ std::mt19937(seed), - /* .probs = */ {}, }, }; } @@ -1023,8 +1041,6 @@ struct llama_sampler_mirostat { float mu; std::mt19937 rng; - - std::vector probs; }; static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) { @@ -1055,7 +1071,7 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke llama_sampler_top_k_impl(cur_p, std::max(int(k), 1)); llama_sampler_softmax_impl(cur_p); - const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; @@ -1111,7 +1127,6 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see /* .m = */ m, /* .mu = */ 2.0f*tau, /* .rng = */ std::mt19937(seed), - /* .probs = */ {}, }, }; } @@ -1127,8 +1142,6 @@ struct llama_sampler_mirostat_v2 { float mu; std::mt19937 rng; - - std::vector probs; }; static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) { @@ -1152,7 +1165,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t // Normalize the probabilities of the remaining words llama_sampler_softmax_impl(cur_p); - const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; @@ -1207,7 +1220,6 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, /* .eta = */ eta, /* .mu = */ 2.0f*tau, /* .rng = */ std::mt19937(seed), - /* .probs = */ {}, }, }; } @@ -1527,6 +1539,10 @@ static const char * llama_sampler_logit_bias_name(const struct llama_sampler * / static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; + if (ctx->logit_bias.empty()) { + return; + } + ctx->to_search.clear(); // update the candidates that have not been shuffled in the vocabulary (i.e. idx == id) @@ -1538,6 +1554,10 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to } } + if (ctx->to_search.empty()) { + return; + } + // search for the remaining candidates that were not found in the previous step for (size_t i = 0; i < cur_p->size; ++i) { for (const auto & lb : ctx->to_search) { diff --git a/src/llama.cpp b/src/llama.cpp index 0687cf463..d1e934970 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9317,7 +9317,7 @@ static struct ggml_tensor * llm_build_copy_mask_state( // FIXME: zero-out NANs? states = ggml_mul(ctx, states, state_mask); - // copy states which won't be changed further (between n_seqs and n_rs) + // copy states which won't be changed further (between n_seqs and n_kv) ggml_build_forward_expand(graph, ggml_cpy(ctx, ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), @@ -17607,6 +17607,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= name.find("time_mix_first.weight") == std::string::npos; quantize &= name.find("time_mix_w1.weight") == std::string::npos; quantize &= name.find("time_mix_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 9ad91acc0..f26707910 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -1,3 +1,6 @@ +#include "arg.h" +#include "common.h" + #include #include #include @@ -6,18 +9,16 @@ #undef NDEBUG #include -#include "common.h" - int main(void) { gpt_params params; printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) { try { - auto options = gpt_params_parser_init(params, (enum llama_example)ex); + auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex); std::unordered_set seen_args; std::unordered_set seen_env_vars; - for (const auto & opt : options) { + for (const auto & opt : ctx_arg.options) { // check for args duplications for (const auto & arg : opt.args) { if (seen_args.find(arg) == seen_args.end()) { @@ -52,40 +53,51 @@ int main(void) { }; std::vector argv; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); printf("test-arg-parser: test invalid usage\n\n"); + // missing value argv = {"binary_name", "-m"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + // wrong value (int) argv = {"binary_name", "-ngl", "hello"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + // wrong value (enum) argv = {"binary_name", "-sm", "hello"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + // non-existence arg in specific example (--draft cannot be used outside llama-speculative) + argv = {"binary_name", "--draft", "123"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER)); printf("test-arg-parser: test valid usage\n\n"); argv = {"binary_name", "-m", "model_file.gguf"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "model_file.gguf"); argv = {"binary_name", "-t", "1234"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.cpuparams.n_threads == 1234); argv = {"binary_name", "--verbose"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.verbosity == 1); argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "abc.gguf"); assert(params.n_predict == 6789); assert(params.n_batch == 9090); + // --draft cannot be used outside llama-speculative + argv = {"binary_name", "--draft", "123"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); + assert(params.n_draft == 123); + // skip this part on windows, because setenv is not supported #ifdef _WIN32 printf("test-arg-parser: skip on windows build\n"); @@ -94,12 +106,12 @@ int main(void) { setenv("LLAMA_ARG_THREADS", "blah", true); argv = {"binary_name"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); setenv("LLAMA_ARG_MODEL", "blah.gguf", true); setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); @@ -109,7 +121,7 @@ int main(void) { setenv("LLAMA_ARG_MODEL", "blah.gguf", true); setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name", "-m", "overwritten.gguf"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "overwritten.gguf"); assert(params.cpuparams.n_threads == 1010); #endif // _WIN32