diff --git a/common/arg.cpp b/common/arg.cpp index a091c888d..024270342 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -120,32 +120,6 @@ std::string common_arg::to_string() { // utils // -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - static void common_params_handle_model_default(common_params & params) { if (!params.hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model @@ -200,7 +174,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context continue; } } catch (std::exception & e) { - throw std::invalid_argument(format( + throw std::invalid_argument(string_format( "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); } } @@ -221,7 +195,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context std::replace(arg.begin(), arg.end(), '_', '-'); } if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); + throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } auto opt = *arg_to_options[arg]; if (opt.has_value_from_env()) { @@ -253,7 +227,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context continue; } } catch (std::exception & e) { - throw std::invalid_argument(format( + throw std::invalid_argument(string_format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); @@ -392,28 +366,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--verbose-prompt"}, - format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), [](common_params & params) { params.verbose_prompt = true; } )); add_opt(common_arg( {"--no-display-prompt"}, - format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), [](common_params & params) { params.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-co", "--color"}, - format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), [](common_params & params) { params.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-t", "--threads"}, "N", - format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), [](common_params & params, int value) { params.cpuparams.n_threads = value; if (params.cpuparams.n_threads <= 0) { @@ -473,14 +447,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--cpu-strict"}, "<0|1>", - format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), [](common_params & params, const std::string & value) { params.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(common_arg( {"--prio"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); @@ -490,7 +464,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--poll"}, "<0...100>", - format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), [](common_params & params, const std::string & value) { params.cpuparams.poll = std::stoul(value); } @@ -524,7 +498,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--prio-batch"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); @@ -568,7 +542,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); @@ -612,7 +586,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-batch-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); @@ -629,14 +603,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--draft"}, "N", - format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), [](common_params & params, int value) { params.n_draft = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-ps", "--p-split"}, "N", - format("speculative decoding split probability (default: %.1f)", (double)params.p_split), + string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split), [](common_params & params, const std::string & value) { params.p_split = std::stof(value); } @@ -657,56 +631,56 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-c", "--ctx-size"}, "N", - format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), [](common_params & params, int value) { params.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", - format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), [](common_params & params, int value) { params.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(common_arg( {"-b", "--batch-size"}, "N", - format("logical maximum batch size (default: %d)", params.n_batch), + string_format("logical maximum batch size (default: %d)", params.n_batch), [](common_params & params, int value) { params.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(common_arg( {"-ub", "--ubatch-size"}, "N", - format("physical maximum batch size (default: %d)", params.n_ubatch), + string_format("physical maximum batch size (default: %d)", params.n_ubatch), [](common_params & params, int value) { params.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(common_arg( {"--keep"}, "N", - format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), + string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), [](common_params & params, int value) { params.n_keep = value; } )); add_opt(common_arg( {"--no-context-shift"}, - format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), + string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), [](common_params & params) { params.ctx_shift = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", - format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), [](common_params & params, int value) { params.n_chunks = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"-fa", "--flash-attn"}, - format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), [](common_params & params) { params.flash_attn = true; } @@ -722,7 +696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--no-perf"}, - format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), [](common_params & params) { params.no_perf = true; params.sparams.no_perf = true; @@ -734,7 +708,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } // store the external file name in params params.prompt_file = value; @@ -750,7 +724,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } params.in_files.push_back(value); } @@ -761,7 +735,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } // store the external file name in params params.prompt_file = value; @@ -773,7 +747,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"-e", "--escape"}, - format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), + string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), [](common_params & params) { params.escape = true; } @@ -787,7 +761,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"-ptc", "--print-token-count"}, "N", - format("print token count every N tokens (default: %d)", params.n_print), + string_format("print token count every N tokens (default: %d)", params.n_print), [](common_params & params, int value) { params.n_print = value; } @@ -822,14 +796,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-sp", "--special"}, - format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), [](common_params & params) { params.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-cnv", "--conversation"}, - format( + string_format( "run in conversation mode:\n" "- does not print special tokens and suffix/prefix\n" "- interactive mode is also enabled\n" @@ -842,14 +816,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-i", "--interactive"}, - format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), [](common_params & params) { params.interactive = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-if", "--interactive-first"}, - format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), + string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"), [](common_params & params) { params.interactive_first = true; } @@ -894,7 +868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--spm-infill"}, - format( + string_format( "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" ), @@ -904,7 +878,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--samplers"}, "SAMPLERS", - format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), [](common_params & params, const std::string & value) { const auto sampler_names = string_split(value, ';'); params.sparams.samplers = common_sampler_types_from_names(sampler_names, true); @@ -912,14 +886,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", - format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), + string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED), [](common_params & params, const std::string & value) { params.sparams.seed = std::stoul(value); } ).set_sparam()); add_opt(common_arg( {"--sampling-seq"}, "SEQUENCE", - format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { params.sparams.samplers = common_sampler_types_from_chars(value); } @@ -933,14 +907,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--penalize-nl"}, - format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), + string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"), [](common_params & params) { params.sparams.penalize_nl = true; } ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", - format("temperature (default: %.1f)", (double)params.sparams.temp), + string_format("temperature (default: %.1f)", (double)params.sparams.temp), [](common_params & params, const std::string & value) { params.sparams.temp = std::stof(value); params.sparams.temp = std::max(params.sparams.temp, 0.0f); @@ -948,42 +922,56 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--top-k"}, "N", - format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), [](common_params & params, int value) { params.sparams.top_k = value; } ).set_sparam()); add_opt(common_arg( {"--top-p"}, "N", - format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), [](common_params & params, const std::string & value) { params.sparams.top_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", - format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), [](common_params & params, const std::string & value) { params.sparams.min_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--tfs"}, "N", - format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + string_format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), [](common_params & params, const std::string & value) { params.sparams.tfs_z = std::stof(value); } ).set_sparam()); + add_opt(common_arg( + {"--xtc-probability"}, "N", + string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), + [](common_params & params, const std::string & value) { + params.sparams.xtc_probability = std::stof(value); + } + ).set_sparam()); + add_opt(common_arg( + {"--xtc-threshold"}, "N", + string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), + [](common_params & params, const std::string & value) { + params.sparams.xtc_threshold = std::stof(value); + } + ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", - format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), [](common_params & params, const std::string & value) { params.sparams.typ_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--repeat-last-n"}, "N", - format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), [](common_params & params, int value) { params.sparams.penalty_last_n = value; params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); @@ -991,42 +979,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--repeat-penalty"}, "N", - format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), [](common_params & params, const std::string & value) { params.sparams.penalty_repeat = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", - format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), [](common_params & params, const std::string & value) { params.sparams.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", - format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), [](common_params & params, const std::string & value) { params.sparams.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", - format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), [](common_params & params, const std::string & value) { params.sparams.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-exp"}, "N", - format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), [](common_params & params, const std::string & value) { params.sparams.dynatemp_exponent = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat"}, "N", - format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + string_format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat), [](common_params & params, int value) { params.sparams.mirostat = value; @@ -1034,14 +1022,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--mirostat-lr"}, "N", - format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), + string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), [](common_params & params, const std::string & value) { params.sparams.mirostat_eta = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat-ent"}, "N", - format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), + string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), [](common_params & params, const std::string & value) { params.sparams.mirostat_tau = std::stof(value); } @@ -1070,7 +1058,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--grammar"}, "GRAMMAR", - format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), + string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), [](common_params & params, const std::string & value) { params.sparams.grammar = value; } @@ -1081,7 +1069,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { std::ifstream file(value); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::copy( std::istreambuf_iterator(file), @@ -1151,53 +1139,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); add_opt(common_arg( {"--yarn-orig-ctx"}, "N", - format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), + string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), [](common_params & params, int value) { params.yarn_orig_ctx = value; } ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); add_opt(common_arg( {"--yarn-ext-factor"}, "N", - format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), [](common_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); add_opt(common_arg( {"--yarn-attn-factor"}, "N", - format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), [](common_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); add_opt(common_arg( {"--yarn-beta-slow"}, "N", - format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), [](common_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); add_opt(common_arg( {"--yarn-beta-fast"}, "N", - format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), [](common_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_FAST")); add_opt(common_arg( {"-gan", "--grp-attn-n"}, "N", - format("group-attention factor (default: %d)", params.grp_attn_n), + string_format("group-attention factor (default: %d)", params.grp_attn_n), [](common_params & params, int value) { params.grp_attn_n = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_N")); + ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", - format("group-attention width (default: %.1f)", (double)params.grp_attn_w), + string_format("group-attention width (default: %d)", params.grp_attn_w), [](common_params & params, int value) { params.grp_attn_w = value; } - ).set_env("LLAMA_ARG_GRP_ATTN_W")); + ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", @@ -1214,7 +1202,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( {"-ctk", "--cache-type-k"}, "TYPE", - format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_k = value; @@ -1222,7 +1210,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_CACHE_TYPE_K")); add_opt(common_arg( {"-ctv", "--cache-type-v"}, "TYPE", - format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), [](common_params & params, const std::string & value) { // TODO: get the type right here params.cache_type_v = value; @@ -1230,7 +1218,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_CACHE_TYPE_V")); add_opt(common_arg( {"--perplexity", "--all-logits"}, - format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), [](common_params & params) { params.logits_all = true; } @@ -1244,7 +1232,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag-tasks"}, "N", - format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), [](common_params & params, int value) { params.hellaswag_tasks = value; } @@ -1258,7 +1246,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande-tasks"}, "N", - format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), [](common_params & params, int value) { params.winogrande_tasks = value; } @@ -1272,7 +1260,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice-tasks"}, "N", - format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), [](common_params & params, int value) { params.multiple_choice_tasks = value; } @@ -1293,42 +1281,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-stride"}, "N", - format("stride for perplexity calculation (default: %d)", params.ppl_stride), + string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), [](common_params & params, int value) { params.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-output-type"}, "<0|1>", - format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), [](common_params & params, int value) { params.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", - format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), [](common_params & params, const std::string & value) { params.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(common_arg( {"-np", "--parallel"}, "N", - format("number of parallel sequences to decode (default: %d)", params.n_parallel), + string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), [](common_params & params, int value) { params.n_parallel = value; } ).set_env("LLAMA_ARG_N_PARALLEL")); add_opt(common_arg( {"-ns", "--sequences"}, "N", - format("number of sequences to decode (default: %d)", params.n_sequences), + string_format("number of sequences to decode (default: %d)", params.n_sequences), [](common_params & params, int value) { params.n_sequences = value; } ).set_examples({LLAMA_EXAMPLE_PARALLEL})); add_opt(common_arg( {"-cb", "--cont-batching"}, - format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), [](common_params & params) { params.cont_batching = true; } @@ -1452,7 +1440,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::vector split_arg{ it, {} }; if (split_arg.size() >= llama_max_devices()) { throw std::invalid_argument( - format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) + string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) ); } for (size_t i = 0; i < llama_max_devices(); ++i) { @@ -1469,7 +1457,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_TENSOR_SPLIT")); add_opt(common_arg( {"-mg", "--main-gpu"}, "INDEX", - format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), + string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), [](common_params & params, int value) { params.main_gpu = value; if (!llama_supports_gpu_offload()) { @@ -1479,7 +1467,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_MAIN_GPU")); add_opt(common_arg( {"--check-tensors"}, - format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), + string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), [](common_params & params) { params.check_tensors = true; } @@ -1490,7 +1478,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", [](common_params & params, const std::string & value) { if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); } } )); @@ -1544,7 +1532,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA ? std::string("model path from which to load base model") - : format( + : string_format( "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), @@ -1593,42 +1581,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } params.context_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-size"}, "N", - format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), [](common_params & params, int value) { params.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-separator"}, "STRING", - format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), [](common_params & params, const std::string & value) { params.chunk_separator = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--junk"}, "N", - format("number of times to repeat the junk text (default: %d)", params.n_junk), + string_format("number of times to repeat the junk text (default: %d)", params.n_junk), [](common_params & params, int value) { params.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"--pos"}, "N", - format("position of the passkey in the junk text (default: %d)", params.i_pos), + string_format("position of the passkey in the junk text (default: %d)", params.i_pos), [](common_params & params, int value) { params.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-o", "--output", "--output-file"}, "FNAME", - format("output file (default: '%s')", + string_format("output file (default: '%s')", ex == LLAMA_EXAMPLE_EXPORT_LORA ? params.lora_outfile.c_str() : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR @@ -1642,42 +1630,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", - format("output the imatrix every N iterations (default: %d)", params.n_out_freq), + string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), [](common_params & params, int value) { params.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--save-frequency"}, "N", - format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), + string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), [](common_params & params, int value) { params.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--process-output"}, - format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), + string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), [](common_params & params) { params.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--no-ppl"}, - format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), [](common_params & params) { params.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--chunk", "--from-chunk"}, "N", - format("start processing the input from chunk N (default: %d)", params.i_chunk), + string_format("start processing the input from chunk N (default: %d)", params.i_chunk), [](common_params & params, int value) { params.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-pps"}, - format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), + string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), [](common_params & params) { params.is_pp_shared = true; } @@ -1708,7 +1696,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"--embd-normalize"}, "N", - format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), + string_format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), [](common_params & params, int value) { params.embd_normalize = value; } @@ -1729,35 +1717,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--host"}, "HOST", - format("ip address to listen (default: %s)", params.hostname.c_str()), + string_format("ip address to listen (default: %s)", params.hostname.c_str()), [](common_params & params, const std::string & value) { params.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(common_arg( {"--port"}, "PORT", - format("port to listen (default: %d)", params.port), + string_format("port to listen (default: %d)", params.port), [](common_params & params, int value) { params.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(common_arg( {"--path"}, "PATH", - format("path to serve static files from (default: %s)", params.public_path.c_str()), + string_format("path to serve static files from (default: %s)", params.public_path.c_str()), [](common_params & params, const std::string & value) { params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); add_opt(common_arg( {"--embedding", "--embeddings"}, - format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), + string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), [](common_params & params) { params.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(common_arg( {"--reranking", "--rerank"}, - format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), + string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), [](common_params & params) { params.reranking = true; } @@ -1775,7 +1763,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { std::ifstream key_file(value); if (!key_file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } std::string key; while (std::getline(key_file, key)) { @@ -1802,7 +1790,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); add_opt(common_arg( {"-to", "--timeout"}, "N", - format("server read/write timeout in seconds (default: %d)", params.timeout_read), + string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), [](common_params & params, int value) { params.timeout_read = value; params.timeout_write = value; @@ -1810,45 +1798,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); add_opt(common_arg( {"--threads-http"}, "N", - format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), + string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), [](common_params & params, int value) { params.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(common_arg( - {"-spf", "--system-prompt-file"}, "FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [](common_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; + {"--cache-reuse"}, "N", + string_format("min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), + [](common_params & params, int value) { + params.n_cache_reuse = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); add_opt(common_arg( {"--metrics"}, - format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), [](common_params & params) { params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(common_arg( {"--slots"}, - format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), [](common_params & params) { params.endpoint_slots = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); add_opt(common_arg( {"--props"}, - format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), + string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), [](common_params & params) { params.endpoint_props = true; } @@ -1878,7 +1856,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", [](common_params & params, const std::string & value) { if (!common_chat_verify_template(value)) { - throw std::runtime_error(format( + throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s\n" "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", value.c_str() @@ -1889,14 +1867,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", - format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), [](common_params & params, const std::string & value) { params.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--lora-init-without-apply"}, - format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), + string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), [](common_params & params) { params.lora_init_without_apply = true; } @@ -1921,28 +1899,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--positive-file"}, "FNAME", - format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), [](common_params & params, const std::string & value) { params.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--negative-file"}, "FNAME", - format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), [](common_params & params, const std::string & value) { params.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-batch"}, "N", - format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), [](common_params & params, int value) { params.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-iter"}, "N", - format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), [](common_params & params, int value) { params.n_pca_iterations = value; } diff --git a/common/common.cpp b/common/common.cpp index a33edf1dd..1a9dcd368 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -25,10 +26,10 @@ #include #include #include +#include #include #include #include -#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -402,6 +403,21 @@ std::string common_params_get_system_info(const common_params & params) { // String utils // +std::string string_format(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + std::vector string_split(std::string input, char separator) { std::vector parts; size_t separator_pos = input.find(separator); @@ -2090,6 +2106,8 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); + fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability); + fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); diff --git a/common/common.h b/common/common.h index 2a26f4031..f21e349c5 100644 --- a/common/common.h +++ b/common/common.h @@ -86,6 +86,8 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_TFS_Z = 4, COMMON_SAMPLER_TYPE_TYPICAL_P = 5, COMMON_SAMPLER_TYPE_TEMPERATURE = 6, + COMMON_SAMPLER_TYPE_XTC = 7, + COMMON_SAMPLER_TYPE_INFILL = 8, }; // dimensionality reduction methods, used by cvector-generator @@ -104,6 +106,8 @@ struct common_sampler_params { int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities @@ -120,13 +124,15 @@ struct common_sampler_params { bool ignore_eos = false; bool no_perf = false; // disable performance metrics + std::vector samplers = { COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TFS_Z, COMMON_SAMPLER_TYPE_TYPICAL_P, COMMON_SAMPLER_TYPE_TOP_P, COMMON_SAMPLER_TYPE_MIN_P, - COMMON_SAMPLER_TYPE_TEMPERATURE + COMMON_SAMPLER_TYPE_XTC, + COMMON_SAMPLER_TYPE_TEMPERATURE, }; std::string grammar; // optional BNF-like grammar to constrain sampling @@ -273,12 +279,12 @@ struct common_params { int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds - int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) + int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) + int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting std::string hostname = "127.0.0.1"; std::string public_path = ""; // NOLINT std::string chat_template = ""; // NOLINT - std::string system_prompt = ""; // NOLINT bool enable_chat_template = true; std::vector api_keys; @@ -348,15 +354,28 @@ void common_init(); std::string common_params_get_system_info(const common_params & params); -bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); -bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); -void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); +bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]); +bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]); +void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr); bool set_process_priority(enum ggml_sched_priority prio); // // String utils // +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +std::string string_format(const char * fmt, ...); + std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 881eb49e3..dadc18c8b 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -611,7 +611,7 @@ private: } return join_seq(); }; - return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space"); + return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space"); } /* diff --git a/common/sampling.cpp b/common/sampling.cpp index cd49ade69..56cd0df6b 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -130,10 +130,10 @@ std::string common_sampler_params::print() const { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - top_k, tfs_z, top_p, min_p, typ_p, temp, + top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); @@ -184,6 +184,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_MIN_P: llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); break; + case COMMON_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + break; case COMMON_SAMPLER_TYPE_TFS_Z: llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); break; @@ -193,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_TEMPERATURE: llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; + case COMMON_SAMPLER_TYPE_INFILL: + llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); + break; default: GGML_ASSERT(false && "unknown sampler type"); } @@ -372,6 +378,8 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TOP_P: return 'p'; case COMMON_SAMPLER_TYPE_MIN_P: return 'm'; case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; + case COMMON_SAMPLER_TYPE_XTC: return 'x'; + case COMMON_SAMPLER_TYPE_INFILL: return 'i'; default : return '?'; } } @@ -384,6 +392,8 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TOP_P: return "top_p"; case COMMON_SAMPLER_TYPE_MIN_P: return "min_p"; case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; + case COMMON_SAMPLER_TYPE_XTC: return "xtc"; + case COMMON_SAMPLER_TYPE_INFILL: return "infill"; default : return ""; } } @@ -396,6 +406,8 @@ std::vector common_sampler_types_from_names(const std::vect { "min_p", COMMON_SAMPLER_TYPE_MIN_P }, { "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z }, { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, + { "xtc", COMMON_SAMPLER_TYPE_XTC }, + { "infill", COMMON_SAMPLER_TYPE_INFILL }, }; // since samplers names are written multiple ways @@ -441,7 +453,9 @@ std::vector common_sampler_types_from_chars(const std::stri { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P }, - { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE } + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, }; std::vector samplers; diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index a8779bf3b..fc9f0097f 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -540,7 +540,7 @@ class SchemaConverter: return self._add_rule( name, to_rule(transform()) if self._raw_pattern \ - else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space") + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") def _resolve_ref(self, ref): diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 8558c6bdc..2c96973c8 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -432,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); if (!image_embed_result) { clip_image_u8_free(img); - LOG_ERR("%s: coulnd't embed the image\n", __func__); + LOG_ERR("%s: couldn't embed the image\n", __func__); return NULL; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 1bfbcdc99..a4fee0065 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -570,30 +570,30 @@ int main(int argc, char ** argv) { if (!params.ctx_shift){ LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); break; - } else { - if (params.n_predict == -2) { - LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); - break; - } - - const int n_left = n_past - params.n_keep; - const int n_discard = n_left/2; - - LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG_DBG("after swap: n_past = %d\n", n_past); - - LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - - LOG_DBG("clear session path\n"); - path_session.clear(); } + + if (params.n_predict == -2) { + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); } } else { // context extension via Self-Extend diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index c87dd8f1e..ad4183cd9 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -43,6 +43,8 @@ top_k: 0, // <= 0 to use vocab size top_p: 1.0, // 1.0 = disabled min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -836,6 +838,8 @@ return html` ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} + ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })} @@ -1132,6 +1136,8 @@ document.addEventListener('DOMContentLoaded', (event) => { const snapSettings = { temperature: { snapValue: 1.0, snapRangeMultiplier: 6 }, min_p: { snapValue: 0.05, snapRangeMultiplier: 2 }, + xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 }, + xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 }, top_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 }, typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 07fec6a38..88065705f 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -307,6 +307,8 @@ top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -1013,6 +1015,8 @@ ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} + ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })}
diff --git a/examples/server/public/index.js b/examples/server/public/index.js index fe615ca25..32ec6e9e1 100644 --- a/examples/server/public/index.js +++ b/examples/server/public/index.js @@ -1 +1 @@ -const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function f(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function s(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}s.prototype.brand=t;s.prototype.h=function(){return!0};s.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};s.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};s.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};s.prototype.valueOf=function(){return this.value};s.prototype.toString=function(){return this.value+""};s.prototype.toJSON=function(){return this.value};s.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(s.prototype,"value",{get(){const t=f(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new s(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){s.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new s).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}s.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){s.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=f(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},F=[],A=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){var n=t.parentNode;n&&n.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,f,s){var c,h,a,p,d,v=_&&_.__k||F,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i)?(i.__=t,i.__b=t.__b+1,u=Z(i,e,r,s),i.__i=u,o=null,-1!==u&&(s--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:u>r?s>l-r?c+=u-r:c--:u(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=[],(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.some((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.push(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.splice(e.indexOf(t),1),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=F.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&>.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function Ft(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function At(t,n){gt.useDebugValue&>.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&>.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(s.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof s){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][f+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,s as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,st as hydrate,C as isValidElement,S as options,ft as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,Ft as useContext,At as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState}; +const t=Symbol.for("preact-signals");function n(){if(r>1){r--;return}let t,n=!1;while(void 0!==i){let _=i;i=void 0;u++;while(void 0!==_){const i=_.o;_.o=void 0;_.f&=-3;if(!(8&_.f)&&h(_))try{_.c()}catch(e){if(!n){t=e;n=!0}}_=i}}u=0;r--;if(n)throw t}function e(t){if(r>0)return t();r++;try{return t()}finally{n()}}let _,i;function o(t){const n=_;_=void 0;try{return t()}finally{_=n}}let r=0,u=0,l=0;function s(t){if(void 0===_)return;let n=t.n;if(void 0===n||n.t!==_){n={i:0,S:t,p:_.s,n:void 0,t:_,e:void 0,x:void 0,r:n};if(void 0!==_.s)_.s.n=n;_.s=n;t.n=n;if(32&_.f)t.S(n);return n}else if(-1===n.i){n.i=0;if(void 0!==n.n){n.n.p=n.p;if(void 0!==n.p)n.p.n=n.n;n.p=_.s;n.n=void 0;_.s.n=n;_.s=n}return n}}function f(t){this.v=t;this.i=0;this.n=void 0;this.t=void 0}f.prototype.brand=t;f.prototype.h=function(){return!0};f.prototype.S=function(t){if(this.t!==t&&void 0===t.e){t.x=this.t;if(void 0!==this.t)this.t.e=t;this.t=t}};f.prototype.U=function(t){if(void 0!==this.t){const n=t.e,e=t.x;if(void 0!==n){n.x=e;t.e=void 0}if(void 0!==e){e.e=n;t.x=void 0}if(t===this.t)this.t=e}};f.prototype.subscribe=function(t){return k(()=>{const n=this.value,e=_;_=void 0;try{t(n)}finally{_=e}})};f.prototype.valueOf=function(){return this.value};f.prototype.toString=function(){return this.value+""};f.prototype.toJSON=function(){return this.value};f.prototype.peek=function(){const t=_;_=void 0;try{return this.value}finally{_=t}};Object.defineProperty(f.prototype,"value",{get(){const t=s(this);if(void 0!==t)t.i=this.i;return this.v},set(t){if(t!==this.v){if(u>100)throw new Error("Cycle detected");this.v=t;this.i++;l++;r++;try{for(let t=this.t;void 0!==t;t=t.x)t.t.N()}finally{n()}}}});function c(t){return new f(t)}function h(t){for(let n=t.s;void 0!==n;n=n.n)if(n.S.i!==n.i||!n.S.h()||n.S.i!==n.i)return!0;return!1}function a(t){for(let n=t.s;void 0!==n;n=n.n){const e=n.S.n;if(void 0!==e)n.r=e;n.S.n=n;n.i=-1;if(void 0===n.n){t.s=n;break}}}function p(t){let n,e=t.s;while(void 0!==e){const t=e.p;if(-1===e.i){e.S.U(e);if(void 0!==t)t.n=e.n;if(void 0!==e.n)e.n.p=t}else n=e;e.S.n=e.r;if(void 0!==e.r)e.r=void 0;e=t}t.s=n}function d(t){f.call(this,void 0);this.x=t;this.s=void 0;this.g=l-1;this.f=4}(d.prototype=new f).h=function(){this.f&=-3;if(1&this.f)return!1;if(32==(36&this.f))return!0;this.f&=-5;if(this.g===l)return!0;this.g=l;this.f|=1;if(this.i>0&&!h(this)){this.f&=-2;return!0}const t=_;try{a(this);_=this;const t=this.x();if(16&this.f||this.v!==t||0===this.i){this.v=t;this.f&=-17;this.i++}}catch(t){this.v=t;this.f|=16;this.i++}_=t;p(this);this.f&=-2;return!0};d.prototype.S=function(t){if(void 0===this.t){this.f|=36;for(let t=this.s;void 0!==t;t=t.n)t.S.S(t)}f.prototype.S.call(this,t)};d.prototype.U=function(t){if(void 0!==this.t){f.prototype.U.call(this,t);if(void 0===this.t){this.f&=-33;for(let t=this.s;void 0!==t;t=t.n)t.S.U(t)}}};d.prototype.N=function(){if(!(2&this.f)){this.f|=6;for(let t=this.t;void 0!==t;t=t.x)t.t.N()}};Object.defineProperty(d.prototype,"value",{get(){if(1&this.f)throw new Error("Cycle detected");const t=s(this);this.h();if(void 0!==t)t.i=this.i;if(16&this.f)throw this.v;return this.v}});function v(t){return new d(t)}function y(t){const e=t.u;t.u=void 0;if("function"==typeof e){r++;const i=_;_=void 0;try{e()}catch(n){t.f&=-2;t.f|=8;m(t);throw n}finally{_=i;n()}}}function m(t){for(let n=t.s;void 0!==n;n=n.n)n.S.U(n);t.x=void 0;t.s=void 0;y(t)}function g(t){if(_!==this)throw new Error("Out-of-order effect");p(this);_=t;this.f&=-2;if(8&this.f)m(this);n()}function b(t){this.x=t;this.u=void 0;this.s=void 0;this.o=void 0;this.f=32}b.prototype.c=function(){const t=this.S();try{if(8&this.f)return;if(void 0===this.x)return;const n=this.x();if("function"==typeof n)this.u=n}finally{t()}};b.prototype.S=function(){if(1&this.f)throw new Error("Cycle detected");this.f|=1;this.f&=-9;y(this);a(this);r++;const t=_;_=this;return g.bind(this,t)};b.prototype.N=function(){if(!(2&this.f)){this.f|=2;this.o=i;i=this}};b.prototype.d=function(){this.f|=8;if(!(1&this.f))m(this)};function k(t){const n=new b(t);try{n.c()}catch(t){n.d();throw t}return n.d.bind(n)}var w,S,x,C,U,E,H,P,N,$,T,D,M={},A=[],F=/acit|ex(?:s|g|n|p|$)|rph|grid|ows|mnc|ntw|ine[ch]|zoo|^ord|itera/i,W=Array.isArray;function L(t,n){for(var e in n)t[e]=n[e];return t}function O(t){t&&t.parentNode&&t.parentNode.removeChild(t)}function R(t,n,e){var _,i,o,r={};for(o in n)"key"==o?_=n[o]:"ref"==o?i=n[o]:r[o]=n[o];if(arguments.length>2&&(r.children=arguments.length>3?w.call(arguments,2):e),"function"==typeof t&&null!=t.defaultProps)for(o in t.defaultProps)void 0===r[o]&&(r[o]=t.defaultProps[o]);return I(t,r,_,i,null)}function I(t,n,e,_,i){var o={type:t,props:n,key:e,ref:_,__k:null,__:null,__b:0,__e:null,__d:void 0,__c:null,constructor:void 0,__v:null==i?++x:i,__i:-1,__u:0};return null==i&&null!=S.vnode&&S.vnode(o),o}function V(){return{current:null}}function j(t){return t.children}function q(t,n){this.props=t,this.context=n}function B(t,n){if(null==n)return t.__?B(t.__,t.__i+1):null;for(var e;nn&&U.sort(P));J.__r=0}function K(t,n,e,_,i,o,r,u,l,s,f){var c,h,a,p,d,v=_&&_.__k||A,y=n.length;for(e.__d=l,Q(e,n,v),l=e.__d,c=0;c0?I(i.type,i.props,i.key,i.ref?i.ref:null,i.__v):i).__=t,i.__b=t.__b+1,o=null,-1!==(u=i.__i=Z(i,e,r,f))&&(f--,(o=e[u])&&(o.__u|=131072)),null==o||null===o.__v?(-1==u&&c--,"function"!=typeof i.type&&(i.__u|=65536)):u!==r&&(u==r-1?c--:u==r+1?c++:(u>r?c--:c++,i.__u|=65536))):i=t.__k[_]=null;if(f)for(_=0;_(null!=l&&0==(131072&l.__u)?1:0))for(;r>=0||u=0){if((l=n[r])&&0==(131072&l.__u)&&i==l.key&&o===l.type)return r;r--}if(u2&&(u.children=arguments.length>3?w.call(arguments,2):e),I(t.type,u,_||t.key,i||t.ref,null)}function ht(t,n){var e={__c:n="__cC"+D++,__:t,Consumer:function(t,n){return t.children(n)},Provider:function(t){var e,_;return this.getChildContext||(e=new Set,(_={})[n]=this,this.getChildContext=function(){return _},this.componentWillUnmount=function(){e=null},this.shouldComponentUpdate=function(t){this.props.value!==t.value&&e.forEach((function(t){t.__e=!0,G(t)}))},this.sub=function(t){e.add(t);var n=t.componentWillUnmount;t.componentWillUnmount=function(){e&&e.delete(t),n&&n.call(t)}}),t.children}};return e.Provider.__=e.Consumer.contextType=e}w=A.slice,S={__e:function(t,n,e,_){for(var i,o,r;n=n.__;)if((i=n.__c)&&!i.__)try{if((o=i.constructor)&&null!=o.getDerivedStateFromError&&(i.setState(o.getDerivedStateFromError(t)),r=i.__d),null!=i.componentDidCatch&&(i.componentDidCatch(t,_||{}),r=i.__d),r)return i.__E=i}catch(n){t=n}throw t}},x=0,C=function(t){return null!=t&&null==t.constructor},q.prototype.setState=function(t,n){var e;e=null!=this.__s&&this.__s!==this.state?this.__s:this.__s=L({},this.state),"function"==typeof t&&(t=t(L({},e),this.props)),t&&L(e,t),null!=t&&this.__v&&(n&&this._sb.push(n),G(this))},q.prototype.forceUpdate=function(t){this.__v&&(this.__e=!0,t&&this.__h.push(t),G(this))},q.prototype.render=j,U=[],H="function"==typeof Promise?Promise.prototype.then.bind(Promise.resolve()):setTimeout,P=function(t,n){return t.__v.__b-n.__v.__b},J.__r=0,N=0,$=et(!1),T=et(!0),D=0;var at,pt,dt,vt,yt=0,mt=[],gt=S,bt=gt.__b,kt=gt.__r,wt=gt.diffed,St=gt.__c,xt=gt.unmount,Ct=gt.__;function Ut(t,n){gt.__h&>.__h(pt,t,yt||n),yt=0;var e=pt.__H||(pt.__H={__:[],__h:[]});return t>=e.__.length&&e.__.push({}),e.__[t]}function Et(t){return yt=1,Ht(Bt,t)}function Ht(t,n,e){var _=Ut(at++,2);if(_.t=t,!_.__c&&(_.__=[e?e(n):Bt(void 0,n),function(t){var n=_.__N?_.__N[0]:_.__[0],e=_.t(n,t);n!==e&&(_.__N=[e,_.__[1]],_.__c.setState({}))}],_.__c=pt,!pt.u)){var i=function(t,n,e){if(!_.__c.__H)return!0;var i=_.__c.__H.__.filter((function(t){return!!t.__c}));if(i.every((function(t){return!t.__N})))return!o||o.call(this,t,n,e);var r=!1;return i.forEach((function(t){if(t.__N){var n=t.__[0];t.__=t.__N,t.__N=void 0,n!==t.__[0]&&(r=!0)}})),!(!r&&_.__c.props===t)&&(!o||o.call(this,t,n,e))};pt.u=!0;var o=pt.shouldComponentUpdate,r=pt.componentWillUpdate;pt.componentWillUpdate=function(t,n,e){if(this.__e){var _=o;o=void 0,i(t,n,e),o=_}r&&r.call(this,t,n,e)},pt.shouldComponentUpdate=i}return _.__N||_.__}function Pt(t,n){var e=Ut(at++,3);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__H.__h.push(e))}function Nt(t,n){var e=Ut(at++,4);!gt.__s&&qt(e.__H,n)&&(e.__=t,e.i=n,pt.__h.push(e))}function $t(t){return yt=5,Dt((function(){return{current:t}}),[])}function Tt(t,n,e){yt=6,Nt((function(){return"function"==typeof t?(t(n()),function(){return t(null)}):t?(t.current=n(),function(){return t.current=null}):void 0}),null==e?e:e.concat(t))}function Dt(t,n){var e=Ut(at++,7);return qt(e.__H,n)&&(e.__=t(),e.__H=n,e.__h=t),e.__}function Mt(t,n){return yt=8,Dt((function(){return t}),n)}function At(t){var n=pt.context[t.__c],e=Ut(at++,9);return e.c=t,n?(null==e.__&&(e.__=!0,n.sub(pt)),n.props.value):t.__}function Ft(t,n){gt.useDebugValue&>.useDebugValue(n?n(t):t)}function Wt(t){var n=Ut(at++,10),e=Et();return n.__=t,pt.componentDidCatch||(pt.componentDidCatch=function(t,_){n.__&&n.__(t,_),e[1](t)}),[e[0],function(){e[1](void 0)}]}function Lt(){var t=Ut(at++,11);if(!t.__){for(var n=pt.__v;null!==n&&!n.__m&&null!==n.__;)n=n.__;var e=n.__m||(n.__m=[0,0]);t.__="P"+e[0]+"-"+e[1]++}return t.__}function Ot(){for(var t;t=mt.shift();)if(t.__P&&t.__H)try{t.__H.__h.forEach(Vt),t.__H.__h.forEach(jt),t.__H.__h=[]}catch(n){t.__H.__h=[],gt.__e(n,t.__v)}}gt.__b=function(t){pt=null,bt&&bt(t)},gt.__=function(t,n){t&&n.__k&&n.__k.__m&&(t.__m=n.__k.__m),Ct&&Ct(t,n)},gt.__r=function(t){kt&&kt(t),at=0;var n=(pt=t.__c).__H;n&&(dt===pt?(n.__h=[],pt.__h=[],n.__.forEach((function(t){t.__N&&(t.__=t.__N),t.i=t.__N=void 0}))):(n.__h.forEach(Vt),n.__h.forEach(jt),n.__h=[],at=0)),dt=pt},gt.diffed=function(t){wt&&wt(t);var n=t.__c;n&&n.__H&&(n.__H.__h.length&&(1!==mt.push(n)&&vt===gt.requestAnimationFrame||((vt=gt.requestAnimationFrame)||It)(Ot)),n.__H.__.forEach((function(t){t.i&&(t.__H=t.i),t.i=void 0}))),dt=pt=null},gt.__c=function(t,n){n.some((function(t){try{t.__h.forEach(Vt),t.__h=t.__h.filter((function(t){return!t.__||jt(t)}))}catch(r){n.some((function(t){t.__h&&(t.__h=[])})),n=[],gt.__e(r,t.__v)}})),St&&St(t,n)},gt.unmount=function(t){xt&&xt(t);var n,e=t.__c;e&&e.__H&&(e.__H.__.forEach((function(t){try{Vt(t)}catch(t){n=t}})),e.__H=void 0,n&>.__e(n,e.__v))};var Rt="function"==typeof requestAnimationFrame;function It(t){var n,e=function(){clearTimeout(_),Rt&&cancelAnimationFrame(n),setTimeout(t)},_=setTimeout(e,100);Rt&&(n=requestAnimationFrame(e))}function Vt(t){var n=pt,e=t.__c;"function"==typeof e&&(t.__c=void 0,e()),pt=n}function jt(t){var n=pt;t.__c=t.__(),pt=n}function qt(t,n){return!t||t.length!==n.length||n.some((function(n,e){return n!==t[e]}))}function Bt(t,n){return"function"==typeof n?n(t):n}function zt(t,n){S[t]=n.bind(null,S[t]||(()=>{}))}let Gt,Jt;function Kt(t){if(Jt)Jt();Jt=t&&t.S()}function Qt({data:t}){const n=Yt(t);n.value=t;const e=Dt(()=>{let t=this.__v;while(t=t.__)if(t.__c){t.__c.__$f|=4;break}this.__$u.c=()=>{var t;if(!C(e.peek())&&3===(null==(t=this.base)?void 0:t.nodeType))this.base.data=e.peek();else{this.__$f|=1;this.setState({})}};return v(()=>{let t=n.value.value;return 0===t?0:!0===t?"":t||""})},[]);return e.value}Qt.displayName="_st";Object.defineProperties(f.prototype,{constructor:{configurable:!0,value:void 0},type:{configurable:!0,value:Qt},props:{configurable:!0,get(){return{data:this}}},__b:{configurable:!0,value:1}});zt("__b",(t,n)=>{if("string"==typeof n.type){let t,e=n.props;for(let _ in e){if("children"===_)continue;let i=e[_];if(i instanceof f){if(!t)n.__np=t={};t[_]=i;e[_]=i.peek()}}}t(n)});zt("__r",(t,n)=>{Kt();let e,_=n.__c;if(_){_.__$f&=-2;e=_.__$u;if(void 0===e)_.__$u=e=function(t){let n;k((function(){n=this}));n.c=()=>{_.__$f|=1;_.setState({})};return n}()}Gt=_;Kt(e);t(n)});zt("__e",(t,n,e,_)=>{Kt();Gt=void 0;t(n,e,_)});zt("diffed",(t,n)=>{Kt();Gt=void 0;let e;if("string"==typeof n.type&&(e=n.__e)){let t=n.__np,_=n.props;if(t){let n=e.U;if(n)for(let e in n){let _=n[e];if(void 0!==_&&!(e in t)){_.d();n[e]=void 0}}else{n={};e.U=n}for(let i in t){let o=n[i],r=t[i];if(void 0===o){o=Xt(e,i,r,_);n[i]=o}else o.o(r,_)}}}t(n)});function Xt(t,n,e,_){const i=n in t&&void 0===t.ownerSVGElement,o=c(e);return{o:(t,n)=>{o.value=t;_=n},d:k(()=>{const e=o.value.value;if(_[n]!==e){_[n]=e;if(i)t[n]=e;else if(e)t.setAttribute(n,e);else t.removeAttribute(n)}})}}zt("unmount",(t,n)=>{if("string"==typeof n.type){let t=n.__e;if(t){const n=t.U;if(n){t.U=void 0;for(let t in n){let e=n[t];if(e)e.d()}}}}else{let t=n.__c;if(t){const n=t.__$u;if(n){t.__$u=void 0;n.d()}}}t(n)});zt("__h",(t,n,e,_)=>{if(_<3||9===_)n.__$f|=2;t(n,e,_)});q.prototype.shouldComponentUpdate=function(t,n){const e=this.__$u;if(!(e&&void 0!==e.s||4&this.__$f))return!0;if(3&this.__$f)return!0;for(let _ in n)return!0;for(let _ in t)if("__source"!==_&&t[_]!==this.props[_])return!0;for(let _ in this.props)if(!(_ in t))return!0;return!1};function Yt(t){return Dt(()=>c(t),[])}function Zt(t){const n=$t(t);n.current=t;Gt.__$f|=4;return Dt(()=>v(()=>n.current()),[])}function tn(t){const n=$t(t);n.current=t;Pt(()=>k(()=>n.current()),[])}var nn=function(t,n,e,_){var i;n[0]=0;for(var o=1;o=5&&((i||!t&&5===_)&&(r.push(_,0,i,e),_=6),t&&(r.push(_,t,0,e),_=6)),i=""},l=0;l"===n?(_=1,i=""):i=n+i[0]:o?n===o?o="":i+=n:'"'===n||"'"===n?o=n:">"===n?(u(),_=1):_&&("="===n?(_=5,e=i,i=""):"/"===n&&(_<5||">"===t[l][s+1])?(u(),3===_&&(r=r[0]),_=r,(r=r[0]).push(2,0,_),_=0):" "===n||"\t"===n||"\n"===n||"\r"===n?(u(),_=2):i+=n),3===_&&"!--"===i&&(_=4,r=r[0])}return u(),r}(t)),n),arguments,[])).length>1?n:n[0]}var on=_n.bind(R);export{q as Component,j as Fragment,f as Signal,e as batch,ct as cloneElement,v as computed,ht as createContext,R as createElement,V as createRef,k as effect,R as h,on as html,ft as hydrate,C as isValidElement,S as options,st as render,c as signal,Y as toChildArray,o as untracked,Mt as useCallback,Zt as useComputed,At as useContext,Ft as useDebugValue,Pt as useEffect,Wt as useErrorBoundary,Lt as useId,Tt as useImperativeHandle,Nt as useLayoutEffect,Dt as useMemo,Ht as useReducer,$t as useRef,Yt as useSignal,tn as useSignalEffect,Et as useState}; diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 7267f3f9c..e67bb15c1 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -529,7 +529,7 @@ export class SchemaConverter { return joinSeq(); }; - return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space") + return this._addRule(name, "\"\\\"\" (" + toRule(transform()) + ") \"\\\"\" space") } _notStrings(strings) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 508c1c065..5e0a236d3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -129,14 +129,14 @@ struct slot_params { bool stream = true; bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit std::vector antiprompt; - - json input_prefix; - json input_suffix; }; struct server_slot { @@ -166,8 +166,13 @@ struct server_slot { json prompt; // can be either a string, array of strings or array of token ids + json input_prefix; + json input_suffix; + json input_extra; + // when a task is submitted, we first tokenize the prompt and store it here std::vector prompt_tokens; + std::vector extra_tokens; std::string generated_text; std::vector cache_tokens; @@ -176,6 +181,7 @@ struct server_slot { server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; bool has_next_token = true; + bool has_new_line = false; bool truncated = false; bool stopped_eos = false; bool stopped_word = false; @@ -194,21 +200,15 @@ struct server_slot { llama_token sampled; - int32_t ga_i = 0; // group-attention state - int32_t ga_n = 1; // group-attention factor - int32_t ga_w = 512; // group-attention width - - int32_t n_past_se = 0; // self-extend - // stats - size_t n_sent_text = 0; // number of sent text character + size_t n_sent_text = 0; // number of sent text character size_t n_sent_token_probs = 0; int64_t t_start_process_prompt; int64_t t_start_generation; double t_prompt_processing; // ms - double t_token_generation; // ms + double t_token_generation; // ms std::function callback_on_release; @@ -217,6 +217,7 @@ struct server_slot { n_prompt_tokens = 0; generated_text = ""; + has_new_line = false; truncated = false; stopped_eos = false; stopped_word = false; @@ -226,8 +227,6 @@ struct server_slot { n_sent_text = 0; n_sent_token_probs = 0; cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL; - ga_i = 0; - n_past_se = 0; generated_token_probs.clear(); } @@ -624,12 +623,6 @@ struct server_context { int32_t n_ctx; // total context for all clients / slots - // system prompt - bool system_need_update = false; - - std::string system_prompt; - std::vector system_tokens; - // slots / clients std::vector slots; json default_generation_settings_for_props; @@ -666,7 +659,7 @@ struct server_context { bool load_model(const common_params & params_) { params = params_; - // dedicate one sequence to the system prompt + // reserve one extra sequence (seq_id == 0) for extra features params.n_parallel += 1; common_init_result llama_init = common_init_from_params(params); @@ -712,22 +705,6 @@ struct server_context { SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); - const int ga_n = params.grp_attn_n; - const int ga_w = params.grp_attn_w; - - if (ga_n != 1) { - GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT - GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT - //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT - //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - - SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w); - } - - slot.ga_i = 0; - slot.ga_n = ga_n; - slot.ga_w = ga_w; - slot.sparams = params.sparams; slot.callback_on_release = [this](int) { @@ -754,12 +731,7 @@ struct server_context { metrics.init(); } - std::vector tokenize(const json & json_prompt, bool add_special) const { - // TODO: currently, we tokenize using special tokens by default - // this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) - // but it's better compared to completely ignoring ChatML and other chat templates - const bool TMP_FORCE_SPECIAL = true; - + std::vector tokenize(const json & json_prompt, bool add_special, bool parse_special) const { // If `add_bos` is true, we only add BOS, when json_prompt is a string, // or the first element of the json_prompt array is a string. std::vector prompt_tokens; @@ -772,10 +744,10 @@ struct server_context { std::vector p; if (first) { - p = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + p = common_tokenize(ctx, s, add_special, parse_special); first = false; } else { - p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); + p = common_tokenize(ctx, s, false, parse_special); } prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); @@ -789,7 +761,7 @@ struct server_context { } } else { auto s = json_prompt.template get(); - prompt_tokens = common_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL); + prompt_tokens = common_tokenize(ctx, s, add_special, parse_special); } return prompt_tokens; @@ -831,7 +803,7 @@ struct server_context { int slot_prompt_len = slot_prompt.size(); // length of the Longest Common Prefix between the current slot's prompt and the input prompt - int lcp_len = common_part(slot_prompt, prompt); + int lcp_len = longest_common_prefix(slot_prompt, prompt); // fraction of the common substring length compared to the current slot's prompt length similarity = static_cast(lcp_len) / slot_prompt_len; @@ -892,6 +864,8 @@ struct server_context { slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); + slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); @@ -910,6 +884,8 @@ struct server_context { slot.sparams.seed = json_value(data, "seed", default_sparams.seed); slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); + //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement + slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms); // process "json_schema" and "grammar" if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { @@ -918,19 +894,14 @@ struct server_context { } if (data.contains("json_schema") && !data.contains("grammar")) { try { - auto schema = json_value(data, "json_schema", json::object()); - slot.sparams.grammar = json_schema_to_grammar(schema); + auto schema = json_value(data, "json_schema", json::object()); + slot.sparams.grammar = json_schema_to_grammar(schema); } catch (const std::exception & e) { send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); return false; } } else { - slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); - } - - if (slot.params.cache_prompt && slot.ga_n != 1) { - slot.params.cache_prompt = false; - SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n"); + slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar); } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { @@ -940,11 +911,29 @@ struct server_context { } // infill - slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); - slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); + slot.input_prefix = json_value(data, "input_prefix", json()); + slot.input_suffix = json_value(data, "input_suffix", json()); + slot.input_extra = json_value(data, "input_extra", json()); + + SLT_DBG(slot, "extra_context chunks: %d\n", (int) slot.input_extra.size()); + for (const auto & chunk : slot.input_extra) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk["text"].is_string()) { + send_error(task, "extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + // filename is optional + if (chunk.contains("filename") && !chunk["filename"].is_string()) { + send_error(task, "extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST); + return false; + } + + SLT_DBG(slot, "extra_context chunk in file '%s':\n%s\n", chunk.value("filename", "").c_str(), chunk.value("text", "").c_str()); + } // get prompt - if (task.cmpl_type != SERVER_TASK_CMPL_TYPE_INFILL) { + { const auto & prompt = data.find("prompt"); if (prompt == data.end()) { send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST); @@ -1067,51 +1056,6 @@ struct server_context { clean_kv_cache = false; } - void system_prompt_update() { - SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str()); - - kv_cache_clear(); - system_tokens.clear(); - - if (!system_prompt.empty()) { - system_tokens = common_tokenize(ctx, system_prompt, true); - - const int32_t n_batch = llama_n_batch(ctx); - const int32_t n_tokens_prompt = system_tokens.size(); - - for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i); - - common_batch_clear(batch); - - for (int32_t j = 0; j < n_tokens; ++j) { - common_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false); - } - - if (llama_decode(ctx, batch) != 0) { - SRV_ERR("%s", "llama_decode() failed\n"); - return; - } - } - - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i <= params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); - } - } - - system_need_update = false; - } - - bool system_prompt_set(const std::string & sys_prompt) { - SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str()); - - system_prompt = sys_prompt; - // update system_tokens and KV cache as soon as all slots are idle - system_need_update = true; - return true; - } - bool process_token(completion_token_output & result, server_slot & slot) { // remember which tokens were sampled - used for repetition penalties during sampling const std::string token_str = common_token_to_piece(ctx, result.tok, params.special); @@ -1147,22 +1091,21 @@ struct server_context { size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); const std::string str_test = slot.generated_text.substr(pos); - bool is_stop_full = false; + bool send_text = true; size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL); if (stop_pos != std::string::npos) { - is_stop_full = true; slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); pos = std::min(slot.n_sent_text, slot.generated_text.size()); - } else { - is_stop_full = false; + } else if (slot.has_next_token) { stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL); + send_text = stop_pos == std::string::npos; } // check if there is any token to predict - if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { + if (send_text) { // no send the stop word in the response result.text_to_send = slot.generated_text.substr(pos, std::string::npos); slot.n_sent_text += result.text_to_send.size(); @@ -1187,13 +1130,28 @@ struct server_context { SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); } + // if we have already seen a new line, we stop after a certain time limit + if (slot.has_new_line && slot.params.t_max_predict_ms > 0 && + (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { + slot.stopped_limit = true; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); + } + + // check if there is a new line in the generated text + if (result.text_to_send.find('\n') != std::string::npos) { + slot.has_new_line = true; + } + // if context shift is disabled, we stop when it reaches the context limit - if (slot.n_decoded >= slot.n_ctx) { + if (slot.n_past >= slot.n_ctx) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; - SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx); + SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n", + slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx); } if (llama_token_is_eog(model, result.tok)) { @@ -1205,18 +1163,18 @@ struct server_context { const auto n_ctx_train = llama_n_ctx_train(model); - if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { + if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; // stop prediction SLT_WRN(slot, - "n_predict (%d) is not set and self-context extend is disabled. " + "n_predict (%d) is set for infinite generation. " "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n", slot.params.n_predict, n_ctx_train); } - SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str()); + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str()); return slot.has_next_token; // continue } @@ -1240,6 +1198,8 @@ struct server_context { {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, + {"xtc_probability", slot.sparams.xtc_probability}, + {"xtc_threshold", slot.sparams.xtc_threshold}, {"tfs_z", slot.sparams.tfs_z}, {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, @@ -1335,6 +1295,7 @@ struct server_context { {"tokens_evaluated", slot.n_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, {"prompt", slot.prompt}, + {"has_new_line", slot.has_new_line}, {"truncated", slot.truncated}, {"stopped_eos", slot.stopped_eos}, {"stopped_word", slot.stopped_word}, @@ -1484,9 +1445,8 @@ struct server_context { if (prompt.is_string() || json_is_array_of_numbers(prompt)) { data["index"] = 0; create_task(data, false, nullptr); - } - // otherwise, it's a multiple-prompt task, we break it into smaller tasks - else if (prompt.is_array()) { + } else if (prompt.is_array()) { + // otherwise, it's a multiple-prompt task, we break it into smaller tasks std::vector prompts = prompt; if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { // prompts[0] is the question @@ -1511,9 +1471,8 @@ struct server_context { } } } - } - // invalid case - else { + } else { + // invalid case throw std::runtime_error(error_msg); } @@ -1663,6 +1622,7 @@ struct server_context { slot_data["prompt"] = slot.prompt; slot_data["next_token"] = { {"has_next_token", slot.has_next_token}, + {"has_new_line", slot.has_new_line}, {"n_remain", slot.n_remaining}, {"n_decoded", slot.n_decoded}, {"stopped_eos", slot.stopped_eos}, @@ -1786,6 +1746,9 @@ struct server_context { } slot->cache_tokens.resize(token_count); + // TODO: maybe detokenize the slot->cache_tokens instead? + slot->prompt = string_format("[restored %d tokens from file]", (int) token_count); + const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; @@ -1860,12 +1823,8 @@ struct server_context { } if (all_idle) { - if (system_need_update) { - system_prompt_update(); - } - SRV_INF("%s", "all slots are idle\n"); - if (system_prompt.empty() && clean_kv_cache) { + if (clean_kv_cache) { kv_cache_clear(); } @@ -1886,38 +1845,36 @@ struct server_context { // apply context-shift if needed // TODO: simplify and improve for (server_slot & slot : slots) { - if (slot.ga_n == 1) { - if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) { - if (!params.ctx_shift) { - // this check is redundant (for good) - // we should never get here, because generation should already stopped in process_token() - slot.release(); - send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); - continue; - } - - // Shift context - const int n_keep = slot.params.n_keep + add_bos_token; - const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; - const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2); - - SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - - llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); - - if (slot.params.cache_prompt) { - for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; - } - - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - } - - slot.n_past -= n_discard; - - slot.truncated = true; + if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) { + if (!params.ctx_shift) { + // this check is redundant (for good) + // we should never get here, because generation should already stopped in process_token() + slot.release(); + send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); + continue; } + + // Shift context + const int n_keep = slot.params.n_keep + add_bos_token; + const int n_left = slot.n_past - n_keep; + const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2); + + SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); + + llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard); + + if (slot.params.cache_prompt) { + for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { + slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; + } + + slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); + } + + slot.n_past -= n_discard; + + slot.truncated = true; } } @@ -1932,11 +1889,7 @@ struct server_context { slot.i_batch = batch.n_tokens; - const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - - // TODO: we always have to take into account the "system_tokens" - // this is not great and needs to be improved somehow - common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true); + common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true); slot.n_past += 1; @@ -1944,8 +1897,8 @@ struct server_context { slot.cache_tokens.push_back(slot.sampled); } - SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", - slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated); + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated); } // process in chunks of params.n_batch @@ -1972,63 +1925,126 @@ struct server_context { slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; - if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) { - const bool add_bos = llama_add_bos_token(model); - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } + switch (slot.cmpl_type) { + case SERVER_TASK_CMPL_TYPE_NORMAL: + case SERVER_TASK_CMPL_TYPE_EMBEDDING: + { + prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true); + } break; + case SERVER_TASK_CMPL_TYPE_RERANK: + { + // require slot.prompt to be array of 2 strings + if (!slot.prompt.is_array() || slot.prompt.size() != 2) { + SLT_ERR(slot, "%s", "invalid prompt for rerank task\n"); + slot.release(); + send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST); + continue; + } - auto prefix_tokens = tokenize(slot.params.input_prefix, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false); + // prompt: [BOS]query[EOS][SEP]doc[EOS] + prompt_tokens.clear(); + prompt_tokens.push_back(llama_token_bos(model)); + { + const auto part = tokenize(slot.prompt[0], false, false); + prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); + } + prompt_tokens.push_back(llama_token_eos(model)); + prompt_tokens.push_back(llama_token_sep(model)); + { + const auto part = tokenize(slot.prompt[1], false, false); + prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); + } + prompt_tokens.push_back(llama_token_eos(model)); + } break; + case SERVER_TASK_CMPL_TYPE_INFILL: + { + // TODO: optimize this block by reducing memory allocations and movement - const int space_token = 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } + // use FIM repo-level pattern: + // ref: https://arxiv.org/pdf/2409.12186 + // + // [FIM_REP]myproject + // [FIM_SEP]filename0 + // extra chunk 0 + // [FIM_SEP]filename1 + // extra chunk 1 + // ... + // [FIM_SEP]filename + // [FIM_PRE]prefix[FIM_SUF]suffix[FIM_MID]prompt + // + auto tokens_prefix = tokenize(slot.input_prefix, false, false); + auto tokens_suffix = tokenize(slot.input_suffix, false, false); + auto tokens_prompt = tokenize(slot.prompt, false, false); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); - suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model)); + slot.extra_tokens.clear(); + if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) { + static const auto k_fim_repo = tokenize("myproject\n", false, false); - auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens; - auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens; - if (add_bos) { - embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); - } - embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + slot.extra_tokens.push_back(llama_token_fim_rep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end()); + } - const llama_token middle_token = llama_token_middle(model); - if (middle_token >= 0) { - embd_inp.push_back(middle_token); - } + for (const auto & chunk : slot.input_extra) { + // { "text": string, "filename": string } + const std::string text = chunk.value("text", ""); + const std::string filename = chunk.value("filename", "tmp"); - prompt_tokens = embd_inp; - } else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) { - // require slot.prompt to be array of 2 strings - if (!slot.prompt.is_array() || slot.prompt.size() != 2) { - SLT_ERR(slot, "%s", "invalid prompt for rerank task\n"); - slot.release(); - send_error(slot, "invalid prompt for rerank task", ERROR_TYPE_INVALID_REQUEST); - continue; - } + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + const auto k_fim_file = tokenize(filename + "\n", false, false); - // prompt: [BOS]query[EOS][SEP]doc[EOS] - prompt_tokens.clear(); - prompt_tokens.push_back(llama_token_bos(model)); - { - const auto part = tokenize(slot.prompt[0], false); - prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); - } - prompt_tokens.push_back(llama_token_eos(model)); - prompt_tokens.push_back(llama_token_sep(model)); - { - const auto part = tokenize(slot.prompt[1], false); - prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end()); - } - prompt_tokens.push_back(llama_token_eos(model)); - } else { - prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt + slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } else { + // chunk separator in binary form to avoid confusing the AI + static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00}; + static const auto k_chunk_prefix_tokens = tokenize(k_chunk_prefix_str, false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end()); + } + + const auto chunk_tokens = tokenize(text, false, false); + slot.extra_tokens.insert(slot.extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end()); + } + + if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) { + // TODO: current filename + static const auto k_fim_file = tokenize("filename\n", false, false); + + slot.extra_tokens.insert(slot.extra_tokens.end(), llama_token_fim_sep(model)); + slot.extra_tokens.insert(slot.extra_tokens.end(), k_fim_file.begin(), k_fim_file.end()); + } + + // for now pick FIM context to fit in a batch (ratio prefix:suffix = 3:1, TODO: configurable?) + const int n_suffix_take = std::min(tokens_suffix.size(), (n_batch/4)); + const int n_prefix_take = std::min(tokens_prefix.size(), 3*(n_batch/4) - 3); + + // fill the rest of the context with extra chunks + const int n_extra_take = std::min(std::max(0, slot.n_ctx - (n_batch) - 2*slot.n_predict), slot.extra_tokens.size()); + + tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take); + tokens_suffix.resize(n_suffix_take); + + tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model)); + tokens_prefix.insert(tokens_prefix.end(), tokens_prompt.begin(), tokens_prompt.end()); + tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model)); + + auto embd_inp = params.spm_infill ? tokens_suffix : tokens_prefix; + auto embd_end = params.spm_infill ? tokens_prefix : tokens_suffix; + + if (llama_add_bos_token(model)) { + embd_inp.insert(embd_inp.begin(), llama_token_bos(model)); + } + + SLT_DBG(slot, "extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", slot.n_ctx, n_extra_take, (int) slot.extra_tokens.size()); + + // put the extra context before the FIM prefix + embd_inp.insert(embd_inp.begin(), slot.extra_tokens.end() - n_extra_take, slot.extra_tokens.end()); + + embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end()); + embd_inp.push_back(llama_token_fim_mid(model)); + + prompt_tokens = std::move(embd_inp); + } break; } slot.n_past = 0; @@ -2036,6 +2052,19 @@ struct server_context { SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); + // print prompt tokens (for debugging) + if (1) { + // first 16 tokens (avoid flooding logs) + for (int i = 0; i < std::min(16, prompt_tokens.size()); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + } + } else { + // all + for (int i = 0; i < (int) prompt_tokens.size(); i++) { + SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + } + } + // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); @@ -2056,7 +2085,9 @@ struct server_context { } else { if (!params.ctx_shift) { // if context shift is disabled, we make sure prompt size is smaller than KV size - if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) { + // TODO: there should be a separate parameter that control prompt truncation + // context shift should be applied only during the generation phase + if (slot.n_prompt_tokens >= slot.n_ctx) { slot.release(); send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST); continue; @@ -2067,8 +2098,8 @@ struct server_context { } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - // if input prompt is too big, truncate it (if group attention self-extend is disabled) - if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { + // if input prompt is too big, truncate it + if (slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; @@ -2095,19 +2126,61 @@ struct server_context { common_sampler_reset(slot.smpl); - if (!slot.params.cache_prompt) { - slot.n_past_se = 0; - slot.ga_i = 0; - } else { - GGML_ASSERT(slot.ga_n == 1); - + if (slot.params.cache_prompt) { // reuse any previously computed tokens that are common with the new prompt - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens); // push the prompt into the sampling context (do not apply grammar) for (int i = 0; i < slot.n_past; ++i) { common_sampler_accept(slot.smpl, slot.cache_tokens[i], false); } + + // reuse chunks from the cached prompt by shifting their KV cache in the new position + if (params.n_cache_reuse > 0) { + size_t head_c = slot.n_past; // cache + size_t head_p = slot.n_past; // current prompt + + SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past); + + while (head_c < slot.cache_tokens.size() && + head_p < prompt_tokens.size()) { + + size_t n_match = 0; + while (head_c + n_match < slot.cache_tokens.size() && + head_p + n_match < prompt_tokens.size() && + slot.cache_tokens[head_c + n_match] == prompt_tokens[head_p + n_match]) { + + n_match++; + } + + if (n_match >= (size_t) params.n_cache_reuse) { + SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); + //for (size_t i = head_p; i < head_p + n_match; i++) { + // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str()); + //} + + const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; + + llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c); + llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift); + + for (size_t i = 0; i < n_match; i++) { + slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; + + common_sampler_accept(slot.smpl, slot.cache_tokens[head_p + i], false); + + slot.n_past++; + } + + head_c += n_match; + head_p += n_match; + } else { + head_c += 1; + } + } + + SLT_DBG(slot, "after context reuse, new slot.n_past = %d\n", slot.n_past); + } } } @@ -2116,9 +2189,6 @@ struct server_context { SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); slot.n_past--; - if (slot.ga_i > 0) { - slot.n_past_se--; - } } slot.n_prompt_tokens_processed = 0; @@ -2144,55 +2214,31 @@ struct server_context { } // keep only the common part - int p0 = (int) system_tokens.size() + slot.n_past; - if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { + if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); - p0 = (int) system_tokens.size(); - if (p0 != 0) { - // copy over the system prompt when there is one - llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1); - } - - // there is no common part left (except for the system prompt) + // there is no common part left slot.n_past = 0; - slot.n_past_se = 0; - slot.ga_i = 0; - // TODO: is the system prompt ever in the sampling context? + common_sampler_reset(slot.smpl); } + SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past); + // remove the non-common part from the cache slot.cache_tokens.resize(slot.n_past); - SLT_INF(slot, "kv cache rm [%d, end)\n", p0); - - int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; - - int32_t ga_i = slot.ga_i; - int32_t ga_n = slot.ga_n; - int32_t ga_w = slot.ga_w; - // add prompt tokens for processing in the current batch - // TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow - for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) { - if (slot.ga_n != 1) { - while (slot_npast >= ga_i + ga_w) { - const int bd = (ga_w/ga_n)*(ga_n - 1); - slot_npast -= bd; - ga_i += ga_w/ga_n; - } - } - - common_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false); + while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { + common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); } slot.n_prompt_tokens_processed++; - slot_npast++; + slot.n_past++; } SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); @@ -2233,34 +2279,6 @@ struct server_context { for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); - for (auto & slot : slots) { - if (slot.ga_n != 1) { - // context extension via Self-Extend - // TODO: simplify and/or abstract this - while (slot.n_past_se >= slot.ga_i + slot.ga_w) { - const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w; - const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); - const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - - SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); - llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); - - slot.n_past_se -= bd; - - slot.ga_i += slot.ga_w / slot.ga_n; - - SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); - } - - slot.n_past_se += n_tokens; - } - } - llama_batch batch_view = { n_tokens, batch.token + i, @@ -2415,10 +2433,6 @@ int main(int argc, char ** argv) { // struct that contains llama context and inference server_context ctx_server; - if (!params.system_prompt.empty()) { - ctx_server.system_prompt_set(params.system_prompt); - } - if (params.model_alias == "unknown") { params.model_alias = params.model; } @@ -2846,7 +2860,6 @@ int main(int argc, char ** argv) { const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { json data = { - { "system_prompt", ctx_server.system_prompt }, { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params.n_parallel }, { "chat_template", llama_get_chat_template(ctx_server.model) }, @@ -2862,10 +2875,8 @@ int main(int argc, char ** argv) { } json data = json::parse(req.body); - if (data.contains("system_prompt")) { - std::string system_prompt = data.at("system_prompt"); - ctx_server.system_prompt_set(system_prompt); - } + + // update any props here res_ok(res, {{ "success", true }}); }; @@ -2925,7 +2936,23 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res); }; - const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { + std::string err; + if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "prefix token is missing. "; + } + if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "suffix token is missing. "; + } + if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) { + err += "middle token is missing. "; + } + + if (!err.empty()) { + res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); + return; + } + json data = json::parse(req.body); return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res); }; @@ -3011,7 +3038,8 @@ int main(int argc, char ** argv) { if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); const bool with_pieces = json_value(body, "with_pieces", false); - std::vector tokens = ctx_server.tokenize(body.at("content"), add_special); + + std::vector tokens = ctx_server.tokenize(body.at("content"), add_special, true); if (with_pieces) { for (const auto& token : tokens) { @@ -3362,6 +3390,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.on_new_task(std::bind( &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( &server_context::update_slots, &ctx_server)); diff --git a/examples/server/tests/features/ctx_shift.feature b/examples/server/tests/features/ctx_shift.feature index ba3afcf06..ae6c6b01b 100644 --- a/examples/server/tests/features/ctx_shift.feature +++ b/examples/server/tests/features/ctx_shift.feature @@ -13,6 +13,10 @@ Feature: llama.cpp server And 32 as batch size And 2 slots + # the prompt is 301 tokens + # the slot context is 256/2 = 128 tokens + # the prompt is truncated to keep the last 109 tokens + # 64 tokens are generated thanks to shifting the context when it gets full Scenario: Inference with context shift And 64 server max tokens to predict Then the server is starting diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index ad99e9574..69519ef95 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -195,14 +195,14 @@ static std::string gen_chatcmplid() { // other common utils // -static size_t common_part(const std::vector & a, const std::vector & b) { +static size_t longest_common_prefix(const std::vector & a, const std::vector & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} return i; } -static size_t common_part(const std::string & a, const std::string & b) { +static size_t longest_common_prefix(const std::string & a, const std::string & b) { size_t i; for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} @@ -360,9 +360,9 @@ static json oaicompat_completion_params_parse( // Handle "logprobs" field // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future - if (body.contains("logprobs")) { + if (json_value(body, "logprobs", false)) { llama_params["n_probs"] = json_value(body, "top_logprobs", 20); - } else if (body.contains("top_logprobs")) { + } else if (body.contains("top_logprobs") && !body.at("top_logprobs").is_null()) { throw std::runtime_error("top_logprobs requires logprobs to be set to true"); } diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h index e074042ef..c03bbfe5e 100644 --- a/ggml/include/ggml-vulkan.h +++ b/ggml/include/ggml-vulkan.h @@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); +GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 70187b9b6..041de9e3e 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -14,7 +14,7 @@ //#define GGML_ALLOCATOR_DEBUG -//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__) +//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__) #define AT_PRINTF(...) @@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso size = GGML_PAD(size, talloc->alignment); if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { - fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", + GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); GGML_ABORT("not enough space in the buffer"); } @@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz best_fit_block = alloc->n_free_blocks - 1; } else { // this should never happen - fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", + GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", __func__, size, max_avail); GGML_ABORT("not enough space in the buffer"); } @@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz } } } - fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); for (int i = 0; i < 1024; i++) { if (alloc->allocated_tensors[i].tensor) { - fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, + GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, alloc->allocated_tensors[i].offset, alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor), ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); } } - fprintf(stderr, "\n"); + GGML_LOG_DEBUG("\n"); } #endif @@ -348,7 +348,6 @@ struct tensor_alloc { }; struct leaf_alloc { - int buffer_id; struct tensor_alloc leaf; }; @@ -740,7 +739,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); - galloc->leaf_allocs[i].buffer_id = hn->buffer_id; if (leaf->view_src || leaf->data) { galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; @@ -768,13 +766,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views if (new_size > cur_size || galloc->buffers[i] == NULL) { #ifndef NDEBUG - fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif ggml_backend_buffer_free(galloc->buffers[i]); galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); if (galloc->buffers[i] == NULL) { - fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); + GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); return false; } ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); @@ -825,14 +823,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { if (galloc->n_nodes != graph->n_nodes) { #ifndef NDEBUG - fprintf(stderr, "%s: graph has different number of nodes\n", __func__); + GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__); #endif return true; } if (galloc->n_leafs != graph->n_leafs) { #ifndef NDEBUG - fprintf(stderr, "%s: graph has different number of leafs\n", __func__); + GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__); #endif return true; } @@ -843,7 +841,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { #ifndef NDEBUG - fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); + GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name); #endif return true; } @@ -855,7 +853,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph } if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { #ifndef NDEBUG - fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); + GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); #endif return true; } @@ -869,14 +867,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) if (ggml_gallocr_needs_realloc(galloc, graph)) { if (galloc->n_buffers == 1) { #ifndef NDEBUG - fprintf(stderr, "%s: reallocating buffers automatically\n", __func__); + GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__); #endif if (!ggml_gallocr_reserve(galloc, graph)) { return false; } } else { #ifndef NDEBUG - fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); + GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); #endif return false; } @@ -940,7 +938,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx, ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); if (buffer == NULL) { #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); + GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size); #endif for (size_t i = 0; i < *n_buffers; i++) { ggml_backend_buffer_free((*buffers)[i]); @@ -990,7 +988,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (this_size > max_size) { - fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", + GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", __func__, t->name, ggml_backend_buft_name(buft), this_size, max_size); @@ -1022,7 +1020,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (n_buffers == 0) { #ifndef NDEBUG - fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); + GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); #endif return NULL; } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 9eb00a340..d13e71d84 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -379,7 +379,7 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { #ifndef NDEBUG - fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); + GGML_LOG_DEBUG("%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); #endif size_t nbytes = ggml_nbytes(src); void * data = malloc(nbytes); @@ -538,6 +538,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na #include "ggml-metal.h" #endif +#ifdef GGML_USE_VULKAN +#include "ggml-vulkan.h" +#endif + #ifdef GGML_USE_BLAS #include "ggml-blas.h" #endif @@ -557,6 +561,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_METAL register_backend(ggml_backend_metal_reg()); #endif +#ifdef GGML_USE_VULKAN + register_backend(ggml_backend_vk_reg()); +#endif #ifdef GGML_USE_BLAS register_backend(ggml_backend_blas_reg()); #endif @@ -564,14 +571,14 @@ struct ggml_backend_registry { register_backend(ggml_backend_rpc_reg()); #endif - // TODO: sycl, vulkan, kompute, cann + // TODO: sycl, kompute, cann register_backend(ggml_backend_cpu_reg()); } void register_backend(ggml_backend_reg_t reg) { #ifndef NDEBUG - fprintf(stderr, "%s: registered backend %s (%zu devices)\n", + GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif backends.push_back(reg); @@ -582,7 +589,7 @@ struct ggml_backend_registry { void register_device(ggml_backend_dev_t device) { #ifndef NDEBUG - fprintf(stderr, "%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device)); + GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device)); #endif devices.push_back(device); } @@ -682,8 +689,6 @@ ggml_backend_t ggml_backend_init_best(void) { // backend CPU -static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment - static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) { return "CPU"; @@ -702,7 +707,7 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { } static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - free(buffer->context); + ggml_aligned_free(buffer->context, buffer->size); } static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { @@ -770,14 +775,19 @@ static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_ty } static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned - void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h) + auto alloc_size = size; + if (alloc_size == 0) { + alloc_size = 1; + } + + void * data = ggml_aligned_malloc(alloc_size); + if (data == NULL) { - fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size); return NULL; } - return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size); + return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size); } static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -836,7 +846,7 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_ void * ptr; int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); if (result != 0) { - fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size); + GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); return NULL; } @@ -1459,7 +1469,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co } #ifndef NDEBUG - fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", + GGML_LOG_DEBUG("%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name); #endif @@ -1554,13 +1564,13 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str for (int i = 0; i < graph->n_nodes; i++) { if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; - fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), + GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs); for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { - fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, + GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name, fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j]))); } - fprintf(stderr, "\n"); + GGML_LOG_DEBUG("\n"); cur_split++; } struct ggml_tensor * node = graph->nodes[i]; @@ -1568,7 +1578,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str continue; } ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, + GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -1576,10 +1586,10 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str continue; } ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, + GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } - fprintf(stderr, "\n"); + GGML_LOG_DEBUG("\n"); } } @@ -2093,11 +2103,11 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); + GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); #endif ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { - fprintf(stderr, "%s: failed to allocate graph\n", __func__); + GGML_LOG_ERROR("%s: failed to allocate graph\n", __func__); return false; } } @@ -2491,7 +2501,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s struct ggml_context * ctx_unallocated = ggml_init(params); if (ctx_allocated == NULL || ctx_unallocated == NULL) { - fprintf(stderr, "failed to allocate context for graph copy\n"); + GGML_LOG_ERROR("%s: failed to allocate context for graph copy\n", __func__); ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); @@ -2514,7 +2524,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // allocate nodes ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); if (buffer == NULL) { - fprintf(stderr, "failed to allocate buffer for graph copy\n"); + GGML_LOG_ERROR("%s: failed to allocate buffer for graph copy\n", __func__); ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 55f724586..7875ec86d 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -297,14 +297,14 @@ ggml_backend_t ggml_backend_blas_init(void) { /* .context = */ ctx, }; -#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) +#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) if (openblas_get_parallel() != OPENBLAS_OPENMP) { - fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); } #endif -#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) - fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); #endif return backend; diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index db5f8f186..ec3c0a688 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -1148,6 +1148,7 @@ ggml_backend_cann_buffer_type(int32_t device) { for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { ggml_backend_cann_buffer_types[i] = { /* .iface = */ ggml_backend_cann_buffer_type_interface, + /* .device = */ nullptr, /* .context = */ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i)}, @@ -1868,7 +1869,7 @@ static ggml_backend_event_t ggml_backend_cann_event_new( ACL_CHECK(aclrtCreateEvent(&event)); return new ggml_backend_event{ - /* .backend = */ backend, + /* .device = */ nullptr, /* .context = */ event, }; } @@ -1895,10 +1896,9 @@ static void ggml_backend_cann_event_free(ggml_backend_event_t event) { * * @param event Pointer to the event structure to be recorded. */ -static void ggml_backend_cann_event_record(ggml_backend_event_t event) { +static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)event->backend->context; - + (ggml_backend_cann_context*)backend->context; ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream())); } @@ -1916,8 +1916,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cann_context* cann_ctx = (ggml_backend_cann_context*)backend->context; - - if (ggml_backend_is_cann(event->backend)) { + if (ggml_backend_is_cann(backend)) { ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent)event->context)); } else { diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index e547984da..cec3c5e9a 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { return; } } - GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n"); + GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n"); ggml_cuda_set_device(device); CUDA_CHECK(cudaFree(ptr)); pool_size -= size; @@ -980,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) { if (err != cudaSuccess) { // clear the error cudaGetLastError(); - GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, cudaGetErrorString(err)); return nullptr; } @@ -2410,7 +2410,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { #ifndef NDEBUG - GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__); + GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__); #endif return false; } @@ -2528,7 +2528,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); #endif } } @@ -2579,14 +2579,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__); #endif } if (node->op == GGML_OP_MUL_MAT_ID) { use_cuda_graph = false; // This node type is not supported by CUDA graph capture #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__); #endif } @@ -2595,7 +2595,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, // Changes in batch size or context size can cause changes to the grid size of some kernels. use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); #endif } @@ -2607,7 +2607,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (!ptr) { use_cuda_graph = false; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__); #endif } else { if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) { @@ -2631,7 +2631,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); #endif } } @@ -2689,7 +2689,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, use_cuda_graph = false; cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true; #ifndef NDEBUG - GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__); + GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__); #endif } else { graph_evaluated_or_captured = true; // CUDA graph has been captured @@ -2858,7 +2858,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { // clear the error cudaGetLastError(); - GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, + GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, cudaGetErrorString(err)); return false; } diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 96a5adef5..00e21b5d7 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -416,10 +416,11 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ const half * x = (const half *) vx; - + // load 2 halfs into register in a single instruction + const half2 x_reg = *((half2 *) &(x[ib + iqs])); // automatic half -> float type cast if dfloat == float - v.x = x[ib + iqs + 0]; - v.y = x[ib + iqs + 1]; + v.x = __low2float(x_reg); + v.y = __high2float(x_reg); } static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) { @@ -476,13 +477,28 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons // matrix multiplication // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 #ifdef GGML_CUDA_F16 - tmp += __hmul2(v, { - y[iybs + iqs + j/qr + 0], - y[iybs + iqs + j/qr + y_offset] - }); + if ( y_offset == 1 ) { + // load 2 dfloats into register in a single instruction + const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr])); + tmp += __hmul2(v, y_reg); + } + else { + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); + } #else - tmp += v.x * y[iybs + iqs + j/qr + 0]; - tmp += v.y * y[iybs + iqs + j/qr + y_offset]; + if ( y_offset == 1 ) { + // load 2 dfloats into register in a single instruction + const dfloat2 y_reg = *((dfloat2 *) &(y[iybs + iqs + j/qr])); + tmp += v.x * y_reg.x; + tmp += v.y * y_reg.y; + } + else { + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; + } #endif // GGML_CUDA_F16 } } diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index d3f4bad8c..65c4f8119 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -19,6 +19,9 @@ extern "C" { #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +// required for mmap as gguf only guarantees 32-byte alignment +#define TENSOR_ALIGNMENT 32 + // static_assert should be a #define, but if it's not, // fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop @@ -196,6 +199,11 @@ struct ggml_cgraph { struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); +// Memory allocation + +void * ggml_aligned_malloc(size_t size); +void ggml_aligned_free(void * ptr, size_t size); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 4a2345971..306c25be4 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -1941,7 +1941,7 @@ static vk_device ggml_vk_get_device(size_t idx) { if (device->fp16) { device_extensions.push_back("VK_KHR_shader_float16_int8"); } - device->name = device->properties.deviceName.data(); + device->name = GGML_VK_NAME + std::to_string(idx); device_create_info = { vk::DeviceCreateFlags(), @@ -1968,7 +1968,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->buffer_type = { /* .iface = */ ggml_backend_vk_buffer_type_interface, - /* .device = */ nullptr, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), idx), /* .context = */ new ggml_backend_vk_buffer_type_context{ device->name, device }, }; @@ -6378,7 +6378,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, - /* .device = */ nullptr, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), 0), /* .context = */ nullptr, }; @@ -6581,9 +6581,135 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg UNUSED(backend); } -static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context; +// TODO: enable async and synchronize +static ggml_backend_i ggml_backend_vk_interface = { + /* .get_name = */ ggml_backend_vk_name, + /* .free = */ ggml_backend_vk_free, + /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type, + /* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async, + /* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async, + /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async, + /* .synchronize = */ NULL, // ggml_backend_vk_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_vk_graph_compute, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, + /* .offload_op = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; +static ggml_guid_t ggml_backend_vk_guid() { + static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; + return &guid; +} + +ggml_backend_t ggml_backend_vk_init(size_t dev_num) { + VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")"); + + ggml_backend_vk_context * ctx = new ggml_backend_vk_context; + ggml_vk_init(ctx, dev_num); + + ggml_backend_t vk_backend = new ggml_backend { + /* .guid = */ ggml_backend_vk_guid(), + /* .interface = */ ggml_backend_vk_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num), + /* .context = */ ctx, + }; + + return vk_backend; +} + +bool ggml_backend_is_vk(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid()); +} + +int ggml_backend_vk_get_device_count() { + return ggml_vk_get_device_count(); +} + +void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { + GGML_ASSERT(device < (int) vk_instance.device_indices.size()); + int dev_idx = vk_instance.device_indices[device]; + ggml_vk_get_device_description(dev_idx, description, description_size); +} + +void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { + GGML_ASSERT(device < (int) vk_instance.device_indices.size()); + + vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; + + vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); + + for (const vk::MemoryHeap& heap : memprops.memoryHeaps) { + if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { + *total = heap.size; + *free = heap.size; + break; + } + } +} + +////////////////////////// + +struct ggml_backend_vk_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ctx->description.c_str(); +} + +static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context; + ggml_backend_vk_get_device_memory(ctx->device, free, total); +} + +static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ggml_backend_vk_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return ggml_backend_vk_host_buffer_type(); +} + +static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) { + UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU_FULL; +} + +static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_vk_device_get_name(dev); + props->description = ggml_backend_vk_device_get_description(dev); + props->type = ggml_backend_vk_device_get_type(dev); + ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* async */ false, + /* host_buffer */ true, + /* events */ false, + }; +} + +static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) { + UNUSED(params); + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + return ggml_backend_vk_init(ctx->device); +} + +static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -6701,97 +6827,101 @@ static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tenso return false; } - UNUSED(backend); + UNUSED(dev); } -static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) { +static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { + return false; + } + + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + + return buft_ctx->device->idx == ctx->device; +} + +static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { const int min_batch_size = 32; return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - UNUSED(backend); + UNUSED(dev); } -static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { - return false; - } - - ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - return buft_ctx->device == ctx->device; -} - -// TODO: enable async and synchronize -static ggml_backend_i ggml_backend_vk_interface = { - /* .get_name = */ ggml_backend_vk_name, - /* .free = */ ggml_backend_vk_free, - /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type, - /* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async, - /* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async, - /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async, - /* .synchronize = */ NULL, // ggml_backend_vk_synchronize, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_vk_graph_compute, - /* .supports_op = */ ggml_backend_vk_supports_op, - /* .supports_buft = */ ggml_backend_vk_supports_buft, - /* .offload_op = */ ggml_backend_vk_offload_op, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, +static const struct ggml_backend_device_i ggml_backend_vk_device_i = { + /* .get_name = */ ggml_backend_vk_device_get_name, + /* .get_description = */ ggml_backend_vk_device_get_description, + /* .get_memory = */ ggml_backend_vk_device_get_memory, + /* .get_type = */ ggml_backend_vk_device_get_type, + /* .get_props = */ ggml_backend_vk_device_get_props, + /* .init_backend = */ ggml_backend_vk_device_init, + /* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ ggml_backend_vk_device_supports_op, + /* .supports_buft = */ ggml_backend_vk_device_supports_buft, + /* .offload_op = */ ggml_backend_vk_device_offload_op, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, }; -static ggml_guid_t ggml_backend_vk_guid() { - static ggml_guid guid = { 0xb8, 0xf7, 0x4f, 0x86, 0x40, 0x3c, 0xe1, 0x02, 0x91, 0xc8, 0xdd, 0xe9, 0x02, 0x3f, 0xc0, 0x2b }; - return &guid; +static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) { + UNUSED(reg); + return GGML_VK_NAME; } -ggml_backend_t ggml_backend_vk_init(size_t dev_num) { - VK_LOG_DEBUG("ggml_backend_vk_init(" << dev_num << ")"); - - ggml_backend_vk_context * ctx = new ggml_backend_vk_context; - ggml_vk_init(ctx, dev_num); - - ggml_backend_t vk_backend = new ggml_backend { - /* .guid = */ ggml_backend_vk_guid(), - /* .interface = */ ggml_backend_vk_interface, - /* .device = */ nullptr, - /* .context = */ ctx, - }; - - return vk_backend; +static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) { + UNUSED(reg); + return ggml_backend_vk_get_device_count(); } -bool ggml_backend_is_vk(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_vk_guid()); -} +static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) { + static std::vector devices; -int ggml_backend_vk_get_device_count() { - return ggml_vk_get_device_count(); -} + static bool initialized = false; -void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size) { - ggml_vk_get_device_description(device, description, description_size); -} - -void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total) { - GGML_ASSERT(device < (int) vk_instance.device_indices.size()); - - vk::PhysicalDevice vkdev = vk_instance.instance.enumeratePhysicalDevices()[vk_instance.device_indices[device]]; - - vk::PhysicalDeviceMemoryProperties memprops = vkdev.getMemoryProperties(); - - for (const vk::MemoryHeap& heap : memprops.memoryHeaps) { - if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) { - *total = heap.size; - *free = heap.size; - break; + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) { + ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context; + char desc[256]; + ggml_backend_vk_get_device_description(i, desc, sizeof(desc)); + ctx->device = i; + ctx->name = GGML_VK_NAME + std::to_string(i); + ctx->description = desc; + devices.push_back(new ggml_backend_device { + /* .iface = */ ggml_backend_vk_device_i, + /* .reg = */ reg, + /* .context = */ ctx, + }); + } + initialized = true; } } + + GGML_ASSERT(device < devices.size()); + return devices[device]; +} + +static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = { + /* .get_name = */ ggml_backend_vk_reg_get_name, + /* .get_device_count = */ ggml_backend_vk_reg_get_device_count, + /* .get_device = */ ggml_backend_vk_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_vk_reg() { + static ggml_backend_reg reg = { + /* .iface = */ ggml_backend_vk_reg_i, + /* .context = */ nullptr, + }; + + return ® } // Extension availability diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index cd8b3c0f1..85e8dce27 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -35,10 +35,6 @@ #include #endif -#ifdef GGML_USE_METAL -#include -#endif - #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) #undef GGML_USE_LLAMAFILE #endif @@ -189,6 +185,8 @@ typedef pthread_t ggml_thread_t; #endif #if defined(__APPLE__) +#include +#include #include #endif @@ -386,22 +384,40 @@ void ggml_log_callback_default(enum ggml_log_level level, const char * text, voi //#define GGML_SOFT_MAX_ACCELERATE #endif + +void * ggml_aligned_malloc(size_t size) { #if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) -#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) + return _aligned_malloc(size, TENSOR_ALIGNMENT); #else -inline static void * ggml_aligned_malloc(size_t size) { if (size == 0) { GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n"); return NULL; } void * aligned_memory = NULL; #ifdef GGML_USE_CPU_HBM - int result = hbw_posix_memalign(&aligned_memory, 16, size); + int result = hbw_posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); +#elif TARGET_OS_OSX + kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE); + int result = EFAULT; + switch (alloc_status) { + case KERN_SUCCESS: + result = 0; + break; + case KERN_INVALID_ADDRESS: + result = EINVAL; + break; + case KERN_NO_SPACE: + result = ENOMEM; + break; + default: + result = EFAULT; + break; + } #elif GGML_USE_METAL - int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size); + const long page_size = sysconf(_SC_PAGESIZE); + int result = posix_memalign(&aligned_memory, MAX(TENSOR_ALIGNMENT, page_size), size); #else - int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); + int result = posix_memalign(&aligned_memory, TENSOR_ALIGNMENT, size); #endif if (result != 0) { // Handle allocation failure @@ -419,14 +435,26 @@ inline static void * ggml_aligned_malloc(size_t size) { return NULL; } return aligned_memory; +#endif } -#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#ifdef GGML_USE_CPU_HBM -#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr) + +void ggml_aligned_free(void * ptr, size_t size) { + GGML_UNUSED(size); +#if defined(_MSC_VER) || defined(__MINGW32__) + _aligned_free(ptr); +#elif GGML_USE_CPU_HBM + if (ptr != NULL) { + hbw_free(ptr); + } +#elif TARGET_OS_OSX + if (ptr != NULL) { + vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size); + } #else -#define GGML_ALIGNED_FREE(ptr) free(ptr) -#endif + free(ptr); #endif +} + inline static void * ggml_malloc(size_t size) { if (size == 0) { @@ -3894,7 +3922,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { *ctx = (struct ggml_context) { /*.mem_size =*/ mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size), /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, /*.no_alloc =*/ params.no_alloc, /*.no_alloc_save =*/ params.no_alloc, @@ -3934,7 +3962,7 @@ void ggml_free(struct ggml_context * ctx) { __func__, i, ggml_used_mem(ctx)); if (ctx->mem_buffer_owned) { - GGML_ALIGNED_FREE(ctx->mem_buffer); + ggml_aligned_free(ctx->mem_buffer, ctx->mem_size); } found = true; @@ -19663,9 +19691,10 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask void ggml_threadpool_free(struct ggml_threadpool* threadpool) { if (!threadpool) return; + const int n_threads = threadpool->n_threads_max; + #ifndef GGML_USE_OPENMP struct ggml_compute_state* workers = threadpool->workers; - const int n_threads = threadpool->n_threads_max; ggml_mutex_lock(&threadpool->mutex); @@ -19685,8 +19714,9 @@ void ggml_threadpool_free(struct ggml_threadpool* threadpool) { ggml_cond_destroy(&threadpool->cond); #endif // GGML_USE_OPENMP - GGML_ALIGNED_FREE(threadpool->workers); - GGML_ALIGNED_FREE(threadpool); + const size_t workers_size = sizeof(struct ggml_compute_state) * n_threads; + ggml_aligned_free(threadpool->workers, workers_size); + ggml_aligned_free(threadpool, sizeof(struct ggml_threadpool)); } #ifndef GGML_USE_OPENMP @@ -20123,7 +20153,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_cplan * cplan) { struct ggml_threadpool * threadpool = - GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool)); + ggml_aligned_malloc(sizeof(struct ggml_threadpool)); { threadpool->cgraph = cgraph; threadpool->cplan = cplan; @@ -20144,7 +20174,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl( // Allocate and init workers state const size_t workers_size = sizeof(struct ggml_compute_state) * tpp->n_threads; - struct ggml_compute_state * workers = GGML_ALIGNED_MALLOC(workers_size); + struct ggml_compute_state * workers = ggml_aligned_malloc(workers_size); memset(workers, 0, workers_size); for (int j = 0; j < tpp->n_threads; j++) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e08617ba2..7ab08b036 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -152,6 +152,8 @@ class Keys: MERGES = "tokenizer.ggml.merges" BOS_ID = "tokenizer.ggml.bos_token_id" EOS_ID = "tokenizer.ggml.eos_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" @@ -168,11 +170,16 @@ class Keys: CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATES = "tokenizer.chat_templates" # FIM/Infill special tokens constants + FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" + FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" + FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id" + FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" + FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" + FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id" + # deprecated: PREFIX_ID = "tokenizer.ggml.prefix_token_id" SUFFIX_ID = "tokenizer.ggml.suffix_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id" - EOT_ID = "tokenizer.ggml.eot_token_id" - EOM_ID = "tokenizer.ggml.eom_token_id" class Adapter: TYPE = "adapter.type" @@ -1579,6 +1586,8 @@ KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID @@ -1586,8 +1595,15 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV -KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID + +KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID +KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID +KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID +KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID +KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID +KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID + +# deprecated +KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID -KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID -KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 5c460ef1b..0d8d8a0b0 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -843,15 +843,6 @@ class GGUFWriter: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) - def add_prefix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.PREFIX_ID, id) - - def add_suffix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id) - - def add_middle_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id) - def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) diff --git a/include/llama.h b/include/llama.h index c1537c85e..361df855b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -899,6 +899,7 @@ extern "C" { // Special tokens LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence + LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line @@ -907,11 +908,17 @@ extern "C" { LLAMA_API bool llama_add_bos_token(const struct llama_model * model); LLAMA_API bool llama_add_eos_token(const struct llama_model * model); - // Codellama infill tokens - LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix - LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle - LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix - LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle + // infill tokens + DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead"); + DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead"); + DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead"); + + LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model); + LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model); // // Tokenization @@ -948,6 +955,12 @@ extern "C" { int32_t lstrip, bool special); + // check if token0 is contained as a prefix in token1 + LLAMA_API bool llama_token_is_prefix( + const struct llama_model * model, + llama_token token0, + llama_token token1); + /// @details Convert the provided tokens into text (inverse of llama_tokenize()). /// @param text The char pointer must be large enough to hold the resulting text. /// @return Returns the number of chars/bytes on success, no more than text_len_max. @@ -1096,6 +1109,9 @@ extern "C" { /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); + /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 + LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. @@ -1140,6 +1156,28 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); + // this sampler is meant to be used for fill-in-the-middle infilling + // it's supposed to be used after top_k + top_p sampling + // + // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG + // 2. combine probs of tokens that have the same prefix + // + // example: + // + // - before: + // "hel": 0.5 + // "hell": 0.2 + // "hello": 0.1 + // "dummy": 0.1 + // + // - after: + // "hel": 0.8 + // "dummy": 0.1 + // + // 3. discard non-EOG tokens with low prob + // 4. if no tokens are left -> pick EOT + // + LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model); // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); diff --git a/koboldcpp.py b/koboldcpp.py index 42feb4b50..7c4a9fb07 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -44,7 +44,7 @@ maxhordelen = 400 modelbusy = threading.Lock() requestsinqueue = 0 defaultport = 5001 -KcppVersion = "1.76" +KcppVersion = "1.77" showdebug = True guimode = False showsamplerwarning = True @@ -2250,8 +2250,8 @@ def show_gui(): import customtkinter as ctk nextstate = 0 #0=exit, 1=launch - original_windowwidth = 550 - original_windowheight = 550 + original_windowwidth = 580 + original_windowheight = 560 windowwidth = original_windowwidth windowheight = original_windowheight ctk.set_appearance_mode("dark") diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e255a8fc4..2e6550682 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1059,6 +1059,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa }; } +// xtc + +struct llama_sampler_xtc { + const float probability; + const float threshold; + const size_t min_keep; + + const uint32_t seed; + uint32_t seed_cur; + + std::mt19937 rng; +}; + +static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { + return "xtc"; +} + +static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + + if (ctx->probability <= 0.0f + || ctx->threshold > 0.5f + || cur_p->size < 2) { + return; + } + + std::uniform_real_distribution distribution(0.0f, 1.0f); + float chance = distribution(ctx->rng); + if (chance > ctx->probability) return; + + // in case it's not sorted/recalculated yet + llama_sampler_softmax_impl(cur_p); + + int pos_last = 0; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].p >= ctx->threshold) { + pos_last = i; + } else break; + } + + if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { + cur_p->data += pos_last; + cur_p->size -= pos_last; + } +} + +static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_xtc *) smpl->ctx; + auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_xtc *) result->ctx; + + result_ctx->rng = ctx->rng; + } + + return result; +} + +static void llama_sampler_xtc_free(struct llama_sampler * smpl) { + delete (llama_sampler_xtc *) smpl->ctx; +} + +static void llama_sampler_xtc_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); +} + +static struct llama_sampler_i llama_sampler_xtc_i = { + /* .name = */ llama_sampler_xtc_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sample_xtc_apply, + /* .reset = */ llama_sampler_xtc_reset, + /* .clone = */ llama_sampler_xtc_clone, + /* .free = */ llama_sampler_xtc_free, +}; + +struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) { + auto seed_cur = get_rng_seed(seed); + return new llama_sampler { + /* .iface = */ &llama_sampler_xtc_i, + /* .ctx = */ new llama_sampler_xtc { + /* .probability = */ p, + /* .threshold = */ t, + /* .min_keep = */ min_keep, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), + }, + }; +} + // mirostat struct llama_sampler_mirostat { @@ -1644,6 +1739,207 @@ struct llama_sampler * llama_sampler_init_logit_bias( }; } +// infill + +//#define GGML_DEBUG_SAMPLER_INFILL + +struct llama_sampler_infill { + const struct llama_vocab * vocab; +}; + +static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) { + return "infill"; +} + +static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * ctx = (llama_sampler_infill *) smpl->ctx; + + llama_sampler_softmax_impl(cur_p); + +#if defined(GGML_DEBUG_SAMPLER_INFILL) +#define LOG_DBG_CUR LLAMA_LOG_DEBUG +#else +#define LOG_DBG_CUR(...) +#endif + + for (size_t i = 0; i < cur_p->size; ++i) { + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + float p_txt_sum = 0.0f; + float p_eog_sum = 0.0f; + + for (size_t i = 0; i < cur_p->size; ++i) { + if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + p_eog_sum += cur_p->data[i].p; + } else { + p_txt_sum += cur_p->data[i].p; + } + } + + const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat); + + LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size); + + if (3*p_eog_sum*cur_p->size > p_txt_sum) { + LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum); + + // keep just the EOG tokens + const auto size_org = cur_p->size; + + cur_p->size = 0; + + float p_sum = 0.0f; + + for (size_t i = 0; i < size_org; ++i) { + if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) { + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + } + + return; + } + + size_t n_combined = 0; GGML_UNUSED(n_combined); + + // combine tokens with common prefix + for (size_t i = 0; i < cur_p->size; ++i) { + for (size_t j = 0; j < cur_p->size; ++j) { + if (cur_p->data[i].logit == -INFINITY) { + break; + } + + if (i == j || cur_p->data[j].logit == -INFINITY) { + continue; + } + + if (llama_token_is_prefix_impl(*ctx->vocab, cur_p->data[i].id, cur_p->data[j].id)) { + if (cur_p->data[i].p > cur_p->data[j].p) { + cur_p->data[i].p += cur_p->data[j].p; + cur_p->data[j].logit = -INFINITY; + cur_p->data[j].p = 0.0f; + } else { + cur_p->data[j].p += cur_p->data[i].p; + cur_p->data[i].logit = -INFINITY; + cur_p->data[i].p = 0.0f; + } + + n_combined++; + } + } + } + + size_t n_non_eog = 0; + + size_t size_org = cur_p->size; + + float p_sum = 0.0f; + float thold = 0.2f; + + cur_p->size = 0; + + LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + if (!is_eog) { + ++n_non_eog; + } + + p_sum += cur_p->data[i].p; + + // keep this token + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog); + + // if no non-EOG tokens are left -> reduce cur_p to single EOT token + if (n_non_eog == 0) { + cur_p->size = 1; + cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab); + cur_p->data[0].logit = 1.0f; + + return; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + + size_org = cur_p->size; + p_sum = 0.0f; + thold = 1.0/(n_non_eog + 1); + + cur_p->size = 0; + + LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold); + + for (size_t i = 0; i < size_org; ++i) { + const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id); + + if (cur_p->data[i].p < thold && !is_eog) { + continue; + } + + p_sum += cur_p->data[i].p; + + cur_p->data[cur_p->size++] = cur_p->data[i]; + } + + // normalize probs + for (size_t i = 0; i < cur_p->size; ++i) { + cur_p->data[i].p /= p_sum; + + LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit); + } + +#undef LOG_DBG_CUR +} + +static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_infill *) smpl->ctx; + return llama_sampler_init_infill_impl(*ctx->vocab); +} + +static void llama_sampler_infill_free(struct llama_sampler * smpl) { + delete (llama_sampler_infill *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_infill_i = { + /* .name = */ llama_sampler_infill_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sampler_infill_apply, + /* .reset = */ nullptr, + /* .clone = */ llama_sampler_infill_clone, + /* .free = */ llama_sampler_infill_free, +}; + +struct llama_sampler * llama_sampler_init_infill_impl( + const struct llama_vocab & vocab) { + return new llama_sampler { + /* .iface = */ &llama_sampler_infill_i, + /* .ctx = */ new llama_sampler_infill { + /* .vocab = */ &vocab, + }, + }; +} + // utils uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { diff --git a/src/llama-sampling.h b/src/llama-sampling.h index d90b14713..2683f1b92 100644 --- a/src/llama-sampling.h +++ b/src/llama-sampling.h @@ -4,8 +4,6 @@ #include "llama-grammar.h" -#include - struct llama_vocab; struct llama_grammar; @@ -27,3 +25,6 @@ struct llama_sampler * llama_sampler_init_grammar_impl( const struct llama_vocab & vocab, const char * grammar_str, const char * grammar_root); + +struct llama_sampler * llama_sampler_init_infill_impl( + const struct llama_vocab & vocab); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d8acd584a..be9ac9d6b 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -230,7 +230,7 @@ struct llm_tokenizer_spm_session { } // seed the work queue with all possible 2-character tokens. - for (size_t i = 1; i < symbols.size(); ++i) { + for (int i = 1; i < (int) symbols.size(); ++i) { try_add_bigram(i - 1, i); } @@ -790,7 +790,7 @@ struct llm_tokenizer_bpe_session { index++; symbols.emplace_back(sym); } - for (size_t i = 1; i < symbols.size(); ++i) { + for (int i = 1; i < (int) symbols.size(); ++i) { add_new_bigram(i - 1, i); } @@ -1918,6 +1918,14 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) { return vocab.special_eos_id; } +llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { + return vocab.special_eot_id; +} + +llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { + return vocab.special_eom_id; +} + llama_token llama_token_cls_impl(const struct llama_vocab & vocab) { return vocab.special_cls_id; } @@ -1943,23 +1951,39 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) { } llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) { - return vocab.special_prefix_id; + return vocab.special_fim_pre_id; } llama_token llama_token_middle_impl(const struct llama_vocab & vocab) { - return vocab.special_middle_id; + return vocab.special_fim_mid_id; } llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) { - return vocab.special_suffix_id; + return vocab.special_fim_suf_id; } -llama_token llama_token_eot_impl(const struct llama_vocab & vocab) { - return vocab.special_eot_id; +llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_pre_id; } -llama_token llama_token_eom_impl(const struct llama_vocab & vocab) { - return vocab.special_eom_id; +llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_suf_id; +} + +llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_mid_id; +} + +llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_pad_id; +} + +llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_rep_id; +} + +llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) { + return vocab.special_fim_sep_id; } int32_t llama_tokenize_impl( @@ -2094,6 +2118,23 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token return 0; } +bool llama_token_is_prefix_impl( + const struct llama_vocab & vocab, + llama_token token0, + llama_token token1) { + char text_buf_0[128]; + char text_buf_1[128]; + + const int32_t len0 = llama_token_to_piece_impl(vocab, token0, text_buf_0, sizeof(text_buf_0) - 1, 0, false); + const int32_t len1 = llama_token_to_piece_impl(vocab, token1, text_buf_1, sizeof(text_buf_1) - 1, 0, false); + + if (len0 <= 0 || len1 <= 0) { + return false; + } + + return len0 <= len1 && memcmp(text_buf_0, text_buf_1, len0) == 0; +} + int32_t llama_detokenize_impl( const struct llama_vocab & vocab, const llama_token * tokens, diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 28bad9135..d958d0073 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -37,20 +37,26 @@ struct llama_vocab { std::map, int> bpe_ranks; // default LLaMA special tokens + // TODO: should we set all of these to LLAMA_TOKEN_NULL? id special_bos_id = 1; id special_eos_id = 2; + id special_eot_id = LLAMA_TOKEN_NULL; + id special_eom_id = LLAMA_TOKEN_NULL; id special_unk_id = 0; id special_sep_id = LLAMA_TOKEN_NULL; id special_pad_id = LLAMA_TOKEN_NULL; id special_cls_id = LLAMA_TOKEN_NULL; id special_mask_id = LLAMA_TOKEN_NULL; - id linefeed_id = 13; - id special_prefix_id = LLAMA_TOKEN_NULL; - id special_suffix_id = LLAMA_TOKEN_NULL; - id special_middle_id = LLAMA_TOKEN_NULL; - id special_eot_id = LLAMA_TOKEN_NULL; // TODO: move above after "eos_id", and here add "file separator" token - id special_eom_id = LLAMA_TOKEN_NULL; + id linefeed_id = 13; + + // fim tokens + id special_fim_pre_id = LLAMA_TOKEN_NULL; + id special_fim_suf_id = LLAMA_TOKEN_NULL; + id special_fim_mid_id = LLAMA_TOKEN_NULL; + id special_fim_pad_id = LLAMA_TOKEN_NULL; + id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo + id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator // set of all tokens that cause "end of generation" std::set special_eog_ids; @@ -104,19 +110,26 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t llama_token llama_token_bos_impl(const struct llama_vocab & vocab); llama_token llama_token_eos_impl(const struct llama_vocab & vocab); +llama_token llama_token_eot_impl(const struct llama_vocab & vocab); +llama_token llama_token_eom_impl(const struct llama_vocab & vocab); llama_token llama_token_cls_impl(const struct llama_vocab & vocab); llama_token llama_token_sep_impl(const struct llama_vocab & vocab); llama_token llama_token_nl_impl (const struct llama_vocab & vocab); llama_token llama_token_pad_impl(const struct llama_vocab & vocab); -bool llama_add_bos_token_impl(const struct llama_vocab & vocab); -bool llama_add_eos_token_impl(const struct llama_vocab & vocab); - llama_token llama_token_prefix_impl(const struct llama_vocab & vocab); llama_token llama_token_middle_impl(const struct llama_vocab & vocab); llama_token llama_token_suffix_impl(const struct llama_vocab & vocab); -llama_token llama_token_eot_impl (const struct llama_vocab & vocab); -llama_token llama_token_eom_impl (const struct llama_vocab & vocab); + +llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab); +llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab); + +bool llama_add_bos_token_impl(const struct llama_vocab & vocab); +bool llama_add_eos_token_impl(const struct llama_vocab & vocab); int32_t llama_tokenize_impl( const struct llama_vocab & vocab, @@ -136,6 +149,12 @@ int32_t llama_token_to_piece_impl( int32_t lstrip, bool special); +// check if token0 is contained as a prefix in token1 +bool llama_token_is_prefix_impl( + const struct llama_vocab & vocab, + llama_token token0, + llama_token token1); + int32_t llama_detokenize_impl( const struct llama_vocab & vocab, const llama_token * tokens, diff --git a/src/llama.cpp b/src/llama.cpp index 974da63ce..963c44b76 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15,8 +15,6 @@ # include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) # include "ggml-opencl.h" -#elif defined(GGML_USE_VULKAN) -# include "ggml-vulkan.h" #elif defined(GGML_USE_SYCL) # include "ggml-sycl.h" #elif defined(GGML_USE_KOMPUTE) @@ -356,6 +354,8 @@ enum llm_kv { LLM_KV_TOKENIZER_MERGES, LLM_KV_TOKENIZER_BOS_ID, LLM_KV_TOKENIZER_EOS_ID, + LLM_KV_TOKENIZER_EOT_ID, + LLM_KV_TOKENIZER_EOM_ID, LLM_KV_TOKENIZER_UNK_ID, LLM_KV_TOKENIZER_SEP_ID, LLM_KV_TOKENIZER_PAD_ID, @@ -368,14 +368,20 @@ enum llm_kv { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, - LLM_KV_TOKENIZER_PREFIX_ID, - LLM_KV_TOKENIZER_SUFFIX_ID, - LLM_KV_TOKENIZER_MIDDLE_ID, - LLM_KV_TOKENIZER_EOT_ID, - LLM_KV_TOKENIZER_EOM_ID, + LLM_KV_TOKENIZER_FIM_PRE_ID, + LLM_KV_TOKENIZER_FIM_SUF_ID, + LLM_KV_TOKENIZER_FIM_MID_ID, + LLM_KV_TOKENIZER_FIM_PAD_ID, + LLM_KV_TOKENIZER_FIM_REP_ID, + LLM_KV_TOKENIZER_FIM_SEP_ID, LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, + + // deprecated: + LLM_KV_TOKENIZER_PREFIX_ID, + LLM_KV_TOKENIZER_SUFFIX_ID, + LLM_KV_TOKENIZER_MIDDLE_ID, }; static const std::map LLM_KV_NAMES = { @@ -433,57 +439,65 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, - { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, - { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, - { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, - { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, - { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, - { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, - { LLM_KV_SPLIT_NO, "split.no" }, - { LLM_KV_SPLIT_COUNT, "split.count" }, - { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, + { LLM_KV_SPLIT_NO, "split.no" }, + { LLM_KV_SPLIT_COUNT, "split.count" }, + { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, - { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, - { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, - { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, - { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, - { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, + { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, + { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, + { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, + { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, - { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, + { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, - { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, - { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, - { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, - { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, - { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, - { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, - { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, - { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, - { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, - { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, - { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, - { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, - { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, - { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, - { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, - { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, - { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, - { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, - { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, - { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, - { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, - { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, - { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, + { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, + { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, + { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, + { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, + { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, + { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, + { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, + { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, + { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, + { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, + { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, + { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, + { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, + { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, + { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, + { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, + { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, + { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, + { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, + { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, + { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" }, + { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, + { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_ADAPTER_TYPE, "adapter.type" }, - { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, + { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, + + // deprecated + { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, + { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, + { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, }; struct LLM_KV { @@ -3417,8 +3431,6 @@ static int llama_get_device_count(const llama_model & model) { #if defined(GGML_USE_SYCL) count += ggml_backend_sycl_get_device_count(); -#elif defined(GGML_USE_VULKAN) - count += ggml_backend_vk_get_device_count(); #elif defined(GGML_USE_CANN) count += ggml_backend_cann_get_device_count(); #endif @@ -3450,10 +3462,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode } #elif defined(GGML_USE_CPU_HBM) buft = ggml_backend_cpu_hbm_buffer_type(); -#elif defined(GGML_USE_VULKAN) - if (host_buffer) { - buft = ggml_backend_vk_host_buffer_type(); - } #endif if (buft == nullptr) { @@ -3472,9 +3480,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ } device -= (int)model.devices.size(); -#if defined(GGML_USE_VULKAN) - buft = ggml_backend_vk_buffer_type(device); -#elif defined(GGML_USE_SYCL) +#if defined(GGML_USE_SYCL) buft = ggml_backend_sycl_buffer_type(device); #elif defined(GGML_USE_CLBLAST) buft = ggml_backend_opencl_buffer_type(); @@ -3536,11 +3542,6 @@ static size_t llama_get_device_memory(const llama_model & model, int device) { size_t free; ggml_backend_sycl_get_device_memory(device, &free, &total); return free; -#elif defined(GGML_USE_VULKAN) - size_t total; - size_t free; - ggml_backend_vk_get_device_memory(device, &free, &total); - return free; #elif defined(GGML_USE_CANN) size_t total; size_t free; @@ -6197,14 +6198,14 @@ static void llm_load_vocab( vocab.type = LLAMA_VOCAB_TYPE_NONE; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; - vocab.linefeed_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; + vocab.linefeed_id = LLAMA_TOKEN_NULL; // read vocab size from metadata if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) { @@ -6221,16 +6222,16 @@ static void llm_load_vocab( vocab.special_bos_id = 1; vocab.special_eos_id = 2; vocab.special_unk_id = 0; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; } else if (tokenizer_model == "bert") { vocab.type = LLAMA_VOCAB_TYPE_WPM; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; vocab.special_unk_id = 100; vocab.special_sep_id = 102; vocab.special_pad_id = 0; @@ -6275,22 +6276,22 @@ static void llm_load_vocab( // default special tokens vocab.special_bos_id = 11; vocab.special_eos_id = 11; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; } else if (tokenizer_model == "t5") { vocab.type = LLAMA_VOCAB_TYPE_UGM; // default special tokens - vocab.special_bos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; vocab.special_eos_id = 1; vocab.special_unk_id = 2; - vocab.special_sep_id = -1; + vocab.special_sep_id = LLAMA_TOKEN_NULL; vocab.special_pad_id = 0; - vocab.special_cls_id = -1; - vocab.special_mask_id = -1; + vocab.special_cls_id = LLAMA_TOKEN_NULL; + vocab.special_mask_id = LLAMA_TOKEN_NULL; const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str()); if (precompiled_charsmap_keyidx != -1) { @@ -6313,11 +6314,11 @@ static void llm_load_vocab( vocab.type = LLAMA_VOCAB_TYPE_RWKV; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; + vocab.special_eos_id = LLAMA_TOKEN_NULL; + vocab.special_unk_id = LLAMA_TOKEN_NULL; + vocab.special_sep_id = LLAMA_TOKEN_NULL; + vocab.special_pad_id = LLAMA_TOKEN_NULL; } else { throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str())); } @@ -6401,7 +6402,7 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "chatglm-bpe") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4; - vocab.special_bos_id = -1; + vocab.special_bos_id = LLAMA_TOKEN_NULL; } else if ( tokenizer_pre == "viking") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; @@ -6528,44 +6529,6 @@ static void llm_load_vocab( // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (vocab.type == LLAMA_VOCAB_TYPE_SPM) { - // For Fill-In-the-Middle (FIM)/infill models which where converted - // prior to support of FIM special tokens in GGUF, the following - // will allow those models to continue to work. The general names - // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and - // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once - // new versions of these models have been published. - std::string gen_name; - ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false); - - std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(), - [](unsigned char c){ return std::tolower(c); }); - - if (gen_name.find("code") != std::string::npos) { - if (model.arch == LLM_ARCH_LLAMA - && 32010 < vocab.id_to_token.size() - && vocab.id_to_token[32007].text.find("
") != std::string::npos
-              && vocab.id_to_token[32008].text.find("") != std::string::npos
-              && vocab.id_to_token[32009].text.find("") != std::string::npos
-              && vocab.id_to_token[32010].text.find("") != std::string::npos) {
-                vocab.special_prefix_id = 32007;
-                vocab.special_suffix_id = 32008;
-                vocab.special_middle_id = 32009;
-                vocab.special_eot_id    = 32010;
-            } else if (model.arch == LLM_ARCH_GEMMA
-              && 107 < vocab.id_to_token.size()
-              && vocab.id_to_token[67].text == "<|fim_prefix|>"
-              && vocab.id_to_token[69].text == "<|fim_suffix|>"
-              && vocab.id_to_token[68].text == "<|fim_middle|>"
-              && vocab.id_to_token[107].text == "") {
-                vocab.special_prefix_id = 67;
-                vocab.special_suffix_id = 69;
-                vocab.special_middle_id = 68;
-                // TODO: this is not EOT, it is "file separator" token, needs fix
-                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
-                //vocab.special_eot_id    = 70;
-                vocab.special_eot_id    = 107;
-            }
-        }
         try {
             vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
         } catch (const std::exception & e) {
@@ -6593,18 +6556,26 @@ static void llm_load_vocab(
     // special tokens
     {
         const std::vector> special_token_types = {
-            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
-            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
-            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
-            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
-            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
-            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
-            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
-            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
-            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
-            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
-            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
-            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
+            { LLM_KV_TOKENIZER_BOS_ID,     vocab.special_bos_id     },
+            { LLM_KV_TOKENIZER_EOS_ID,     vocab.special_eos_id     },
+            { LLM_KV_TOKENIZER_EOT_ID,     vocab.special_eot_id     },
+            { LLM_KV_TOKENIZER_EOM_ID,     vocab.special_eom_id     },
+            { LLM_KV_TOKENIZER_UNK_ID,     vocab.special_unk_id     },
+            { LLM_KV_TOKENIZER_SEP_ID,     vocab.special_sep_id     },
+            { LLM_KV_TOKENIZER_PAD_ID,     vocab.special_pad_id     },
+            { LLM_KV_TOKENIZER_CLS_ID,     vocab.special_cls_id     },
+            { LLM_KV_TOKENIZER_MASK_ID,    vocab.special_mask_id    },
+            { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
+            { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
+            { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
+            { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
+
+            // deprecated
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
         };
 
         for (const auto & it : special_token_types) {
@@ -6635,46 +6606,140 @@ static void llm_load_vocab(
             }
         }
 
-        // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc.
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
-        //       for now, we apply this workaround to find the EOT token based on its text
-        if (vocab.special_eot_id == -1) {
-            for (const auto & t : vocab.token_to_id) {
+        // auto-detect special tokens by text
+        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+        //       for now, we apply this workaround to find the tokens based on their text
+
+        for (const auto & t : vocab.token_to_id) {
+            // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc.
+            if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
                 if (false
-                        // TODO: gemma "" is exported as a normal token, so the following check does not work
-                        //       need to fix convert script
-                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
                         || t.first == "<|eot_id|>"
                         || t.first == "<|im_end|>"
                         || t.first == "<|end|>"
                         || t.first == ""
                         || t.first == "<|endoftext|>"
                         || t.first == ""
+                        || t.first == "<|end▁of▁sentence|>" // DeepSeek
                    ) {
                     vocab.special_eot_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                                __func__, t.first.c_str());
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
-                    break;
                 }
             }
-        }
 
-        // find EOM token: "<|eom_id|>"
-        //
-        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
-        //       for now, we apply this workaround to find the EOM token based on its text
-        if (vocab.special_eom_id == -1) {
-            const auto & t = vocab.token_to_id.find("<|eom_id|>");
-            if (t != vocab.token_to_id.end()) {
-                vocab.special_eom_id = t->second;
-                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                        __func__, t->first.c_str());
-                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+            // find EOM token: "<|eom_id|>"
+            if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eom_id|>"
+                        ) {
+                    vocab.special_eom_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PRE token: "<|fim_prefix|>", "", "
", etc.
+            if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_prefix|>"  // Qwen
+                        || t.first == ""
+                        || t.first == "<|fim▁begin|>" // DeepSeek
+                        || t.first == "
"
+                        ) {
+                    vocab.special_fim_pre_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SUF token: "<|fim_suffix|>", "", "", etc.
+            if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_suffix|>" // Qwen
+                        || t.first == ""
+                        || t.first == "<|fim▁hole|>" // DeepSeek
+                        || t.first == ""
+                        ) {
+                    vocab.special_fim_suf_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_MID token: "<|fim_middle|>", "", "", etc.
+            if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_middle|>" // Qwen
+                        || t.first == ""
+                        || t.first == "<|fim▁end|>"  // DeepSeek
+                        || t.first == ""
+                        ) {
+                    vocab.special_fim_mid_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_PAD token: "<|fim_pad|>", "", "", etc.
+            if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_pad|>" // Qwen
+                        || t.first == ""
+                        || t.first == ""
+                        ) {
+                    vocab.special_fim_pad_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_REP token: "<|fim_repo|>", "", "", etc.
+            if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_repo|>"  // Qwen
+                        || t.first == "<|repo_name|>"
+                        || t.first == ""
+                        || t.first == ""
+                        ) {
+                    vocab.special_fim_rep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
+                }
+            }
+
+            // find FIM_SEP token: "<|file_sep|>"
+            if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|file_sep|>" // Qwen
+                        ) {
+                    vocab.special_fim_sep_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                                __func__, t.second, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                 }
             }
         }
@@ -6683,6 +6748,19 @@ static void llm_load_vocab(
         // this is currently determined based on the token text, which is obviously not ideal
         // ref: https://github.com/ggerganov/llama.cpp/issues/9606
         vocab.special_eog_ids.clear();
+
+        if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
+        }
+
+        if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
+        }
+
+        if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
+        }
+
         for (const auto & t : vocab.token_to_id) {
             if (false
                     || t.first == "<|eot_id|>"
@@ -6695,24 +6773,31 @@ static void llm_load_vocab(
                ) {
                 vocab.special_eog_ids.insert(t.second);
                 if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
-                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
-                            __func__, t.first.c_str());
+                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
                     vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                 }
+            } else {
+                // token is control, but not marked as EOG -> print a warning
+                if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
+                    LLAMA_LOG_WARN("%s: control token: %6d '%s' is not marked as EOG\n",
+                            __func__, t.second, t.first.c_str());
+                }
             }
         }
 
-        if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+        // sanity checks
+        if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eos_id);
             LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+        if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eot_id);
             LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
 
-        if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+        if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
             vocab.special_eog_ids.insert(vocab.special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
@@ -6906,20 +6991,24 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id    != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,  vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
-    if (vocab.special_eos_id    != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,  vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
-    if (vocab.special_unk_id    != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,  vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
-    if (vocab.special_sep_id    != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,  vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
-    if (vocab.special_pad_id    != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,  vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
-    if (vocab.special_cls_id    != -1) { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,  vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
-    if (vocab.special_mask_id   != -1) { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
+    if (vocab.special_bos_id  != -1)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,     vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
+    if (vocab.special_eos_id  != -1)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,     vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
+    if (vocab.special_eot_id  != -1)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,     vocab.id_to_token[vocab.special_eot_id].text.c_str() );  }
+    if (vocab.special_eom_id  != -1)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,     vocab.id_to_token[vocab.special_eom_id].text.c_str() );  }
+    if (vocab.special_unk_id  != -1)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,     vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
+    if (vocab.special_sep_id  != -1)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,     vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
+    if (vocab.special_pad_id  != -1)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,     vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
+    if (vocab.special_cls_id  != -1)    { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,     vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
+    if (vocab.special_mask_id != -1)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id,    vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
 
-    if (vocab.linefeed_id       != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,       vocab.id_to_token[vocab.linefeed_id].text.c_str() );       }
-    if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token        = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
-    if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token        = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
-    if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token        = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
-    if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,    vocab.id_to_token[vocab.special_eot_id].text.c_str() );    }
-    if (vocab.special_eom_id    != -1) { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,    vocab.id_to_token[vocab.special_eom_id].text.c_str() );    }
+    if (vocab.linefeed_id != -1)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,        vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
+
+    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
+    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
+    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
+    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
+    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
+    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
 
     for (const auto & id : vocab.special_eog_ids) {
         LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
@@ -16061,9 +16150,11 @@ struct llm_build_context {
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
 
         cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
-        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_norm", -1);
 
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output", -1);
+
         ggml_build_forward_expand(gf, cur);
 
         return gf;
@@ -19059,7 +19150,7 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_CLBLAST) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_CLBLAST) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
@@ -19191,8 +19282,13 @@ struct llama_model * llama_load_model_from_file(
 
             case GGML_BACKEND_DEVICE_TYPE_GPU:
             case GGML_BACKEND_DEVICE_TYPE_GPU_FULL:
+            {
+                size_t free, total; // NOLINT
+                ggml_backend_dev_memory(dev, &free, &total);
+                LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
                 model->devices.push_back(dev);
                 break;
+            }
         }
     }
 
@@ -19387,32 +19483,7 @@ struct llama_context * llama_new_context_with_model(
             main_gpu -= (int)model->devices.size();
         }
 
-#if defined(GGML_USE_VULKAN)
-        if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-            LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
-            llama_free(ctx);
-            return nullptr;
-        }
-        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-            ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        } else {
-            for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
-                ggml_backend_t backend = ggml_backend_vk_init(device);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#elif defined(GGML_USE_SYCL)
+#if defined(GGML_USE_SYCL)
         // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             ggml_backend_t backend = ggml_backend_sycl_init(main_gpu);
@@ -19523,7 +19594,7 @@ struct llama_context * llama_new_context_with_model(
             }
 
             LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
-                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                      (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                 ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                 ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }
@@ -21387,6 +21458,10 @@ llama_token llama_token_eos(const struct llama_model * model) {
     return llama_token_eos_impl(model->vocab);
 }
 
+llama_token llama_token_eot(const struct llama_model * model) {
+    return llama_token_eot_impl(model->vocab);
+}
+
 llama_token llama_token_cls(const struct llama_model * model) {
     return llama_token_cls_impl(model->vocab);
 }
@@ -21423,8 +21498,28 @@ llama_token llama_token_suffix(const struct llama_model * model) {
     return llama_token_suffix_impl(model->vocab);
 }
 
-llama_token llama_token_eot(const struct llama_model * model) {
-    return llama_token_eot_impl(model->vocab);
+llama_token llama_token_fim_pre(const struct llama_model * model) {
+    return llama_token_fim_pre_impl(model->vocab);
+}
+
+llama_token llama_token_fim_suf(const struct llama_model * model) {
+    return llama_token_fim_suf_impl(model->vocab);
+}
+
+llama_token llama_token_fim_mid(const struct llama_model * model) {
+    return llama_token_fim_mid_impl(model->vocab);
+}
+
+llama_token llama_token_fim_pad(const struct llama_model * model) {
+    return llama_token_fim_pad_impl(model->vocab);
+}
+
+llama_token llama_token_fim_rep(const struct llama_model * model) {
+    return llama_token_fim_rep_impl(model->vocab);
+}
+
+llama_token llama_token_fim_sep(const struct llama_model * model) {
+    return llama_token_fim_sep_impl(model->vocab);
 }
 
 //
@@ -21452,6 +21547,13 @@ int32_t llama_token_to_piece(
     return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
 }
 
+bool llama_token_is_prefix(
+    const struct llama_model * model,
+                 llama_token   token0,
+                 llama_token   token1) {
+    return llama_token_is_prefix_impl(model->vocab, token0, token1);
+}
+
 int32_t llama_detokenize(
     const struct llama_model * model,
            const llama_token * tokens,
@@ -21782,6 +21884,10 @@ struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * mod
     return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
 }
 
+struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
+    return llama_sampler_init_infill_impl(model->vocab);
+}
+
 //
 // model split
 //