From add2a3aa5a1571211aa5c7303b8e80c8d1824b91 Mon Sep 17 00:00:00 2001
From: Victor <194116445+dodekapod@users.noreply.github.com>
Date: Fri, 14 Mar 2025 11:21:17 +0100
Subject: [PATCH 1/5] server: fix "--grammar-file" parameter (#12285)

---
 examples/server/utils.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 36ad276fd..58cdd6af9 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse(
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;
-    llama_params["grammar"] = chat_params.grammar;
+    if (!chat_params.grammar.empty()) {
+        llama_params["grammar"] = chat_params.grammar;
+    }
     llama_params["grammar_lazy"] = chat_params.grammar_lazy;
     auto grammar_triggers = json::array();
     for (const auto & trigger : chat_params.grammar_triggers) {
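The guard above means the server only forwards "grammar" when the chat template actually produced one, so a grammar configured on the server side is no longer clobbered by an empty string; per the PR title, this is what makes --grammar-file usable again. For example (model and grammar file names illustrative):

    llama-server -m model.gguf --grammar-file json.gbnf

Chat-completion requests that do not carry a grammar of their own should now be constrained by that grammar as intended.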
From 8fcb563613e20a04dd9791f0a9b8a41086428c09 Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Fri, 14 Mar 2025 13:47:05 +0100
Subject: [PATCH 2/5] Load all MoE experts during warmup (#11571)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* llama : introduce llama_set_warmup() API call that controls warmup mode; use all MoE experts during warmup

* common : use new API to enable warmup mode during model warmup

---------

Co-authored-by: Stanisław Szymczyk
---
 common/common.cpp     |  3 +++
 include/llama.h       |  4 ++++
 src/llama-context.cpp | 13 ++++++++++++-
 src/llama-context.h   |  1 +
 src/llama-cparams.h   |  1 +
 src/llama-graph.cpp   |  2 +-
 6 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 8487e3834..18ffb4e73 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1033,6 +1033,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
+        llama_set_warmup(lctx, true);
+
         std::vector<llama_token> tmp;
         llama_token bos = llama_vocab_bos(vocab);
         llama_token eos = llama_vocab_eos(vocab);
@@ -1063,6 +1065,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_kv_self_clear(lctx);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
+        llama_set_warmup(lctx, false);
     }
 
     iparams.model.reset(model);
diff --git a/include/llama.h b/include/llama.h
index e5286f061..6a44be404 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -945,6 +945,10 @@ extern "C" {
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
 
+    // Set whether the model is in warmup mode or not
+    // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4df6b18ec..c2fcce42a 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -39,6 +39,7 @@ llama_context::llama_context(
     cparams.flash_attn   = params.flash_attn;
     cparams.no_perf      = params.no_perf;
     cparams.pooling_type = params.pooling_type;
+    cparams.warmup       = false;
 
     cparams.n_ctx            = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base   = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -948,6 +949,12 @@ void llama_context::set_causal_attn(bool value) {
     cparams.causal_attn = value;
 }
 
+void llama_context::set_warmup(bool value) {
+    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
+    cparams.warmup = value;
+}
+
 void llama_context::set_adapter_lora(
             llama_adapter_lora * adapter,
             float scale) {
@@ -1594,7 +1601,7 @@ void llama_context::output_reorder() {
 //
 
 int32_t llama_context::graph_max_nodes() const {
-    return std::max(8192, 5*model.n_tensors());
+    return std::max(65536, 5*model.n_tensors());
 }
 
 ggml_cgraph * llama_context::graph_init() {
@@ -2372,6 +2379,10 @@ void llama_set_causal_attn(llama_context * ctx, bool causal_attn) {
     ctx->set_causal_attn(causal_attn);
 }
 
+void llama_set_warmup(llama_context * ctx, bool warmup) {
+    ctx->set_warmup(warmup);
+}
+
 void llama_synchronize(llama_context * ctx) {
     ctx->synchronize();
 }
diff --git a/src/llama-context.h b/src/llama-context.h
index 88df8950e..04facb544 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -64,6 +64,7 @@ struct llama_context {
 
     void set_embeddings (bool value);
     void set_causal_attn(bool value);
+    void set_warmup(bool value);
 
     void set_adapter_lora(
             llama_adapter_lora * adapter,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 252012f3d..30e550f02 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@ struct llama_cparams {
     bool offload_kqv;
     bool flash_attn;
     bool no_perf;
+    bool warmup;
 
     enum llama_pooling_type pooling_type;
 
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index e4af50778..4e9087339 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -577,7 +577,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     n_embd_head_v    (hparams.n_embd_head_v),
     n_embd_v_gqa     (hparams.n_embd_v_gqa()),
     n_expert         (hparams.n_expert),
-    n_expert_used    (hparams.n_expert_used),
+    n_expert_used    (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
     freq_base        (cparams.rope_freq_base),
     freq_scale       (cparams.rope_freq_scale),
     ext_factor       (cparams.yarn_ext_factor),
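Outside of common_init_from_params(), the new call is meant to be wrapped around a throwaway decode in the same way. A minimal sketch (assumes ctx and vocab come from an already loaded model; the BOS/EOS availability checks and batch-size clamping done in common.cpp are omitted, and the helper name is illustrative):

    #include <vector>
    #include "llama.h"

    // Run one dummy decode in warmup mode so that MoE models activate
    // (and therefore load and cache) every expert, then restore normal decoding.
    static void warmup(llama_context * ctx, const llama_vocab * vocab) {
        llama_set_warmup(ctx, true);

        std::vector<llama_token> tmp = { llama_vocab_bos(vocab) };
        llama_decode(ctx, llama_batch_get_one(tmp.data(), (int32_t) tmp.size()));

        llama_kv_self_clear(ctx);
        llama_synchronize(ctx);
        llama_perf_context_reset(ctx);
        llama_set_warmup(ctx, false);
    }

Making warmup a context flag rather than a decode-time parameter is what lets llm_graph_context above switch n_expert_used to n_expert without touching the decode API.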
From 774973b8f3d5e375b0b74d58638eeb1817e950a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Fri, 14 Mar 2025 16:57:05 +0100
Subject: [PATCH 3/5] main : add -sysf / --system-prompt-file (#12249) (#12250)

* add system_prompt_file

* add -sysf / --system-prompt-file

* remove system_prompt_file

---
 common/arg.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/common/arg.cpp b/common/arg.cpp
index fe6a1eece..240c699a2 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -853,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
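The new handler slurps the whole file into params.system_prompt and trims at most one trailing newline. The same idiom as a standalone helper, for reference (a sketch; the helper name and error text are illustrative, not part of the patch):

    #include <fstream>
    #include <iterator>
    #include <stdexcept>
    #include <string>

    // Read an entire file into a string and drop a single trailing newline,
    // mirroring the -sysf / --system-prompt-file handler above.
    static std::string read_prompt_file(const std::string & path) {
        std::ifstream file(path);
        if (!file) {
            throw std::runtime_error("failed to open file '" + path + "'");
        }
        std::string text{std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>()};
        if (!text.empty() && text.back() == '\n') {
            text.pop_back();
        }
        return text;
    }

Since the option is registered with set_examples({LLAMA_EXAMPLE_MAIN}), it is only exposed by llama-cli, e.g. llama-cli -m model.gguf -sysf system.txt (file names illustrative).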
From 9f2250ba722738ec0e6ab684636268a79160c854 Mon Sep 17 00:00:00 2001
From: Eric Curtin
Date: Fri, 14 Mar 2025 16:41:20 +0000
Subject: [PATCH 4/5] Add CLI arg to llama-run to adjust the number of threads used (#12370)

We default to 4; sometimes we want to adjust this manually.

Signed-off-by: Eric Curtin
---
 examples/run/run.cpp | 131 +++++++++++++++++++++++++++++--------------
 1 file changed, 88 insertions(+), 43 deletions(-)

diff --git a/examples/run/run.cpp b/examples/run/run.cpp
index 437f2533e..462a6d151 100644
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@@ -79,6 +79,7 @@ class Opt {
         ctx_params = llama_context_default_params();
         model_params = llama_model_default_params();
         context_size_default = ctx_params.n_batch;
+        n_threads_default = ctx_params.n_threads;
         ngl_default = model_params.n_gpu_layers;
         common_params_sampling sampling;
         temperature_default = sampling.temp;
@@ -104,6 +105,7 @@ class Opt {
 
         ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default;
         ctx_params.n_ctx = ctx_params.n_batch;
+        ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default;
         model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default;
         temperature = temperature >= 0 ? temperature : temperature_default;
 
@@ -116,12 +118,12 @@ class Opt {
     std::string chat_template_file;
     std::string user;
     bool use_jinja = false;
-    int context_size = -1, ngl = -1;
+    int context_size = -1, ngl = -1, n_threads = -1;
     float temperature = -1;
     bool verbose = false;
 
   private:
-    int context_size_default = -1, ngl_default = -1;
+    int context_size_default = -1, ngl_default = -1, n_threads_default = -1;
     float temperature_default = -1;
     bool help = false;
 
@@ -159,53 +161,94 @@ class Opt {
         return 0;
     }
 
+    int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) {
+        if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+            if (handle_option_with_value(argc, argv, i, context_size) == 1) {
+                return 1;
+            }
+        } else if (options_parsing &&
+                   (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+            if (handle_option_with_value(argc, argv, i, ngl) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) {
+            if (handle_option_with_value(argc, argv, i, n_threads) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
+            if (handle_option_with_value(argc, argv, i, temperature) == 1) {
+                return 1;
+            }
+        } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) {
+            if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
+                return 1;
+            }
+            use_jinja = true;
+        } else {
+            return 2;
+        }
+
+        return 0;
+    }
+
+    int parse_options(const char ** argv, int & i, bool & options_parsing) {
+        if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+            verbose = true;
+        } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
+            use_jinja = true;
+        } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+            help = true;
+            return 0;
+        } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+            options_parsing = false;
+        } else {
+            return 2;
+        }
+
+        return 0;
+    }
+
+    int parse_positional_args(const char ** argv, int & i, int & positional_args_i) {
+        if (positional_args_i == 0) {
+            if (!argv[i][0] || argv[i][0] == '-') {
+                return 1;
+            }
+
+            ++positional_args_i;
+            model_ = argv[i];
+        } else if (positional_args_i == 1) {
+            ++positional_args_i;
+            user = argv[i];
+        } else {
+            user += " " + std::string(argv[i]);
+        }
+
+        return 0;
+    }
+
     int parse(int argc, const char ** argv) {
         bool options_parsing = true;
         for (int i = 1, positional_args_i = 0; i < argc; ++i) {
-            if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
-                if (handle_option_with_value(argc, argv, i, context_size) == 1) {
-                    return 1;
-                }
-            } else if (options_parsing &&
-                       (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
-                if (handle_option_with_value(argc, argv, i, ngl) == 1) {
-                    return 1;
-                }
-            } else if (options_parsing && strcmp(argv[i], "--temp") == 0) {
-                if (handle_option_with_value(argc, argv, i, temperature) == 1) {
-                    return 1;
-                }
-            } else if (options_parsing &&
-                       (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
-                verbose = true;
-            } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
-                use_jinja = true;
-            } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){
-                if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) {
-                    return 1;
-                }
-                use_jinja = true;
-            } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
-                help = true;
-                return 0;
-            } else if (options_parsing && strcmp(argv[i], "--") == 0) {
-                options_parsing = false;
-            } else if (positional_args_i == 0) {
-                if (!argv[i][0] || argv[i][0] == '-') {
-                    return 1;
-                }
+            int ret = parse_options_with_value(argc, argv, i, options_parsing);
+            if (ret == 0) {
+                continue;
+            } else if (ret == 1) {
+                return ret;
+            }
 
-                ++positional_args_i;
-                model_ = argv[i];
-            } else if (positional_args_i == 1) {
-                ++positional_args_i;
-                user = argv[i];
-            } else {
-                user += " " + std::string(argv[i]);
+            ret = parse_options(argv, i, options_parsing);
+            if (ret == 0) {
+                continue;
+            } else if (ret == 1) {
+                return ret;
+            }
+
+            if (parse_positional_args(argv, i, positional_args_i)) {
+                return 1;
             }
         }
 
-        if (model_.empty()){
+        if (model_.empty()) {
            return 1;
        }
 
@@ -232,6 +275,8 @@ class Opt {
             "      Number of GPU layers (default: %d)\n"
             "  --temp <value>\n"
             "      Temperature (default: %.1f)\n"
+            "  -t, --threads <value>\n"
+            "      Number of threads to use during generation (default: %d)\n"
             "  -v, --verbose, --log-verbose\n"
             "      Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
             "  -h, --help\n"
@@ -260,7 +305,7 @@ class Opt {
             "  llama-run file://some-file3.gguf\n"
             "  llama-run --ngl 999 some-file4.gguf\n"
             "  llama-run --ngl 999 some-file5.gguf Hello World\n",
-            context_size_default, ngl_default, temperature_default);
+            context_size_default, ngl_default, temperature_default, n_threads_default);
     }
 };
 
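With the option in place, the generation thread count can be overridden per invocation instead of always taking the llama_context_default_params() value (4 by default, per the commit message). For example, in the style of the Examples section above (file name illustrative):

    llama-run -t 8 --ngl 999 some-file5.gguf Hello World

Leaving -t out keeps the previous behaviour: n_threads stays at -1, so n_threads_default is applied to both n_threads and n_threads_batch.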
From 92a391327e9201b9b5b32fdd3afb452026c22d4c Mon Sep 17 00:00:00 2001
From: Chenguang Li <757486878@qq.com>
Date: Sat, 15 Mar 2025 09:31:08 +0800
Subject: [PATCH 5/5] [CANN]MUL_MAT optimization (#12382)

---
 ggml/src/ggml-cann/aclnn_ops.cpp | 8 ++++++--
 ggml/src/ggml-cann/ggml-cann.cpp | 5 -----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index b2d857e1e..6bb5d0834 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2790,10 +2790,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
             (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
             output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
             output_ne_offset);
+        int64_t antiquantGroupSize = 0;
+        if (src0->ne[0] > QK8_0) {
+            antiquantGroupSize = QK8_0;
+        }
 
         ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
             acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-            nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
+            nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
             &workspaceSize, &executor));
         if (workspaceAddr == nullptr) {
             workspaceAddr = workspace_allocator.alloc(workspaceSize);
@@ -2833,7 +2837,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
 
         ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
             acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
-            nullptr, nullptr, nullptr, nullptr, QK8_0,
+            nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
             acl_output_tensor, &workspaceSize, &executor));
         ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
             workspaceAddr, workspaceSize, executor, ctx.stream()));
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index b8d272cda..68cd9920d 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1689,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
                 case GGML_TYPE_Q8_0:
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
-                    if (op->src[0]->ne[0] <= QK8_0) {
-                        return false;
-                    }
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
                 case GGML_TYPE_Q4_0: