diff --git a/common/arg.cpp b/common/arg.cpp
index 66dd5aa88..573ec06a6 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -854,6 +854,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
diff --git a/common/common.cpp b/common/common.cpp
index 7d4f72057..20ef8aa8b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1037,6 +1037,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
+        llama_set_warmup(lctx, true);
+
         std::vector<llama_token> tmp;
         llama_token bos = llama_vocab_bos(vocab);
         llama_token eos = llama_vocab_eos(vocab);
@@ -1067,6 +1069,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_kv_self_clear(lctx);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
+        llama_set_warmup(lctx, false);
     }
 
     iparams.model.reset(model);
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 36ad276fd..58cdd6af9 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse(
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;
-    llama_params["grammar"] = chat_params.grammar;
+    if (!chat_params.grammar.empty()) {
+        llama_params["grammar"] = chat_params.grammar;
+    }
     llama_params["grammar_lazy"] = chat_params.grammar_lazy;
     auto grammar_triggers = json::array();
     for (const auto & trigger : chat_params.grammar_triggers) {
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index b8d272cda..68cd9920d 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1689,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
                 case GGML_TYPE_Q8_0:
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
-                    if (op->src[0]->ne[0] <= QK8_0) {
-                        return false;
-                    }
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
                 case GGML_TYPE_Q4_0:
diff --git a/include/llama.h b/include/llama.h
index 58b3c01f8..fd183c0d5 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -947,6 +947,10 @@ extern "C" {
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
 
+    // Set whether the model is in warmup mode or not
+    // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index a88dae736..cf9181cc3 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -39,6 +39,7 @@ llama_context::llama_context(
     cparams.flash_attn   = params.flash_attn;
     cparams.no_perf      = params.no_perf;
     cparams.pooling_type = params.pooling_type;
+    cparams.warmup       = false;
 
     cparams.n_ctx           = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
     cparams.rope_freq_base  = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
@@ -949,6 +950,12 @@ void llama_context::set_causal_attn(bool value) {
     cparams.causal_attn = value;
 }
 
+void llama_context::set_warmup(bool value) {
+    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
+    cparams.warmup = value;
+}
+
 void llama_context::set_adapter_lora(
         llama_adapter_lora * adapter,
         float scale) {
@@ -1595,7 +1602,7 @@ void llama_context::output_reorder() {
 //
 
 int32_t llama_context::graph_max_nodes() const {
-    return std::max(8192, 5*model.n_tensors());
+    return std::max(65536, 5*model.n_tensors());
 }
 
 ggml_cgraph * llama_context::graph_init() {
@@ -2373,6 +2380,10 @@ void llama_set_causal_attn(llama_context * ctx, bool causal_attn) {
     ctx->set_causal_attn(causal_attn);
 }
 
+void llama_set_warmup(llama_context * ctx, bool warmup) {
+    ctx->set_warmup(warmup);
+}
+
 void llama_synchronize(llama_context * ctx) {
     ctx->synchronize();
 }
diff --git a/src/llama-context.h b/src/llama-context.h
index 88df8950e..04facb544 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -64,6 +64,7 @@ struct llama_context {
 
     void set_embeddings (bool value);
     void set_causal_attn(bool value);
+    void set_warmup(bool value);
 
     void set_adapter_lora(
             llama_adapter_lora * adapter,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 252012f3d..30e550f02 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@ struct llama_cparams {
     bool offload_kqv;
     bool flash_attn;
     bool no_perf;
+    bool warmup;
 
     enum llama_pooling_type pooling_type;
 
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 42a6722aa..d21b89d04 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -577,7 +577,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     n_embd_head_v    (hparams.n_embd_head_v),
     n_embd_v_gqa     (hparams.n_embd_v_gqa()),
     n_expert         (hparams.n_expert),
-    n_expert_used    (hparams.n_expert_used),
+    n_expert_used    (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
     freq_base        (cparams.rope_freq_base),
     freq_scale       (cparams.rope_freq_scale),
     ext_factor       (cparams.yarn_ext_factor),
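Illustrative note (not part of the patch): the sketch below shows how a caller might drive the new llama_set_warmup() toggle around a throwaway decode, mirroring the common_init_from_params() change above. The helper name warmup_context is hypothetical, and an already-initialized llama_model/llama_context pair is assumed.

// Minimal sketch, assuming `model` and `lctx` were created via the usual
// llama_model_load_from_file() / llama_init_from_model() path.
#include <vector>
#include "llama.h"

static void warmup_context(llama_model * model, llama_context * lctx) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // activate all tensors during the next decode (e.g. every MoE expert),
    // so their weights get loaded and cached
    llama_set_warmup(lctx, true);

    std::vector<llama_token> tmp;
    llama_token bos = llama_vocab_bos(vocab);
    llama_token eos = llama_vocab_eos(vocab);
    if (bos != LLAMA_TOKEN_NULL) { tmp.push_back(bos); }
    if (eos != LLAMA_TOKEN_NULL) { tmp.push_back(eos); }
    if (tmp.empty())             { tmp.push_back(0);   }

    // dummy decode whose results are thrown away
    llama_decode(lctx, llama_batch_get_one(tmp.data(), (int32_t) tmp.size()));

    // discard the dummy tokens and reset timings before real use
    llama_kv_self_clear(lctx);
    llama_synchronize(lctx);
    llama_perf_context_reset(lctx);

    // back to normal decoding with the configured number of experts
    llama_set_warmup(lctx, false);
}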