Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	examples/run/run.cpp
#	ggml/src/ggml-cann/aclnn_ops.cpp

commit 67851e5415
9 changed files with 39 additions and 8 deletions

@@ -854,6 +854,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
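The new `-sysf` handler above reads the whole file and trims at most one trailing newline before storing the result as the system prompt. As an illustration only (not part of the commit), here is a minimal standalone sketch of the same idiom; the helper name and the file name `system.txt` are hypothetical:

```cpp
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <string>

// Read an entire file into a string, then drop a single trailing newline,
// mirroring what the new --system-prompt-file handler does.
static std::string read_system_prompt(const std::string & path) {
    std::ifstream file(path);
    if (!file) {
        throw std::runtime_error("failed to open file '" + path + "'");
    }
    std::string prompt;
    std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), std::back_inserter(prompt));
    if (!prompt.empty() && prompt.back() == '\n') {
        prompt.pop_back();
    }
    return prompt;
}

int main() {
    std::cout << read_system_prompt("system.txt") << "\n"; // hypothetical file name
    return 0;
}
```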

@@ -1037,6 +1037,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
+        llama_set_warmup(lctx, true);
+
         std::vector<llama_token> tmp;
         llama_token bos = llama_vocab_bos(vocab);
         llama_token eos = llama_vocab_eos(vocab);

@@ -1067,6 +1069,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_kv_self_clear(lctx);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
+        llama_set_warmup(lctx, false);
     }
 
     iparams.model.reset(model);

@@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse(
 
     llama_params["chat_format"] = static_cast<int>(chat_params.format);
     llama_params["prompt"] = chat_params.prompt;
-    llama_params["grammar"] = chat_params.grammar;
+    if (!chat_params.grammar.empty()) {
+        llama_params["grammar"] = chat_params.grammar;
+    }
     llama_params["grammar_lazy"] = chat_params.grammar_lazy;
     auto grammar_triggers = json::array();
     for (const auto & trigger : chat_params.grammar_triggers) {

@@ -1689,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
                 case GGML_TYPE_Q8_0:
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
-                    if (op->src[0]->ne[0] <= QK8_0) {
-                        return false;
-                    }
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
                 case GGML_TYPE_Q4_0:

@@ -947,6 +947,10 @@ extern "C" {
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
 
+    // Set whether the model is in warmup mode or not
+    // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+    LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+
     // Set abort callback
     LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
 
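With the declaration above, a caller outside the library can drive the warmup pass itself. The following is only a hedged sketch of such usage, assuming a `llama_context`/`llama_model` pair has already been created; it mirrors the loop in `common_init_from_params()` shown earlier and uses the functions visible in this diff plus a few long-standing llama.h calls (`llama_model_get_vocab`, `llama_decode`, `llama_batch_get_one`):

```cpp
#include <algorithm>
#include <vector>

#include "llama.h"

// Hedged sketch: bracket an empty decode with llama_set_warmup() so that all
// model tensors (including every MoE expert, see the last hunk below) are
// touched once, then reset the context state afterwards.
static void warmup(llama_context * ctx, const llama_model * model, int32_t n_batch) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_set_warmup(ctx, true);

    std::vector<llama_token> tmp;
    const llama_token bos = llama_vocab_bos(vocab);
    const llama_token eos = llama_vocab_eos(vocab);
    if (bos != LLAMA_TOKEN_NULL) { tmp.push_back(bos); }
    if (eos != LLAMA_TOKEN_NULL) { tmp.push_back(eos); }
    if (tmp.empty())             { tmp.push_back(0);   }

    llama_decode(ctx, llama_batch_get_one(tmp.data(), std::min((int32_t) tmp.size(), n_batch)));

    llama_kv_self_clear(ctx);
    llama_synchronize(ctx);
    llama_perf_context_reset(ctx);
    llama_set_warmup(ctx, false);
}
```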

@@ -39,6 +39,7 @@ llama_context::llama_context(
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.pooling_type = params.pooling_type;
+    cparams.warmup = false;
 
     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;

@@ -949,6 +950,12 @@ void llama_context::set_causal_attn(bool value) {
     cparams.causal_attn = value;
 }
 
+void llama_context::set_warmup(bool value) {
+    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
+
+    cparams.warmup = value;
+}
+
 void llama_context::set_adapter_lora(
         llama_adapter_lora * adapter,
         float scale) {

@@ -1595,7 +1602,7 @@ void llama_context::output_reorder() {
 //
 
 int32_t llama_context::graph_max_nodes() const {
-    return std::max<int32_t>(8192, 5*model.n_tensors());
+    return std::max<int32_t>(65536, 5*model.n_tensors());
 }
 
 ggml_cgraph * llama_context::graph_init() {

@@ -2373,6 +2380,10 @@ void llama_set_causal_attn(llama_context * ctx, bool causal_attn) {
     ctx->set_causal_attn(causal_attn);
 }
 
+void llama_set_warmup(llama_context * ctx, bool warmup) {
+    ctx->set_warmup(warmup);
+}
+
 void llama_synchronize(llama_context * ctx) {
     ctx->synchronize();
 }

@@ -64,6 +64,7 @@ struct llama_context {
 
     void set_embeddings (bool value);
    void set_causal_attn(bool value);
+    void set_warmup(bool value);
 
     void set_adapter_lora(
             llama_adapter_lora * adapter,
|
@ -29,6 +29,7 @@ struct llama_cparams {
|
|||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
bool no_perf;
|
||||
bool warmup;
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
|
|
|
@ -577,7 +577,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
|||
n_embd_head_v (hparams.n_embd_head_v),
|
||||
n_embd_v_gqa (hparams.n_embd_v_gqa()),
|
||||
n_expert (hparams.n_expert),
|
||||
n_expert_used (hparams.n_expert_used),
|
||||
n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
|
||||
freq_base (cparams.rope_freq_base),
|
||||
freq_scale (cparams.rope_freq_scale),
|
||||
ext_factor (cparams.yarn_ext_factor),
|
||||
|
|
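This last hunk is what the warmup flag buys for mixture-of-experts models: while `cparams.warmup` is set, the graph is built as if every expert were active, so the empty warmup run touches and caches all expert weights. A toy illustration of that selection follows (illustrative values only, not from the commit):

```cpp
#include <cstdint>
#include <cstdio>

// warmup ? n_expert : n_expert_used, as in the llm_graph_context initializer above
static uint32_t experts_for_graph(bool warmup, uint32_t n_expert, uint32_t n_expert_used) {
    return warmup ? n_expert : n_expert_used;
}

int main() {
    // hypothetical MoE configuration: 8 experts, 2 routed per token
    std::printf("normal run: %u experts\n", experts_for_graph(false, 8, 2)); // 2
    std::printf("warmup run: %u experts\n", experts_for_graph(true,  8, 2)); // 8
    return 0;
}
```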