llama-server: fix k-shift when output exceeds context length

Author: Li, Zonghang
Date:   2025-07-17 21:03:41 +08:00
Parent: f032680cab
Commit: bdf9d8e74b

3 changed files with 21 additions and 14 deletions


@@ -2037,6 +2037,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.chat_template = value;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    add_opt(llama_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](gpt_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SYSTEM_PROMPT"));
     add_opt(llama_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),