llama-server: fix k-shift when output exceeds context length

Author: Li, Zonghang
Date:   2025-07-17 21:03:41 +08:00
Parent: f032680cab
Commit: bdf9d8e74b

3 changed files with 21 additions and 14 deletions


@@ -2037,6 +2037,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.chat_template = value;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    add_opt(llama_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](gpt_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SYSTEM_PROMPT"));
     add_opt(llama_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),