mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-10 04:00:53 +00:00
Merge commit 'fec9519802' into concedo_experimental
# Conflicts: # Makefile # examples/lookahead/README.md # tools/server/CMakeLists.txt
This commit is contained in:
commit
90706ddb14
3 changed files with 14 additions and 2 deletions
|
|
@ -1534,7 +1534,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
||||
add_opt(common_arg(
|
||||
{"--context-shift"},
|
||||
string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
||||
string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
|
||||
[](common_params & params) {
|
||||
params.ctx_shift = true;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -147,6 +147,7 @@ struct templates_params {
|
|||
json extra_context;
|
||||
bool add_bos;
|
||||
bool add_eos;
|
||||
bool is_inference = true;
|
||||
};
|
||||
|
||||
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
|
||||
|
|
@ -1336,6 +1337,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
|||
common_chat_params data;
|
||||
auto prompt = apply(tmpl, inputs);
|
||||
|
||||
// Check if we need to replace the return token with end token during
|
||||
// inference and without generation prompt. For more details see:
|
||||
// https://github.com/ggml-org/llama.cpp/issues/15417
|
||||
if (inputs.is_inference && !inputs.add_generation_prompt) {
|
||||
static constexpr std::string_view return_token = "<|return|>";
|
||||
static constexpr std::string_view end_token = "<|end|>";
|
||||
if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
|
||||
prompt.replace(pos, return_token.length(), end_token);
|
||||
}
|
||||
}
|
||||
|
||||
data.prompt = prompt;
|
||||
data.format = COMMON_CHAT_FORMAT_GPT_OSS;
|
||||
|
||||
|
|
|
|||
|
|
@ -371,7 +371,7 @@ struct common_params {
|
|||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||
bool flash_attn = false; // flash attention
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool ctx_shift = false; // context shift on inifinite text generation
|
||||
bool ctx_shift = false; // context shift on infinite text generation
|
||||
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
||||
bool kv_unified = false; // enable unified KV cache
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue