diff --git a/common/arg.cpp b/common/arg.cpp
index f2c2255c4..2487356fc 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1534,7 +1534,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
         [](common_params & params) {
             params.ctx_shift = true;
         }
diff --git a/common/chat.cpp b/common/chat.cpp
index cb2f40523..a7e4bb4c7 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -147,6 +147,7 @@ struct templates_params {
     json extra_context;
     bool add_bos;
     bool add_eos;
+    bool is_inference = true;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1336,6 +1337,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
 
+    // Check if we need to replace the return token with end token during
+    // inference and without generation prompt. For more details see:
+    // https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;
 
diff --git a/common/common.h b/common/common.h
index 5b0502ac2..4d8111b1b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -371,7 +371,7 @@ struct common_params {
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift = false; // context shift on inifinite text generation
+    bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
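
For reference, a minimal standalone sketch of the trailing-token swap added in common/chat.cpp: it replaces only the last "<|return|>" with "<|end|>", mirroring the rfind-based logic in the hunk above. The helper name and the sample prompt string here are illustrative stand-ins, not part of the patch.

#include <cstdio>
#include <string>
#include <string_view>

// Replace the last "<|return|>" occurrence with "<|end|>"; any earlier
// occurrences in the prompt are left untouched. Helper name is hypothetical.
static void replace_trailing_return_token(std::string & prompt) {
    static constexpr std::string_view return_token = "<|return|>";
    static constexpr std::string_view end_token    = "<|end|>";
    if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
        prompt.replace(pos, return_token.length(), end_token);
    }
}

int main() {
    // Illustrative prompt ending, not a real template rendering.
    std::string prompt = "<|start|>assistant<|message|>Hi there!<|return|>";
    replace_trailing_return_token(prompt);
    std::printf("%s\n", prompt.c_str()); // prints: ...Hi there!<|end|>
    return 0;
}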