Merge commit 'fec9519802' into concedo_experimental

# Conflicts: # Makefile # examples/lookahead/README.md # tools/server/CMakeLists.txt
2026-05-22 11:16:08 +00:00 · 2025-08-21 19:19:20 +08:00 · 2025-08-21 19:19:20 +08:00 · 90706ddb14
commit 90706ddb14
parent c13db49d5b fec9519802
3 changed files with 14 additions and 2 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1534,7 +1534,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
        [](common_params & params) {
            params.ctx_shift = true;
        }
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -147,6 +147,7 @@ struct templates_params {
    json extra_context;
    bool add_bos;
    bool add_eos;
+    bool is_inference = true;
 };

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1336,6 +1337,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
    common_chat_params data;
    auto prompt = apply(tmpl, inputs);

+    // Check if we need to replace the return token with end token during
+    // inference and without generation prompt. For more details see:
+    // https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token    = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
    data.prompt = prompt;
    data.format = COMMON_CHAT_FORMAT_GPT_OSS;

--- a/common/common.h
+++ b/common/common.h
@@ -371,7 +371,7 @@ struct common_params {
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool flash_attn        = false; // flash attention
    bool no_perf           = false; // disable performance metrics
-    bool ctx_shift         = false;  // context shift on inifinite text generation
+    bool ctx_shift         = false;  // context shift on infinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified        = false; // enable unified KV cache