From 657b8a77bd01854f99d37a47318fa24f2e7e298f Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Wed, 20 Aug 2025 14:26:01 +0200
Subject: [PATCH] chat: handle gpt-oss return/end token inconsistency (#15421)

This commit addresses an inconsistency during inference by adding a new
member to the `templates_params` struct to indicate whether the chat is
in inference mode. This allows the gpt-oss specific function
`common_chat_params_init_gpt_oss` to check this flag and the
`add_generation_prompt` flag to determine if it should replace the
`<|return|>` token with the `<|end|>` token in the prompt.

The motivation for this change is to ensure that the formatted prompt of
past messages in `common_chat_format_single` matches the output of the
formatted new message. The issue is that the gpt-oss template returns
different end tags: `<|return|>` when `add_generation_prompt` is false,
and `<|end|>` when `add_generation_prompt` is true. This causes the
substring function to start at an incorrect position, resulting in
tokenization starting with 'tart|>' instead of '<|start|>'.

Resolves: https://github.com/ggml-org/llama.cpp/issues/15417
---
 common/chat.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/common/chat.cpp b/common/chat.cpp
index 5fe5643d3..7f6809a4e 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -147,6 +147,7 @@ struct templates_params {
     json extra_context;
     bool add_bos;
     bool add_eos;
+    bool is_inference = true;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1336,6 +1337,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
 
+    // Check if we need to replace the return token with end token during
+    // inference and without generation prompt. For more details see:
+    // https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;
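
For context, the following is a minimal standalone sketch (not part of the
patch) of why the mismatched end tags shift the substring offset described in
the commit message. The template strings are simplified stand-ins for the real
gpt-oss output, and the prefix-length substring step only approximates what
`common_chat_format_single` does; it is an illustration under those
assumptions, not the library code.

    #include <iostream>
    #include <string>

    int main() {
        // Past messages rendered on their own (add_generation_prompt == false):
        // the template ends the last message with <|return|>.
        std::string formatted_past = "<|start|>user<|message|>hi<|return|>";

        // The same past messages plus the new one rendered together
        // (add_generation_prompt == true): the shared prefix now ends with <|end|>.
        std::string formatted_full =
            "<|start|>user<|message|>hi<|end|>"
            "<|start|>assistant<|message|>hello<|end|>";

        // The single-message delta is obtained by skipping the length of the
        // previously formatted prefix. <|return|> (10 chars) is 3 chars longer
        // than <|end|> (7 chars), so the offset overshoots and the delta starts
        // with "tart|>" instead of "<|start|>".
        std::cout << formatted_full.substr(formatted_past.size()) << "\n";
        // prints: tart|>assistant<|message|>hello<|end|>

        // With the patch applied, the trailing <|return|> in the prefix is
        // rewritten to <|end|>, the lengths line up, and the delta starts at
        // <|start|> as expected.
        std::string return_token = "<|return|>";
        if (size_t pos = formatted_past.rfind(return_token); pos != std::string::npos) {
            formatted_past.replace(pos, return_token.length(), "<|end|>");
        }
        std::cout << formatted_full.substr(formatted_past.size()) << "\n";
        // prints: <|start|>assistant<|message|>hello<|end|>
        return 0;
    }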