chat: handle gpt-oss return/end token inconsistency (#15421)

This commit addresses an inconsistency during inference by adding a new
member to the `templates_params` struct that indicates whether the chat is
in inference mode. The gpt-oss specific function
`common_chat_params_init_gpt_oss` checks this flag together with the
`add_generation_prompt` flag to determine whether it should replace the
`<|return|>` token with the `<|end|>` token in the prompt.

The motivation for this change is to ensure that the formatted prompt of
past messages in `common_chat_format_single` matches the output of the
formatted new message. The issue is that the gpt-oss template emits
different end tags: `<|return|>` when `add_generation_prompt` is false,
and `<|end|>` when `add_generation_prompt` is true. Since `<|return|>` is
three characters longer than `<|end|>`, the substring used to extract the
newly formatted message starts three characters too far in, so
tokenization begins with 'tart|>' instead of '<|start|>'.

Resolves: https://github.com/ggml-org/llama.cpp/issues/15417
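
To illustrate the offset bug, here is a minimal standalone sketch. The
prompt strings are assumed, simplified renderings of the gpt-oss template
(not actual llama.cpp output), and the diff logic mimics, rather than
calls, `common_chat_format_single`:

// Minimal sketch of the offset bug; the strings below are assumed,
// simplified renderings of the gpt-oss template.
#include <iostream>
#include <string>

int main() {
    // Past messages formatted alone (add_generation_prompt == false):
    // the template ends the last message with <|return|>.
    std::string fmt_past = "<|start|>assistant<|message|>hi<|return|>";

    // Past plus new message formatted together: the same message now ends
    // with <|end|>, which is three characters shorter than <|return|>.
    std::string fmt_all = "<|start|>assistant<|message|>hi<|end|>"
                          "<|start|>user<|message|>how are you?<|end|>";

    // common_chat_format_single-style diff: skip the length of the past
    // formatting. The three-character mismatch cuts into <|start|>.
    std::cout << fmt_all.substr(fmt_past.size()) << "\n";
    // prints: tart|>user<|message|>how are you?<|end|>
}

With the fix, the past formatting also ends with `<|end|>` during
inference, so the lengths line up and the extracted diff begins with
`<|start|>` as expected.
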
Daniel Bevenius authored on 2025-08-20 14:26:01 +02:00, committed by GitHub
parent ec5ab1a36c
commit 657b8a77bd

@@ -147,6 +147,7 @@ struct templates_params {
     json extra_context;
     bool add_bos;
     bool add_eos;
+    bool is_inference = true;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {

@@ -1336,6 +1337,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
 
+    // Check if we need to replace the return token with end token during
+    // inference and without generation prompt. For more details see:
+    // https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;
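
As a standalone illustration of the replacement performed in the hunk
above (the prompt string and the `main` harness here are assumed for the
example), note that `rfind` matches the last occurrence, so only the
trailing `<|return|>` is rewritten even if the same text appears earlier
inside message content:

// Exercises the <|return|> -> <|end|> rewrite in isolation; the prompt
// string is an assumed example, not actual template output.
#include <iostream>
#include <string>
#include <string_view>

int main() {
    std::string prompt = "<|start|>assistant<|message|>hi<|return|>";

    constexpr std::string_view return_token = "<|return|>";
    constexpr std::string_view end_token    = "<|end|>";

    // rfind locates the last occurrence of <|return|>, so the replacement
    // only touches the final end tag of the formatted prompt.
    if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
        prompt.replace(pos, return_token.length(), end_token);
    }

    std::cout << prompt << "\n";
    // prints: <|start|>assistant<|message|>hi<|end|>
}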