common/gemma4 : handle parsing edge cases (#21760)

2026-05-08 18:30:50 +00:00 · 2026-04-13 18:18:18 -05:00 · 2026-04-13 18:18:18 -05:00 · e21cdc11a0
commit e21cdc11a0
parent e974923698
5 changed files with 140 additions and 7 deletions
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -1091,6 +1091,14 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+
+    if (inputs.add_generation_prompt && string_ends_with(data.prompt, "<turn|>\n")) {
+        // This may happen if the model generates content + tool_call, the
+        // template does not add the model's next turn and confuses the model
+        // from emitting its proper reasoning token sequence.
+        data.prompt += "<|turn>model\n";
+    }
+
    data.format            = COMMON_CHAT_FORMAT_PEG_GEMMA4;
    data.supports_thinking  = true;
    data.thinking_start_tag = "<|channel>thought";
@ -1118,7 +1126,8 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
            p.rule("thought", p.content(p.literal("<|channel>thought") + p.space() + p.until("<channel|>") + p.literal("<channel|>")));
        }

-        auto thought = (p.peek(p.literal("<|channel>")) + p.ref("thought")) | p.negate(p.literal("<|channel>"));
+        auto consume_empty_channels = p.gbnf(p.zero_or_more(p.literal("<|channel>") + p.negate(p.literal("thought"))), "");
+        auto thought = (p.peek(p.literal("<|channel>")) + consume_empty_channels + p.ref("thought")) | p.negate(p.literal("<|channel>"));

        if (has_response_format) {
            auto response_format = p.literal("```json") <<
@ -1182,12 +1191,16 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
                /* max = */ inputs.parallel_tool_calls ? -1 : 1
            ));

-            auto content = p.rule("content", p.content(p.until_one_of({"<|channel>", "<|tool_call>"})));
+            auto scan_to_toolcall = p.rule("scan-to-toolcall", p.until("<|tool_call>"));
+            auto content = p.rule("content", p.content(p.until_one_of({"<|channel>", "<channel|>", "<|tool_call>"})));
            auto message = p.rule("message", thought + content);
-            return start + p.zero_or_more(message) + tool_call;
+            return start + p.zero_or_more(message) + scan_to_toolcall + tool_call;
        }

-        auto content = p.rule("content", p.content(p.until("<|channel>")));
+        // Gemma 4 may emit an extra <|channel>thought\n<channel|> at the end of the content. It may
+        // also emit a single trailing <channel|> token. Consume all complete reasoning blocks and
+        // then stop at the first unmatched <channel|> token.
+        auto content = p.rule("content", p.content(p.until_one_of({"<|channel>", "<channel|>"})));
        auto message = p.rule("message", thought + content);
        return start + p.one_or_more(message);
    });