vocab: add gemma4 tokenizer tests, fix edge case (#21534)

* YATF (Yet Another Tokenizer Fix) for Gemma 4. With tests!
* Remove unnecessary hash from update script.
* minor: move constant
2026-05-06 16:21:49 +00:00 · 2026-04-09 11:41:14 +02:00 · 2026-04-09 11:41:14 +02:00 · 0ec191e1d7
commit 0ec191e1d7
parent 243532e556
5 changed files with 169 additions and 2 deletions
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -659,8 +659,17 @@ struct llm_tokenizer_bpe_session {

                if (token == LLAMA_TOKEN_NULL) {
                    for (auto j = str.begin(); j != str.end(); ++j) {
-                        std::string byte_str(1, *j);
-                        auto token_multibyte = vocab.text_to_token(byte_str);
+                        llama_token token_multibyte = LLAMA_TOKEN_NULL;
+                        if (tokenizer.byte_encode) {
+                            std::string byte_str(1, *j);
+                            token_multibyte = vocab.text_to_token(byte_str);
+                        } else {
+                            // For non-byte-encoded BPE (e.g. gemma-4), byte tokens use <0xXX> format
+                            static const char * hex = "0123456789ABCDEF";
+                            const uint8_t ch = (uint8_t)*j;
+                            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+                            token_multibyte = vocab.text_to_token(buf);
+                        }
                        if (token_multibyte != LLAMA_TOKEN_NULL) {
                            output.push_back(token_multibyte);
                        }