vocab: add gemma4 tokenizer tests, fix edge case (#21534)

* YATF (Yet Another Tokenizer Fix) for Gemma 4. With tests!
* Remove unnecessary hash from update script.
* minor: move constant
This commit is contained in:
Piotr Wilkin (ilintar) 2026-04-09 11:41:14 +02:00 committed by GitHub
parent 243532e556
commit 0ec191e1d7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 169 additions and 2 deletions

View file

@ -659,8 +659,17 @@ struct llm_tokenizer_bpe_session {
if (token == LLAMA_TOKEN_NULL) {
for (auto j = str.begin(); j != str.end(); ++j) {
std::string byte_str(1, *j);
auto token_multibyte = vocab.text_to_token(byte_str);
llama_token token_multibyte = LLAMA_TOKEN_NULL;
if (tokenizer.byte_encode) {
std::string byte_str(1, *j);
token_multibyte = vocab.text_to_token(byte_str);
} else {
// For non-byte-encoded BPE (e.g. gemma-4), byte tokens use <0xXX> format
static const char * hex = "0123456789ABCDEF";
const uint8_t ch = (uint8_t)*j;
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
token_multibyte = vocab.text_to_token(buf);
}
if (token_multibyte != LLAMA_TOKEN_NULL) {
output.push_back(token_multibyte);
}