mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-06 16:21:49 +00:00
vocab: add gemma4 tokenizer tests, fix edge case (#21534)
* YATF (Yet Another Tokenizer Fix) for Gemma 4. With tests!
* Remove unnecessary hash from update script.
* minor: move constant
This commit is contained in:
parent
243532e556
commit
0ec191e1d7
5 changed files with 169 additions and 2 deletions
|
|
@ -659,8 +659,17 @@ struct llm_tokenizer_bpe_session {
|
|||
|
||||
if (token == LLAMA_TOKEN_NULL) {
|
||||
for (auto j = str.begin(); j != str.end(); ++j) {
|
||||
std::string byte_str(1, *j);
|
||||
auto token_multibyte = vocab.text_to_token(byte_str);
|
||||
llama_token token_multibyte = LLAMA_TOKEN_NULL;
|
||||
if (tokenizer.byte_encode) {
|
||||
std::string byte_str(1, *j);
|
||||
token_multibyte = vocab.text_to_token(byte_str);
|
||||
} else {
|
||||
// For non-byte-encoded BPE (e.g. gemma-4), byte tokens use <0xXX> format
|
||||
static const char * hex = "0123456789ABCDEF";
|
||||
const uint8_t ch = (uint8_t)*j;
|
||||
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
||||
token_multibyte = vocab.text_to_token(buf);
|
||||
}
|
||||
if (token_multibyte != LLAMA_TOKEN_NULL) {
|
||||
output.push_back(token_multibyte);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue