Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	tests/test-tokenizer-0.cpp
#	tests/test-tokenizer-random.py
Concedo 2024-07-14 17:21:11 +08:00
commit 5e43ebd151
3 changed files with 104 additions and 64 deletions


@@ -5482,6 +5482,7 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "command-r") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+ vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "qwen2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
@@ -5724,7 +5725,7 @@ static void llm_load_vocab(
// build special tokens cache
{
for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
- if (!(vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL)) {
+ if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
vocab.cache_special_tokens.push_back(id);
}
}
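Note on the cache condition above: the old test caches every token whose attributes lack the NORMAL bit, while the new test caches only control, user-defined, and unknown tokens, so tokens with only byte, unused, or undefined attributes no longer land in the special-token cache. The following standalone sketch (not part of the diff) illustrates that difference; the flag names and values are assumptions modelled on the llama_token_attr bit flags in llama.h, not the library's actual definitions.

// sketch: compare the old and new cache conditions across attribute kinds
// (flag values are assumed here, mirroring the llama_token_attr bit flags in llama.h)
#include <cstdio>

enum token_attr : unsigned {
    ATTR_UNKNOWN      = 1u << 0,
    ATTR_UNUSED       = 1u << 1,
    ATTR_NORMAL       = 1u << 2,
    ATTR_CONTROL      = 1u << 3,
    ATTR_USER_DEFINED = 1u << 4,
    ATTR_BYTE         = 1u << 5,
};

int main() {
    const unsigned attrs[] = { ATTR_NORMAL, ATTR_CONTROL, ATTR_USER_DEFINED, ATTR_UNKNOWN, ATTR_BYTE, ATTR_UNUSED };
    const char *   names[] = { "normal",    "control",    "user_defined",    "unknown",    "byte",    "unused"    };

    for (int i = 0; i < 6; ++i) {
        const bool cached_old = !(attrs[i] & ATTR_NORMAL);                                            // old condition
        const bool cached_new = (attrs[i] & (ATTR_CONTROL | ATTR_USER_DEFINED | ATTR_UNKNOWN)) != 0;  // new condition
        printf("%-12s old=%d new=%d\n", names[i], cached_old, cached_new);
    }
    return 0;
}

Running this prints old=1 new=0 only for the byte and unused rows, which is exactly the set of tokens dropped from the cache by the new condition.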
@@ -15713,17 +15714,6 @@ struct llm_tokenizer_bpe {
"[0-9][0-9][0-9]",
};
break;
- case LLAMA_VOCAB_PRE_TYPE_MPT:
-     // TODO: MPT pre-tokenization regexes are unknown
-     //       the following are close, but not exact. run the following:
-     //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
-     GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
-     regex_exprs = {
-         "\\s?\\p{L}+",
-         "\\s?\\p{P}+",
-         "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-     };
-     break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
@@ -15733,6 +15723,7 @@ struct llm_tokenizer_bpe {
};
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
case LLAMA_VOCAB_PRE_TYPE_OLMO:
case LLAMA_VOCAB_PRE_TYPE_JAIS:
regex_exprs = {
@@ -15759,8 +15750,8 @@ struct llm_tokenizer_bpe {
break;
case LLAMA_VOCAB_PRE_TYPE_VIKING:
regex_exprs = {
- "\\p{N}",
" ?[^(\\s|.,!?…。,、।۔،)]+",
+ "\\p{N}",
};
break;
default:
@@ -16480,12 +16471,20 @@ struct fragment_buffer_variant {
// #define PRETOKENIZERDEBUG
- static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
+ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
// for each special token
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
const auto & data = vocab.id_to_token[special_id];
const auto & special_token = data.text;
+ if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+     // Ignore control and unknown tokens when parse_special == false
+     continue;
+     // User-defined tokens are still pre-tokenized before everything else
+     // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+     // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+ }
// for each text fragment
std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
while (it != buffer.end()) {
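To make the caller-visible effect concrete, here is a rough usage sketch (not from this commit) against the public API of that era: llama_tokenize already exposed a parse_special flag, and with this change control and unknown tokens are only matched as single tokens when parse_special is true, while user-defined added tokens are pre-tokenized either way. The model path and the "<|endoftext|>" string below are placeholders; whether that text maps to a control or a user-defined token depends on the model's vocabulary.

// usage sketch (assumptions: "model.gguf" is a placeholder path, token counts vary by model)
#include <cstdio>
#include <string>
#include <vector>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (!model) return 1;

    const std::string text = "hello <|endoftext|> world";
    std::vector<llama_token> toks(text.size() + 8);

    // parse_special = true: special tokens written out in the text are matched as single tokens
    int n_special = llama_tokenize(model, text.c_str(), (int) text.size(),
                                   toks.data(), (int) toks.size(),
                                   /*add_special*/ false, /*parse_special*/ true);

    // parse_special = false: control/unknown tokens are left to the normal tokenizer,
    // but user-defined added tokens are still pre-tokenized, per the check added above
    int n_plain = llama_tokenize(model, text.c_str(), (int) text.size(),
                                 toks.data(), (int) toks.size(),
                                 /*add_special*/ false, /*parse_special*/ false);

    printf("parse_special=true  -> %d tokens\n", n_special);
    printf("parse_special=false -> %d tokens\n", n_plain);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}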
@@ -16598,7 +16597,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
if (!raw_text.empty()) {
fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
- if (parse_special) tokenizer_st_partition(vocab, fragment_buffer);
+ tokenizer_st_partition(vocab, fragment_buffer, parse_special);
}
switch (vocab.type) {