convert: add MiniCPM5 tokenizer support (#23384)

Add minicpm5 pre-tokenizer hash via convert_hf_to_gguf_update.py and implement hardcoded regex handling in llama-vocab.cpp, consistent with other BPE pre-tokenizers. Co-authored-by: zhangtao <zhangtao2@modelbest.cn>
2026-07-10 01:18:32 +00:00 · 2026-05-27 13:08:33 +08:00 · 2026-05-27 13:08:33 +08:00 · 9777256c31
commit 9777256c31
parent 7085492c6f
4 changed files with 16 additions and 0 deletions
--- a/conversion/base.py
+++ b/conversion/base.py
@ -1625,6 +1625,9 @@ class TextModel(ModelBase):
        if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
            # ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
            res = "talkie"
+        if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
+            # ref: https://huggingface.co/openbmb/MiniCPM5-1B
+            res = "minicpm5"

        if res is None:
            logger.warning("\n")
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@ -157,6 +157,7 @@ models = [
    {"name": "f2llmv2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
    {"name": "sarvam-moe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
    {"name": "talkie",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
+    {"name": "minicpm5",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -511,6 +511,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                };
                byte_encode = false;
                break;
+            case LLAMA_VOCAB_PRE_TYPE_MINICPM5:
+                regex_exprs = {
+                    // original regex from tokenizer.json (openbmb/MiniCPM5-1B)
+                    "\\p{N}{1,3}",
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@ -2039,6 +2047,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            } else if (tokenizer_pre == "default") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (tokenizer_pre == "minicpm5") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_MINICPM5;
+                ignore_merges = true;
            } else if (
                    tokenizer_pre == "llama3"   ||
                    tokenizer_pre == "llama-v3" ||
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@ -60,6 +60,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_JAIS2           = 49,
    LLAMA_VOCAB_PRE_TYPE_GEMMA4          = 50,
    LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE      = 51,
+    LLAMA_VOCAB_PRE_TYPE_MINICPM5        = 52,
 };

 struct LLM_KV;