convert: add MiniCPM5 tokenizer support (#23384)

Add minicpm5 pre-tokenizer hash via convert_hf_to_gguf_update.py and
implement hardcoded regex handling in llama-vocab.cpp, consistent with
other BPE pre-tokenizers.

Co-authored-by: zhangtao <zhangtao2@modelbest.cn>
This commit is contained in:
zhangtao2-1 2026-05-27 13:08:33 +08:00 committed by GitHub
parent 7085492c6f
commit 9777256c31
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 16 additions and 0 deletions

View file

@ -1625,6 +1625,9 @@ class TextModel(ModelBase):
if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
# ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
res = "talkie"
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
res = "minicpm5"
if res is None:
logger.warning("\n")

View file

@ -157,6 +157,7 @@ models = [
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
{"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
]
# some models are known to be broken upstream, so we will skip them as exceptions

View file

@ -511,6 +511,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
};
byte_encode = false;
break;
case LLAMA_VOCAB_PRE_TYPE_MINICPM5:
// Pre-tokenization split patterns for the MiniCPM5 BPE tokenizer.
// Source of the patterns: tokenizer.json of openbmb/MiniCPM5-1B.
regex_exprs = {
// pattern 1: runs of one to three Unicode number characters (\p{N})
"\\p{N}{1,3}",
// pattern 2, as written in tokenizer.json (kept for reference only, not used):
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
// pattern 2, as actually used: identical to the reference above except that the
// case-insensitive group (?i:...) is spelled out with explicit character classes
// ([sS], [tT], ...) inside a plain non-capturing group.
// NOTE(review): presumably done because the regex handling here does not accept
// the inline (?i:...) modifier - confirm against the regex implementation.
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@ -2039,6 +2047,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (tokenizer_pre == "default") {
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (tokenizer_pre == "minicpm5") {
pre_type = LLAMA_VOCAB_PRE_TYPE_MINICPM5;
ignore_merges = true;
} else if (
tokenizer_pre == "llama3" ||
tokenizer_pre == "llama-v3" ||

View file

@ -60,6 +60,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
};
struct LLM_KV;