mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-30 03:43:40 +00:00
convert: add MiniCPM5 tokenizer support (#23384)
Add minicpm5 pre-tokenizer hash via convert_hf_to_gguf_update.py and implement hardcoded regex handling in llama-vocab.cpp, consistent with other BPE pre-tokenizers. Co-authored-by: zhangtao <zhangtao2@modelbest.cn>
This commit is contained in:
parent
7085492c6f
commit
9777256c31
4 changed files with 16 additions and 0 deletions
|
|
@ -1625,6 +1625,9 @@ class TextModel(ModelBase):
|
|||
if chkhsh == "f728162c1315c26e40249849799b4ba3fe584c32084b4795b03eb295e63cb5af":
|
||||
# ref: https://huggingface.co/lewtun/talkie-1930-13b-it-hf
|
||||
res = "talkie"
|
||||
if chkhsh == "36f3066e97b7f3994b379aaacde306c1444c6ae84e81a5ae3cd2b7ed3b8c42d4":
|
||||
# ref: https://huggingface.co/openbmb/MiniCPM5-1B
|
||||
res = "minicpm5"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
|
|
|
|||
|
|
@ -157,6 +157,7 @@ models = [
|
|||
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
|
||||
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
|
||||
{"name": "talkie", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/lewtun/talkie-1930-13b-it-hf", },
|
||||
{"name": "minicpm5", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM5-1B"},
|
||||
]
|
||||
|
||||
# some models are known to be broken upstream, so we will skip them as exceptions
|
||||
|
|
|
|||
|
|
@ -511,6 +511,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|||
};
|
||||
byte_encode = false;
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_MINICPM5:
|
||||
regex_exprs = {
|
||||
// original regex from tokenizer.json (openbmb/MiniCPM5-1B)
|
||||
"\\p{N}{1,3}",
|
||||
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}+| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
|
|
@ -2039,6 +2047,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (tokenizer_pre == "default") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (tokenizer_pre == "minicpm5") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_MINICPM5;
|
||||
ignore_merges = true;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
tokenizer_pre == "llama-v3" ||
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ enum llama_vocab_pre_type {
|
|||
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
|
||||
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
|
||||
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINICPM5 = 52,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue