Merge commit '017cc5f446' into concedo_experimental

# Conflicts:
#	.github/ISSUE_TEMPLATE/010-bug-compilation.yml
#	.github/ISSUE_TEMPLATE/019-bug-misc.yml
#	CODEOWNERS
#	examples/batched-bench/batched-bench.cpp
#	examples/batched/batched.cpp
#	examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
#	examples/gritlm/gritlm.cpp
#	examples/llama-bench/llama-bench.cpp
#	examples/passkey/passkey.cpp
#	examples/quantize-stats/quantize-stats.cpp
#	examples/run/run.cpp
#	examples/simple-chat/simple-chat.cpp
#	examples/simple/simple.cpp
#	examples/tokenize/tokenize.cpp
#	ggml/CMakeLists.txt
#	ggml/src/ggml-metal/CMakeLists.txt
#	ggml/src/ggml-vulkan/CMakeLists.txt
#	scripts/sync-ggml.last
#	src/llama.cpp
#	tests/test-autorelease.cpp
#	tests/test-model-load-cancel.cpp
#	tests/test-tokenizer-0.cpp
#	tests/test-tokenizer-1-bpe.cpp
#	tests/test-tokenizer-1-spm.cpp
commit dcfa1eca4e
Concedo 2025-01-08 23:15:21 +08:00
45 changed files with 806 additions and 229 deletions

@@ -609,6 +609,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "\\p{N}+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+                regex_exprs = {
+                    "\\p{N}{1,3}",
+                    "[一-龥぀-ゟ゠-ヿ]+",
+                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                 regex_exprs = {
                     "[\r\n]",
@@ -717,7 +724,7 @@ struct llm_tokenizer_bpe_session {
     bool append_bos(std::vector<llama_vocab::id> & output) const {
         if (vocab.tokenizer_add_bos) {
-            GGML_ASSERT(vocab.special_bos_id != -1);
+            GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
             output.push_back(vocab.special_bos_id);
             return true;
         }
@@ -726,7 +733,7 @@ struct llm_tokenizer_bpe_session {
     bool append_eos(std::vector<llama_vocab::id> & output) const {
         if (vocab.tokenizer_add_eos) {
-            GGML_ASSERT(vocab.special_eos_id != -1);
+            GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
             output.push_back(vocab.special_eos_id);
             return true;
         }
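Note: the -1 to LLAMA_TOKEN_NULL edits throughout this commit are behavior-preserving, since llama.h defines LLAMA_TOKEN_NULL as -1; the rename just makes the "no token" sentinel explicit. A compile-time check of that assumption:

#include "llama.h"

// llama.h: #define LLAMA_TOKEN_NULL -1
static_assert(LLAMA_TOKEN_NULL == -1, "sentinel value unchanged; rename is cosmetic");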
@@ -1623,7 +1630,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                         if (source == 0) {
                             buffer.erase_after(buffer.before_begin());
                         } else {
-                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                         }
 
                         // repeat for the right side
@@ -1637,7 +1644,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
                             if (source == 0) {
                                 buffer.erase_after(buffer.before_begin());
                             } else {
-                                buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                                buffer.erase_after(std::next(buffer.begin(), (source - 1)));
                             }
                             break;
                         }
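Note: the erase_after pattern in the two hunks above is forced by std::forward_list being singly linked: it has no erase(it), only erase_after(it), so deleting element source needs an iterator to the element before it, or before_begin() for the head. A self-contained sketch of the same pattern:

#include <cstddef>
#include <forward_list>
#include <iterator>

// Remove the element at index `source` from a singly linked list,
// mirroring the pattern used in tokenizer_st_partition above.
template <typename T>
void erase_at(std::forward_list<T> & buffer, std::size_t source) {
    if (source == 0) {
        buffer.erase_after(buffer.before_begin()); // drop the head element
    } else {
        buffer.erase_after(std::next(buffer.begin(), source - 1));
    }
}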
@@ -1675,7 +1682,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 bool is_prev_special = true; // prefix with space if first token
 
                 if (add_special && vocab.tokenizer_add_bos) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_bos_id);
                     is_prev_special = true;
                 }
@@ -1710,7 +1717,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 // }
 
                 if (add_special && vocab.tokenizer_add_eos) {
-                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
@@ -1769,7 +1776,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
         case LLAMA_VOCAB_TYPE_WPM:
             {
                 if (add_special) {
-                    GGML_ASSERT(vocab.special_cls_id != -1);
+                    GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_cls_id);
                 }
@@ -1789,14 +1796,14 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 }
 
                 if (add_special) {
-                    GGML_ASSERT(vocab.special_sep_id != -1);
+                    GGML_ASSERT(vocab.special_sep_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_sep_id);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_UGM:
             {
                 if (add_special && vocab.tokenizer_add_bos) {
-                    GGML_ASSERT(vocab.special_bos_id != -1);
+                    GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_bos_id);
                 }
                 llm_tokenizer_ugm_session session(vocab);
@@ -1821,7 +1828,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
                 //}
 
                 if (add_special && vocab.tokenizer_add_eos) {
-                    GGML_ASSERT(vocab.special_eos_id != -1);
+                    GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
@@ -1890,7 +1897,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token) {
 }
 
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
-    return token != -1 && vocab.special_eog_ids.count(token) > 0;
+    return token != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(token) > 0;
 }
 
 bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
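Note: with the LLAMA_TOKEN_NULL guard, a caller can use the null sentinel as a safe initial value that is never treated as end-of-generation. A hypothetical caller-side sketch (sample_next_token is an illustrative stand-in, not a real llama.cpp API):

#include <vector>
#include "llama.h"

// Collect sampled tokens until an end-of-generation token appears.
// sample_next_token() is a made-up helper standing in for a sampling call.
std::vector<llama_token> collect_until_eog(const llama_vocab & vocab) {
    std::vector<llama_token> generated;
    llama_token tok = LLAMA_TOKEN_NULL; // null sentinel: never counts as EOG
    while (!llama_token_is_eog_impl(vocab, tok)) {
        if (tok != LLAMA_TOKEN_NULL) {
            generated.push_back(tok);   // keep everything before the EOG token
        }
        tok = sample_next_token();      // hypothetical
    }
    return generated;
}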
@@ -2134,7 +2141,7 @@ int32_t llama_detokenize_impl(
     }
 
     if (remove_special && vocab.tokenizer_add_eos) {
-        if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) {
+        if (n_tokens > 0 && tokens[n_tokens - 1] == vocab.special_eos_id) {
             n_tokens--;
         }
     }