mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-14 02:49:41 +00:00
Merge commit '017cc5f446' into concedo_experimental
# Conflicts:
# .github/ISSUE_TEMPLATE/010-bug-compilation.yml
# .github/ISSUE_TEMPLATE/019-bug-misc.yml
# CODEOWNERS
# examples/batched-bench/batched-bench.cpp
# examples/batched/batched.cpp
# examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
# examples/gritlm/gritlm.cpp
# examples/llama-bench/llama-bench.cpp
# examples/passkey/passkey.cpp
# examples/quantize-stats/quantize-stats.cpp
# examples/run/run.cpp
# examples/simple-chat/simple-chat.cpp
# examples/simple/simple.cpp
# examples/tokenize/tokenize.cpp
# ggml/CMakeLists.txt
# ggml/src/ggml-metal/CMakeLists.txt
# ggml/src/ggml-vulkan/CMakeLists.txt
# scripts/sync-ggml.last
# src/llama.cpp
# tests/test-autorelease.cpp
# tests/test-model-load-cancel.cpp
# tests/test-tokenizer-0.cpp
# tests/test-tokenizer-1-bpe.cpp
# tests/test-tokenizer-1-spm.cpp
This commit is contained in:
commit
dcfa1eca4e
45 changed files with 806 additions and 229 deletions
|
@ -609,6 +609,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|||
"\\p{N}+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
|
||||
regex_exprs = {
|
||||
"\\p{N}{1,3}",
|
||||
"[一-龥぀-ゟ゠-ヿ]+",
|
||||
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
||||
regex_exprs = {
|
||||
"[\r\n]",
|
||||
|
@ -717,7 +724,7 @@ struct llm_tokenizer_bpe_session {
|
|||
|
||||
bool append_bos(std::vector<llama_vocab::id> & output) const {
|
||||
if (vocab.tokenizer_add_bos) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
return true;
|
||||
}
|
||||
|
@ -726,7 +733,7 @@ struct llm_tokenizer_bpe_session {
|
|||
|
||||
bool append_eos(std::vector<llama_vocab::id> & output) const {
|
||||
if (vocab.tokenizer_add_eos) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
return true;
|
||||
}
|
||||
|
@ -1623,7 +1630,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|||
if (source == 0) {
|
||||
buffer.erase_after(buffer.before_begin());
|
||||
} else {
|
||||
buffer.erase_after(std::next(buffer.begin(), (source-1)));
|
||||
buffer.erase_after(std::next(buffer.begin(), (source - 1)));
|
||||
}
|
||||
|
||||
// repeat for the right side
|
||||
|
@ -1637,7 +1644,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|||
if (source == 0) {
|
||||
buffer.erase_after(buffer.before_begin());
|
||||
} else {
|
||||
buffer.erase_after(std::next(buffer.begin(), (source-1)));
|
||||
buffer.erase_after(std::next(buffer.begin(), (source - 1)));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -1675,7 +1682,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
bool is_prev_special = true; // prefix with space if first token
|
||||
|
||||
if (add_special && vocab.tokenizer_add_bos) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
is_prev_special = true;
|
||||
}
|
||||
|
@ -1710,7 +1717,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
// }
|
||||
|
||||
if (add_special && vocab.tokenizer_add_eos) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
}
|
||||
} break;
|
||||
|
@ -1769,7 +1776,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
case LLAMA_VOCAB_TYPE_WPM:
|
||||
{
|
||||
if (add_special) {
|
||||
GGML_ASSERT(vocab.special_cls_id != -1);
|
||||
GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_cls_id);
|
||||
}
|
||||
|
||||
|
@ -1789,14 +1796,14 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
}
|
||||
|
||||
if (add_special) {
|
||||
GGML_ASSERT(vocab.special_sep_id != -1);
|
||||
GGML_ASSERT(vocab.special_sep_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_sep_id);
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_UGM:
|
||||
{
|
||||
if (add_special && vocab.tokenizer_add_bos) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
}
|
||||
llm_tokenizer_ugm_session session(vocab);
|
||||
|
@ -1821,7 +1828,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
//}
|
||||
|
||||
if (add_special && vocab.tokenizer_add_eos) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
}
|
||||
} break;
|
||||
|
@ -1890,7 +1897,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
|
|||
}
|
||||
|
||||
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
|
||||
return token != -1 && vocab.special_eog_ids.count(token) > 0;
|
||||
return token != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(token) > 0;
|
||||
}
|
||||
|
||||
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
|
||||
|
@ -2134,7 +2141,7 @@ int32_t llama_detokenize_impl(
|
|||
}
|
||||
|
||||
if (remove_special && vocab.tokenizer_add_eos) {
|
||||
if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) {
|
||||
if (n_tokens > 0 && tokens[n_tokens - 1] == vocab.special_eos_id) {
|
||||
n_tokens--;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue