Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-14 10:59:41 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	README.md
#	examples/CMakeLists.txt
#	examples/batched/batched.cpp
#	examples/gritlm/gritlm.cpp
#	examples/llama.android/llama/build.gradle.kts
#	examples/main/README.md
#	examples/retrieval/retrieval.cpp
#	examples/server/CMakeLists.txt
#	examples/server/README.md
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml.c
#	scripts/compare-commits.sh
#	scripts/sync-ggml.last
#	tests/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
#	tests/test-sampling.cpp
Commit ee486bad3e: 59 changed files with 20531 additions and 13185 deletions
@@ -965,7 +965,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");

         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = unicode_cpt_flags(cpt);
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);

             if (flags.is_whitespace) {
                 if (words.back().size()) { // finish previous word if any
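The context around this one-line rename shows the WPM pre-tokenizer's word-splitting loop: walk the NFD codepoints, look up per-codepoint flags, and start a new word whenever whitespace closes the previous one. Below is a minimal standalone sketch of that pattern. It is not the llama.cpp code: split_words is a hypothetical helper, iswspace stands in for the real unicode_cpt_flags_from_cpt(cpt).is_whitespace lookup, and codepoints are narrowed to ASCII for brevity.

// Standalone sketch of the word-splitting pattern from the hunk above.
// iswspace() is an ASCII-friendly stand-in for llama.cpp's
// unicode_cpt_flags_from_cpt() flags lookup; this is an assumption,
// not the library call.
#include <cstdint>
#include <cstdio>
#include <cwctype>
#include <string>
#include <vector>

static std::vector<std::string> split_words(const std::vector<uint32_t> & cpts) {
    std::vector<std::string> words(1, "");
    for (const uint32_t cpt : cpts) {
        if (iswspace(static_cast<wint_t>(cpt))) {
            if (!words.back().empty()) { // finish previous word if any
                words.emplace_back();
            }
            continue;
        }
        words.back() += static_cast<char>(cpt); // ASCII-only simplification
    }
    if (words.back().empty()) { // drop trailing empty slot
        words.pop_back();
    }
    return words;
}

int main() {
    const std::vector<uint32_t> cpts = {'h', 'i', ' ', 'w', 'p', 'm'};
    for (const auto & w : split_words(cpts)) {
        printf("[%s]\n", w.c_str()); // prints [hi] then [wpm]
    }
    return 0;
}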
@@ -2127,6 +2127,10 @@ int32_t llama_detokenize_impl(
         int32_t text_len_max,
         bool remove_special,
         bool unparse_special) {
+    if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
+        return 0;
+    }
+
     GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");

     int32_t avail = text_len_max;
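The four added lines are an early-out: a vocab of type LLAMA_VOCAB_TYPE_NONE has no tokenizer, so detokenization reports zero bytes written instead of tripping the tokenizer assert. A minimal sketch of that guard pattern follows; the enum, struct, and detokenize function here are simplified hypothetical stand-ins for llama.cpp's internal llama_vocab and llama_detokenize_impl, and only the control flow mirrors the diff.

// Hedged sketch of the early-return guard added in the hunk above.
// vocab_t and detokenize() are assumptions for illustration, not the
// library's actual types or API.
#include <cassert>
#include <cstdint>

enum vocab_type { VOCAB_TYPE_NONE, VOCAB_TYPE_WPM };

struct vocab_t {
    vocab_type type      = VOCAB_TYPE_NONE;
    void *     tokenizer = nullptr; // set by init_tokenizer() in the real code
};

int32_t detokenize(const vocab_t & vocab, char * text, int32_t text_len_max) {
    // New guard: a vocab with no tokenizer type cannot produce text,
    // so report zero bytes written instead of hitting the assert below.
    if (vocab.type == VOCAB_TYPE_NONE) {
        return 0;
    }

    assert(vocab.tokenizer && "Tokenizer not initialized.");

    int32_t avail = text_len_max; // remaining space in the output buffer
    (void) text;                  // real code writes decoded text here
    return text_len_max - avail;  // bytes written (0 in this stub)
}

int main() {
    vocab_t v; // type defaults to VOCAB_TYPE_NONE
    char buf[16];
    return detokenize(v, buf, sizeof(buf)) == 0 ? 0 : 1; // guard returns 0
}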