Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	ci/run.sh
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/repack.cpp
#	ggml/src/ggml-sycl/binbcast.cpp
#	ggml/src/ggml-sycl/concat.cpp
#	ggml/src/ggml-sycl/conv.cpp
#	ggml/src/ggml-sycl/convert.cpp
#	ggml/src/ggml-sycl/cpy.cpp
#	ggml/src/ggml-sycl/dmmv.cpp
#	ggml/src/ggml-sycl/dpct/helper.hpp
#	ggml/src/ggml-sycl/element_wise.cpp
#	ggml/src/ggml-sycl/getrows.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/gla.cpp
#	ggml/src/ggml-sycl/im2col.cpp
#	ggml/src/ggml-sycl/mmq.cpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/norm.cpp
#	ggml/src/ggml-sycl/rope.cpp
#	ggml/src/ggml-sycl/softmax.cpp
#	ggml/src/ggml-sycl/tsembd.cpp
#	ggml/src/ggml-sycl/wkv.cpp
#	tests/test-backend-ops.cpp
2025-09-14 02:49:41 +00:00 · 2025-06-21 00:32:22 +08:00 · 2025-06-21 00:32:22 +08:00 · 4f2fcaa2ef
commit 4f2fcaa2ef
parent c16d672ce4 c959f462a0
21 changed files with 443 additions and 81 deletions
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1494,6 +1494,7 @@ struct llama_vocab::impl {
    bool add_space_prefix           = false;
    bool add_bos                    = false;
    bool add_eos                    = false;
+    bool add_sep                    = false;
    bool ignore_merges              = false;
    bool clean_spaces               = false;  // clean_up_tokenization_spaces
    bool remove_extra_whitespaces   = false;
@@ -1647,6 +1648,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            special_sep_id  = 102;
            special_pad_id  = 0;
            special_mask_id = 103;
+
+            add_sep = true;
        } else if (tokenizer_model == "gpt2") {
            type = LLAMA_VOCAB_TYPE_BPE;

@@ -1786,12 +1789,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                    tokenizer_pre == "jina-es" ||
                    tokenizer_pre == "jina-de" ||
                    tokenizer_pre == "gigachat"   ||
-                    tokenizer_pre == "jina-v1-en" ||
                    tokenizer_pre == "jina-v2-es" ||
-                    tokenizer_pre == "jina-v2-de" ||
+                    tokenizer_pre == "jina-v2-de") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "jina-v1-en" ||
                    tokenizer_pre == "jina-v2-code" ||
                    tokenizer_pre == "roberta-bpe") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+                add_sep = true;
            } else if (
                    tokenizer_pre == "refact") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1901,6 +1907,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            clean_spaces = true;
            add_bos = true;
            add_eos = false;
+            add_sep = true;
        } else if (type == LLAMA_VOCAB_TYPE_UGM) {
            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            add_bos = false;
@@ -2040,7 +2047,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            }
        }

-        // Handle add_bos and add_eos
+        // Handle add_bos, add_eos and add_sep
        {
            bool temp = true;

@@ -2050,6 +2057,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
                add_eos = temp;
            }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
+                add_sep = temp;
+            }
        }

        // auto-detect special tokens by text
@@ -3281,6 +3291,10 @@ bool llama_vocab::get_add_eos() const {
    return pimpl->add_eos;
 }

+bool llama_vocab::get_add_sep() const {
+    return pimpl->add_sep;
+}
+
 bool llama_vocab::get_ignore_merges() const {
    return pimpl->ignore_merges;
 }
@@ -3348,6 +3362,11 @@ int32_t llama_vocab::tokenize(
                        bool   add_special,
                        bool   parse_special) const {
    auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+    if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
+        LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
+        return std::numeric_limits<int32_t>::min();
+    }
+
    if (n_tokens_max < (int) res.size()) {
        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
        return -((int) res.size());
@@ -3479,6 +3498,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
    return vocab->get_add_eos();
 }

+bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
+    return vocab->get_add_sep();
+}
+
 llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
    return vocab->token_fim_pre();
 }