Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.gitignore
#	CMakeLists.txt
#	flake.lock
#	llama.cpp
Concedo 2024-06-03 14:46:12 +08:00
commit 8b29d5f848
4 changed files with 35 additions and 13 deletions

llama.cpp

@@ -2190,8 +2190,7 @@ struct llama_vocab {
     std::vector<token_data> id_to_token;
     std::vector<id>    cache_special_tokens;
-    std::vector<token> cache_token_to_piece;         // llama_token_to_piece(special = false);
-    std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
@@ -4908,23 +4907,19 @@ static void llm_load_vocab(
         LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
     }
-    // build token to piece caches
+    // build token to piece cache
     {
         size_t size_cache = 0;
-        std::vector<llama_vocab::token> cache_token_to_piece        (n_vocab);
-        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
+        std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
         for (uint32_t id = 0; id < n_vocab; ++id) {
-            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
-            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
             size_cache += cache_token_to_piece[id].size();
-            size_cache += cache_token_to_piece_special[id].size();
         }
-        std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
-        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
         LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
     }
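For context, a minimal standalone sketch of the pattern this hunk moves to (toy_vocab, build_cache, and token_to_piece are hypothetical names for illustration, not the real llama.cpp API): a single cache is filled once with the special-aware piece for every token, and the lookup path hides control tokens when special rendering is off, which is what the next hunk adds.

// Standalone sketch of the single-cache pattern (hypothetical names).
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct toy_vocab {
    std::vector<std::string> pieces;               // assumed: piece text per token id
    std::vector<bool>        is_control;           // assumed: control-token flags
    std::vector<std::string> cache_token_to_piece; // built once, special rendering
};

// Build the one cache, analogous to the loop in llm_load_vocab above.
static void build_cache(toy_vocab & vocab) {
    vocab.cache_token_to_piece.resize(vocab.pieces.size());
    for (uint32_t id = 0; id < (uint32_t) vocab.pieces.size(); ++id) {
        vocab.cache_token_to_piece[id] = vocab.pieces[id]; // special = true assumed
    }
}

// Lookup analogous to the change below: control tokens render as an empty
// piece unless the caller asked for special tokens.
static std::string token_to_piece(const toy_vocab & vocab, uint32_t id, bool special) {
    if (!special && vocab.is_control[id]) {
        return "";
    }
    return vocab.cache_token_to_piece.at(id);
}

int main() {
    toy_vocab vocab;
    vocab.pieces     = { "<s>", "Hello", " world" };
    vocab.is_control = { true, false, false };
    build_cache(vocab);
    std::printf("special=false: '%s'\n", token_to_piece(vocab, 0, false).c_str()); // ''
    std::printf("special=true : '%s'\n", token_to_piece(vocab, 0, true).c_str());  // '<s>'
    return 0;
}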
@@ -18638,9 +18633,14 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
         return llama_token_to_piece_old(model, token, buf, length);
     }
+    // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
+    if (!special && llama_is_control_token(model->vocab, token)) {
+        return 0;
+    }
     // if we have a cache - use it
     {
-        const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+        const auto & cache = model->vocab.cache_token_to_piece;
         if (!cache.empty()) {
             const auto & res = cache.at(token);