fixed some old models failing due to tokenizer changes, update lite (+1 squashed commit)

Squashed commits:

[9dee81ec] fixed some old models failing due to tokenizer changes, update lite tooltip (+3 squashed commits)

Squashed commits:

[5ab95a79] fixes

[a561d5e2] fixed some old models failing due to tokenizer changes

[95e65daf] lite updates
Concedo 2023-10-21 11:33:32 +08:00
parent dd1d61ea6b
commit cff75061fe
4 changed files with 72 additions and 26 deletions


@@ -2200,7 +2200,13 @@ static void llm_load_vocab(
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
             if (!OldBPETokenizerMode)
             {
-                GGML_ASSERT_CONTINUE(codepoints_from_utf8(word).size() > 0);
+                auto validcodepoints = codepoints_from_utf8(word).size() > 0;
+                GGML_ASSERT_CONTINUE(validcodepoints);
+                if(!validcodepoints)
+                {
+                    OldBPETokenizerMode = true;
+                    printf("\nFalling Back to older tokenizer...");
+                }
             }

             std::string first;
@@ -2236,9 +2242,15 @@ static void llm_load_vocab(
         for (uint32_t i = 0; i < n_vocab; i++) {
             std::string word = gguf_get_arr_str(ctx, token_idx, i);
             if (!OldBPETokenizerMode)
             {
-                GGML_ASSERT_CONTINUE(codepoints_from_utf8(word).size() > 0);
+                auto validcodepoints = codepoints_from_utf8(word).size() > 0;
+                GGML_ASSERT_CONTINUE(validcodepoints);
+                if(!validcodepoints)
+                {
+                    OldBPETokenizerMode = true;
+                    printf("\nFalling Back to older tokenizer...");
+                }
             }

             vocab.token_to_id[word] = i;
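
For context, the patch turns a hard assertion into a soft check: if any BPE merge or vocab string fails UTF-8 decoding, the loader flips OldBPETokenizerMode and keeps loading instead of aborting. Below is a minimal, self-contained sketch of that pattern, not the actual loader code: is_valid_utf8 is a simplified stand-in for codepoints_from_utf8(word).size() > 0, old_tokenizer_mode stands in for OldBPETokenizerMode, and the sample strings are made up.

#include <cstdio>
#include <string>
#include <vector>

// Simplified stand-in for codepoints_from_utf8(word).size() > 0:
// returns false for empty or malformed UTF-8 byte strings
// (no overlong/codepoint-range checks, just structural validity).
static bool is_valid_utf8(const std::string & s) {
    size_t i = 0;
    while (i < s.size()) {
        unsigned char c = s[i];
        size_t len = (c < 0x80)        ? 1
                   : ((c >> 5) == 0x6) ? 2
                   : ((c >> 4) == 0xE) ? 3
                   : ((c >> 3) == 0x1E) ? 4
                   : 0;
        if (len == 0 || i + len > s.size()) return false;
        for (size_t j = 1; j < len; j++) {
            if ((static_cast<unsigned char>(s[i + j]) & 0xC0) != 0x80) return false;
        }
        i += len;
    }
    return !s.empty();
}

int main() {
    // Hypothetical vocab entries; "\xC3" alone is a truncated UTF-8 sequence,
    // the kind of entry older GGUF conversions can contain.
    std::vector<std::string> words = { "hello", "\xC3", "world" };

    bool old_tokenizer_mode = false;  // mirrors OldBPETokenizerMode in the patch
    for (const std::string & word : words) {
        if (!old_tokenizer_mode && !is_valid_utf8(word)) {
            // Instead of asserting and aborting the model load,
            // fall back once and keep processing the remaining entries.
            old_tokenizer_mode = true;
            printf("\nFalling Back to older tokenizer...");
        }
        // ... normal vocab handling would continue here ...
    }
    printf("\nold_tokenizer_mode = %s\n", old_tokenizer_mode ? "true" : "false");
    return 0;
}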