From c78690737c5ed92b59e06b81eb663d91ac55eb43 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 8 Sep 2024 18:25:00 +0800
Subject: [PATCH] fix for DRY segfault on unicode character substring
 tokenization

---
 gpttype_adapter.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index c5e8bbada..0b35cd25d 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -313,6 +313,19 @@ static void print_tok_vec_str(std::vector<int> &vec)
     printf("\n%s", get_tok_vec_str(vec).c_str());
 }
 
+bool allExtendedUnicode(const std::string& str) {
+    if(str.size()==0)
+    {
+        return false;
+    }
+    for (unsigned char c : str) {
+        if (c <= 127) {
+            return false;
+        }
+    }
+    return true;
+}
+
 // Find tokens that completely contain `str`, either as a single token, or as a sequence of tokens.
 // It's important to use a hash map for head tokens because some models have many of them.
 // For example, the Llama 3 tokenizer has 6570 tokens containing the period ('.') character.
@@ -322,6 +335,7 @@ static void print_tok_vec_str(std::vector<int> &vec)
 // tail tokens are generated by tokenizing the remainder.
 // If max_tail_len is >= 0, the maximum token length of a tail sequence is clamped to this value.
 static void GetOverlappingTokenSequences(const std::string& str, std::unordered_multimap<gpt_vocab::id, std::vector<gpt_vocab::id>>& token_sequences, int max_tail_len = -1) {
+    bool isAllExtendedUnicode = allExtendedUnicode(str);
     for(int v=0;v<n_vocab;++v)
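
Reviewer note: for context, a minimal standalone sketch of what the new guard detects. Only allExtendedUnicode() mirrors the patched code; the main() driver and the "\xC3\xA9" test string are hypothetical and not part of the patch. Multi-byte UTF-8 characters consist entirely of bytes >= 0x80, so a string whose bytes are all outside the ASCII range may be a fragment of (or a run of) multi-byte characters, and taking byte-level substrings of it can split a character mid-sequence, which per the commit subject is what crashed the DRY tail tokenization.

#include <cstdio>
#include <string>

// Mirrors the helper added by this patch: true iff the string is non-empty
// and contains no ASCII bytes (i.e. every byte is >= 0x80).
bool allExtendedUnicode(const std::string& str) {
    if(str.size()==0)
    {
        return false;
    }
    for (unsigned char c : str) {
        if (c <= 127) {
            return false;
        }
    }
    return true;
}

int main() {
    // "e-acute" (U+00E9) is encoded in UTF-8 as the two bytes 0xC3 0xA9.
    std::string e_acute = "\xC3\xA9";

    // A byte-level suffix splits the character: the lone byte 0xA9 is not
    // valid UTF-8 on its own, and tokenizing such fragments is the failure
    // mode the patch guards against.
    std::string fragment = e_acute.substr(1);

    printf("0xC3 0xA9 all-extended: %d\n", allExtendedUnicode(e_acute));   // prints 1
    printf("0xA9      all-extended: %d\n", allExtendedUnicode(fragment));  // prints 1
    printf("hello     all-extended: %d\n", allExtendedUnicode("hello"));   // prints 0
    return 0;
}

The check is deliberately coarse: rather than validating UTF-8 boundaries, the patch computes the isAllExtendedUnicode flag once per string, and the truncated remainder of the hunk suggests that flag gates the tail-sequence tokenization path so that byte substrings landing mid-character are never handed to the tokenizer.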