From c78690737c5ed92b59e06b81eb663d91ac55eb43 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 8 Sep 2024 18:25:00 +0800
Subject: [PATCH] fix for DRY segfault on unicode character substring
 tokenization

---
 gpttype_adapter.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index c5e8bbada..0b35cd25d 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -313,6 +313,19 @@ static void print_tok_vec_str(std::vector<int> &vec)
     printf("\n%s", get_tok_vec_str(vec).c_str());
 }
 
+bool allExtendedUnicode(const std::string& str) {
+    if(str.size()==0)
+    {
+        return false;
+    }
+    for (unsigned char c : str) {
+        if (c <= 127) {
+            return false;
+        }
+    }
+    return true;
+}
+
 // Find tokens that completely contain `str`, either as a single token, or as a sequence of tokens.
 // It's important to use a hash map for head tokens because some models have many of them.
 // For example, the Llama 3 tokenizer has 6570 tokens containing the period ('.') character.
@@ -322,6 +335,7 @@ static void print_tok_vec_str(std::vector<int> &vec)
 // tail tokens are generated by tokenizing the remainder.
 // If max_tail_len is >= 0, the maximum token length of a tail sequence is clamped to this value.
 static void GetOverlappingTokenSequences(const std::string& str, std::unordered_multimap<gpt_vocab::id, std::vector<gpt_vocab::id>>& token_sequences, int max_tail_len = -1) {
+    bool isAllExtendedUnicode = allExtendedUnicode(str);
     for(int v=0;v<n_vocab;++v)
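
Reviewer note: for context, a minimal standalone sketch of what the new guard detects. Only allExtendedUnicode() mirrors the patched code; the main() driver and the "\xC3\xA9" test string are hypothetical and not part of the patch. Multi-byte UTF-8 characters consist entirely of bytes >= 0x80, so a string whose bytes are all outside the ASCII range may be a fragment of (or a run of) multi-byte characters, and taking byte-level substrings of it can split a character mid-sequence, which per the commit subject is what crashed the DRY tail tokenization.

#include <cstdio>
#include <string>

// Mirrors the helper added by this patch: true iff the string is non-empty
// and contains no ASCII bytes (i.e. every byte is >= 0x80).
bool allExtendedUnicode(const std::string& str) {
    if(str.size()==0)
    {
        return false;
    }
    for (unsigned char c : str) {
        if (c <= 127) {
            return false;
        }
    }
    return true;
}

int main() {
    // "e-acute" (U+00E9) is encoded in UTF-8 as the two bytes 0xC3 0xA9.
    std::string e_acute = "\xC3\xA9";

    // A byte-level suffix splits the character: the lone byte 0xA9 is not
    // valid UTF-8 on its own, and tokenizing such fragments is the failure
    // mode the patch guards against.
    std::string fragment = e_acute.substr(1);

    printf("0xC3 0xA9 all-extended: %d\n", allExtendedUnicode(e_acute));   // prints 1
    printf("0xA9      all-extended: %d\n", allExtendedUnicode(fragment));  // prints 1
    printf("hello     all-extended: %d\n", allExtendedUnicode("hello"));   // prints 0
    return 0;
}

The check is deliberately coarse: rather than validating UTF-8 boundaries, the patch computes the isAllExtendedUnicode flag once per string, and the truncated remainder of the hunk suggests that flag gates the tail-sequence tokenization path so that byte substrings landing mid-character are never handed to the tokenizer.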