// Returns true iff `str` is non-empty and every byte lies outside the ASCII
// range (> 127) — i.e. the string consists entirely of extended (multi-byte
// UTF-8) code units. An empty string yields false by design, so callers can
// treat "all extended" as implying "has content".
bool allExtendedUnicode(const std::string& str) {
    if (str.empty()) {
        return false;
    }
    // A single byte <= 127 means plain ASCII is present, so the string is
    // not purely extended-Unicode. std::all_of short-circuits on that byte.
    return std::all_of(str.begin(), str.end(),
                       [](unsigned char c) { return c > 127; });
}