mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
fix for DRY segfault on unicode character substring tokenization

parent 2e74bd0327
commit c78690737c
1 changed file with 15 additions and 1 deletion
@@ -313,6 +313,19 @@ static void print_tok_vec_str(std::vector<int> &vec)
     printf("\n%s", get_tok_vec_str(vec).c_str());
 }
 
+bool allExtendedUnicode(const std::string& str) {
+    if(str.size()==0)
+    {
+        return false;
+    }
+    for (unsigned char c : str) {
+        if (c <= 127) {
+            return false;
+        }
+    }
+    return true;
+}
+
 // Find tokens that completely contain `str`, either as a single token, or as a sequence of tokens.
 // It's important to use a hash map for head tokens because some models have many of them.
 // For example, the Llama 3 tokenizer has 6570 tokens containing the period ('.') character.
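The new helper works because of how UTF-8 encodes non-ASCII text: every byte of a multi-byte sequence, lead and continuation bytes alike, has its high bit set and so compares greater than 127 as an unsigned char, while any ASCII byte is 127 or less. Below is a minimal standalone sketch of the same check with illustrative test strings; the strings and the main() harness are not part of the commit.

#include <cassert>
#include <string>

// Mirrors the committed helper: true only for a non-empty string whose
// bytes are all above 0x7F (i.e. it contains no ASCII bytes at all).
static bool all_extended_unicode(const std::string& str) {
    if (str.empty()) {
        return false;
    }
    for (unsigned char c : str) {
        if (c <= 127) {
            return false;
        }
    }
    return true;
}

int main() {
    assert(!all_extended_unicode(""));       // empty string
    assert(!all_extended_unicode("hello"));  // pure ASCII
    assert(!all_extended_unicode("héllo"));  // mixed: 'h' is an ASCII byte
    assert(all_extended_unicode("日本語"));   // every UTF-8 byte is >= 0x80
    return 0;
}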
@@ -322,6 +335,7 @@ static void print_tok_vec_str(std::vector<int> &vec)
 // tail tokens are generated by tokenizing the remainder.
 // If max_tail_len is >= 0, the maximum token length of a tail sequence is clamped to this value.
 static void GetOverlappingTokenSequences(const std::string& str, std::unordered_multimap<gpt_vocab::id, std::vector<gpt_vocab::id>>& token_sequences, int max_tail_len = -1) {
+    bool isAllExtendedUnicode = allExtendedUnicode(str);
     for(int v=0;v<n_vocab;++v)
     {
         std::string word = FileFormatTokenizeID(v, file_format, true);
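For context on the head/tail split the comments describe: a vocabulary token can act as a "head" when some suffix of its text matches a prefix of `str`, and the uncovered remainder of `str` is then tokenized into the "tail" sequence. The snippet below is a simplified standalone sketch of that overlap idea, not the committed loop (which scans for occurrences of str[0] inside each token and compares forward); the function name and test strings are illustrative only.

#include <algorithm>
#include <cstdio>
#include <string>

// Simplified sketch: length of the longest suffix of `word` that is also
// a prefix of `str`. A non-zero result means `word` could serve as a head
// token, with the uncovered rest of `str` tokenized separately as the tail.
static size_t suffix_prefix_overlap(const std::string& word, const std::string& str) {
    size_t best = 0;
    size_t max_len = std::min(word.size(), str.size());
    for (size_t len = 1; len <= max_len; ++len) {
        if (word.compare(word.size() - len, len, str, 0, len) == 0) {
            best = len;
        }
    }
    return best;
}

int main() {
    // The suffix "ing " of the token text matches the first 4 bytes of the
    // search string, so the remaining "home" would become the tail.
    printf("%zu\n", suffix_prefix_overlap("walking ", "ing home"));  // prints 4
    return 0;
}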
@@ -355,7 +369,7 @@ static void GetOverlappingTokenSequences(const std::string& str, std::unordered_
                         break;
                     }
                 }
-                if (match) {
+                if (match && !isAllExtendedUnicode) {
                     // We matched to the end of the string. Since `str` is not contained in `word`,
                     // there must be trailing letters in `str`.
                     std::vector<gpt_vocab::id> tokenization;
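Why the guard matters, as the commit message suggests: the DRY matching works on raw bytes, so when `str` consists only of multi-byte UTF-8 characters, the remainder handed to tail tokenization can start partway through a character, and tokenizing that invalid fragment is what segfaulted. The snippet below only illustrates how a byte-level substring of such a string stops being valid UTF-8; the string literal is illustrative, not from the commit.

#include <cstdio>
#include <string>

int main() {
    // "日本" is six bytes in UTF-8: E6 97 A5 E6 9C AC.
    std::string str = "日本";
    // A remainder that starts one byte into the first character is no
    // longer valid UTF-8 (it begins with a bare continuation byte).
    std::string tail = str.substr(1);
    for (unsigned char c : tail) {
        printf("%02X ", c);  // prints: 97 A5 E6 9C AC
    }
    printf("\n");
    return 0;
}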