#include "ttstokenizer.h"
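
// token_trie is a small character-level trie built over the tokenizer vocabulary.
// It lets the unigram tokenizer enumerate every vocab entry that starts at a given
// byte offset by walking the text one character at a time.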
void token_trie::add(const std::string & gram, uint32_t token) {
    _add(gram, token, 0);
}

void token_trie::_add(const std::string & gram, uint32_t new_token, size_t index) {
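    // recursively descend one character at a time, creating children as needed;
    // once the whole gram has been consumed, mark this node as a complete token.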
    if (index >= gram.size()) {
        has_value = true;
        token = new_token;
        return;
    }
    const char c = gram[index];
    auto res = children.find(c);
    if (res != children.end()) {
        res->second._add(gram, new_token, index + 1);
    } else {
        struct token_trie nt{};
        nt._add(gram, new_token, index + 1);
        children[c] = nt;
    }
}

const struct token_trie * token_trie::traverse(const char c) const {
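    // follow the edge for character c; returns NULL when no vocab entry continues this prefix.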
    auto res = children.find(c);
    if (res != children.end()) {
        return &res->second;
    }

    return NULL;
}

size_t unicode_len_utf8_tts(char src) {
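    // length in bytes of the UTF-8 sequence that starts with this byte, derived from
    // the byte's high nibble (0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4).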
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

void unigram_tokenizer::initialize_tokenizer() {
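    // build the prefix trie from the vocabulary so tokenize() can walk it at each offset.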
    for (const auto & it : vocab) {
        root_trie.add(it.first, it.second);
    }
    init = true;
}

// The general approach here is to find the sequence of character grams whose scores sum to the maximum possible value over the entire text sequence.
// The particular algorithm used here effectively works by walking the text and, at each index, storing the max value over all possible gram combinations ending there.
// We can then walk that record backwards from the end to pick the best possible tokens.
void unigram_tokenizer::tokenize(const std::string & text, std::vector<uint32_t> & tokens) {
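    // results[i] holds the best-scoring tokenization of the first i bytes of the
    // normalized text: the token that ends at i, the offset where that token starts,
    // and the cumulative score. The final token sequence is recovered by walking the
    // offsets backwards from results[text_length].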
    if (!init) {
        TTS_ABORT("Error: %s\nTokenizer must be initialized before #tokenize is called.", __func__);
    }
    // the parler tokenizer's normalizer (i.e. the bert normalizer implemented by huggingface tokenizers libs) only deduplicates and strips extra spaces, and
    // optionally handles Chinese characters and accents (neither of which are currently supported here).
    std::string normalized = text;
    if (dedupe_spaces) {
        normalized = " " + std::regex_replace(text, duped_spaces, " ");
    }

    size_t text_length = normalized.size();

    // initialize score_sum to negative infinity so it will always be lower than any sum of token scores
    std::vector<struct result> results(text_length + 1, {unk_token, 0, -INFINITY});
    results[0] = { unk_token, 0, 0 };

    size_t offset = 0;

    while (offset < text_length) {
        size_t current_offset = offset;
        // pulled this directly from llama.cpp; I suspect that this is for handling of non-utf8 steps (to be marked as unknown tokens)
        size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8_tts(normalized[offset]), text_length - offset);

        bool found_unknown = true;
        const struct result & current_best = results[offset];

        // find the current branch in the trie
        const struct token_trie * node = root_trie.traverse(normalized[current_offset++]);
        // search for the next token
        while (current_offset <= text_length && node != NULL) {
            // check if this is a complete token (it could just be an unknown step between two tokens).
            if (node->has_value) {
                // check if it corresponds to the whole utf8 step
                if (current_offset - offset == n_utf8_code_units) {
                    found_unknown = false;
                }
                float score = current_best.score + scores[node->token];
                struct result & current_champ = results[current_offset];
                if (score > current_champ.score) {
                    struct result challenger = { node->token, offset, score };
                    current_champ = challenger;
                }
            }
            node = node->traverse(normalized[current_offset++]);
        }

        // if we found an unknown token, process it
        if (found_unknown) {
            current_offset = offset + n_utf8_code_units;
            struct result & current_champ = results[current_offset];
            float score = current_best.score + unk_token_score;
            if (score > current_champ.score) {
                struct result challenger = { unk_token, offset, score };
                current_champ = challenger;
            }
        }

        // move one utf8 step
        offset += n_utf8_code_units;
    }

    // if we have more than one unknown token in a row, we can join them.
    bool is_prev_unknown = false;
    // iterate from the last result backwards and get the best performing tokens
    for (struct result & result = results[text_length]; ; result = results[result.offset]) {
        bool is_unknown = result.token == unk_token;
        if (!(is_prev_unknown && is_unknown)) {
            tokens.push_back(result.token);
        }
        if (result.offset == 0) {
            break;
        }
        is_prev_unknown = is_unknown;
    }

    // reverse the tokens since we added them starting from the end of the input
    std::reverse(tokens.begin(), tokens.end());
}

// load the vocab into the tokenizer from a gguf file.
unigram_tokenizer * unigram_tokenizer_from_gguf(gguf_context * meta) {
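    // note: the tokens, scores and unknown_token_id keys are assumed to be present;
    // gguf_find_key failures are not checked here (unlike the eos_token_id lookup below).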
    std::unordered_map<std::string, uint32_t> vocab;
    std::vector<float> scores;
    int vocab_key = gguf_find_key(meta, "tokenizer.ggml.tokens");
    int vocab_size = gguf_get_arr_n(meta, vocab_key);
    scores.reserve(vocab_size);
    for (int i = 0; i < vocab_size; i++) {
        std::string val = gguf_get_arr_str(meta, vocab_key, i);
        vocab[val] = (uint32_t) i;
    }
    int scores_key = gguf_find_key(meta, "tokenizer.ggml.scores");
    int scores_size = gguf_get_arr_n(meta, scores_key);
    assert(scores_size == vocab_size);
    float * data = (float*) gguf_get_arr_data(meta, scores_key);
    for (int i = 0; i < scores_size; i++) {
        scores.push_back(data[i]);
    }
    int unknown_token_key = gguf_find_key(meta, "tokenizer.ggml.unknown_token_id");
    uint32_t token = gguf_get_val_u32(meta, unknown_token_key);

    auto tokenizer = new unigram_tokenizer(vocab, token, scores[token], scores);

    int eos_token_key = gguf_find_key(meta, "tokenizer.ggml.eos_token_id");
    if (eos_token_key != -1) {
        tokenizer->eos_token = gguf_get_val_u32(meta, eos_token_key);
    }
    return tokenizer;
}

void single_pass_tokenizer::tokenize(const std::string & text, std::vector<uint32_t> & token_ids) {
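    // scan prefixes of increasing length and emit the first (shortest) prefix found in
    // the token list; if none of the first max_size prefixes match, drop one byte and
    // emit unknown_id for it.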
    std::string remaining = text;
    while (remaining.size() > 0) {
        uint32_t token_id = unknown_id;
        for (int i = 1; i < std::min(remaining.size()+1, max_size+1); i++) {
            std::string part = remaining.substr(0, i);
            ptrdiff_t pos = std::distance(tokens.begin(), std::find(tokens.begin(), tokens.end(), part));
            if (pos < tokens.size()) {
                token_id = (uint32_t) pos;
                remaining = remaining.substr(part.size(), remaining.size() - part.size());
                break;
            }
        }
        if (token_id == unknown_id) {
            remaining = remaining.substr(1, remaining.size() - 1);
        }
        token_ids.push_back(token_id);
    }
}

void single_pass_tokenizer::token_split(const std::string & text, std::vector<std::string> & tokens) {
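    // split the text into the longest pieces such that every piece (except unmatched
    // single characters) is itself an entry in token_vocab; extension stops as soon as
    // a longer prefix is not in the vocab.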
    std::string remaining = text;
    while (remaining.size() > 0) {
        // String copying is much slower than using a std::string_view, but the former is simpler to implement for now.
        std::string token = remaining.substr(0, 1);
        for (int i = 1; i < remaining.size(); i++) {
            std::string part = remaining.substr(0, i+1);
            if (token_vocab.find(part) == token_vocab.end()) {
                break;
            }
            token = part;
        }
        tokens.push_back(token);
        remaining = remaining.substr(token.size(), remaining.size() - token.size());
    }
}

struct single_pass_tokenizer * single_pass_tokenizer_from_gguf(gguf_context * meta, std::string key_name) {
    int tokens_key = gguf_find_key(meta, key_name.c_str());
    if (tokens_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support single pass tokenization.", key_name.c_str());
    }
    std::vector<std::string> tokens;
    int token_count = gguf_get_arr_n(meta, tokens_key);
    for (int i = 0; i < token_count; i++) {
        tokens.push_back(gguf_get_arr_str(meta, tokens_key, i));
    }
    return new single_pass_tokenizer(tokens);
}

void bpe_symbol::add_merges(std::priority_queue<bpe_merge, std::vector<bpe_merge>, bpe_merge_comp> & merges, std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map, bool only_forward) {
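    // push candidate merges between this symbol and its neighbours onto the priority
    // queue; only pairs that appear in rank_map (the learned merge table) are added.
    // only_forward skips the (last, this) pair so the initial pass does not queue each
    // adjacent pair twice.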
    if (!only_forward && last) {
        auto rid = std::make_pair<std::string, std::string>(last->as_str(), as_str());
        if (rank_map.find(rid) != rank_map.end()) {
            bpe_merge m{last, this, rank_map[rid], last->size + size};
            merges.push(m);
        }
    }

    if (next) {
        auto rid = std::make_pair<std::string, std::string>(as_str(), next->as_str());
        if (rank_map.find(rid) != rank_map.end()) {
            bpe_merge m{this, next, rank_map[rid], size + next->size};
            merges.push(m);
        }
    }
}

std::string bpe_symbol::as_str() {
    return std::string(token, size);
}

bool bpe_merge_comp::operator() (const bpe_merge & a, const bpe_merge & b) {
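    // priority_queue comparator: lower rank (an earlier learned merge) pops first;
    // ties are broken by the leftmost position in the chunk.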
    return a.rank > b.rank || (a.rank == b.rank && a.a && b.a && a.a->pos > b.a->pos);
}

size_t pair_hash::operator() (const std::pair<std::string, std::string> & p) const {
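    // combine the hashes of both strings so a merge pair can be used as an unordered_map key.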
    return std::hash<std::string>{}(p.first) ^ (std::hash<std::string>{}(p.second) << 1);
}

bpe_symbol * bpe_merge::merge() {
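    // fold b into a: a grows to cover b's bytes, b is marked dead with size -1, and the
    // doubly linked list of symbols is re-stitched around it.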
    a->size += b->size;
    b->size = -1;
    a->next = b->next;
    if (a->next) {
        a->next->last = a;
    }
    return a;
}

void pair_builder::join_pairs(std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> & rank_map) {
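    // repeatedly apply the lowest-ranked applicable merge. Entries popped from the queue
    // may be stale after earlier merges, so a merge is only applied when both symbols are
    // still alive and their combined size still matches the size recorded at push time.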
    std::priority_queue<bpe_merge, std::vector<bpe_merge>, bpe_merge_comp> merges;
    for (auto part : parts) {
        part->add_merges(merges, rank_map, true);
    }
    while (!merges.empty()) {
        auto m = merges.top();
        merges.pop();
        if (m.a->size > 0 && m.b->size > 0 && m.new_size == m.a->size + m.b->size) {
            m.merge();
            m.a->add_merges(merges, rank_map);
        }
    }
}

void bpe_tokenizer::tokenize(const std::string & text, std::vector<uint32_t> & token_ids) {
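    // split on spaces and BPE-encode each chunk; a space before a chunk is folded into it
    // as the "Ġ" prefix (the convention GPT-2 style BPE vocabularies use for a word boundary).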
    std::vector<std::string> chunks = split(text, " ", true);
    bool space_prior = false;
    for (auto chunk : chunks) {
        if (chunk != " ") {
            bpe_tokenize(space_prior ? "Ġ" + chunk : chunk, token_ids);
        } else {
            space_prior = true;
        }
    }
}

void bpe_tokenizer::bpe_tokenize(std::string chunk, std::vector<uint32_t> & token_ids) {
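    // fast path: the whole chunk is already a single token. Otherwise split the chunk into
    // symbols, apply the ranked merges, and emit the id of every surviving symbol.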
    if (tokens_to_ids.find(chunk) != tokens_to_ids.end()) {
        token_ids.push_back(tokens_to_ids[chunk]);
        return;
    }
    auto pb = pair_builder{chunk};
    pb.join_pairs(ranks);
    bpe_symbol * next = pb.parts[0];
    while (next) {
        token_ids.push_back(tokens_to_ids[next->as_str()]);
        next = next->next;
    }
}

bpe_tokenizer * bpe_tokenizer_from_gguf(gguf_context * meta, std::string base_name) {
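    // requires <base_name>.tokens, .merges, .bos_token_id and .eos_token_id to be present
    // in the gguf metadata; each missing key aborts with an explanatory message.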
    int vocab_key = gguf_find_key(meta, (base_name + ".tokens").c_str());
    if (vocab_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".tokens").c_str());
    }
    int merges_key = gguf_find_key(meta, (base_name + ".merges").c_str());
    if (merges_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".merges").c_str());
    }
    int eos_token_id_key = gguf_find_key(meta, (base_name + ".eos_token_id").c_str());
    if (eos_token_id_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".eos_token_id").c_str());
    }
    int bos_token_id_key = gguf_find_key(meta, (base_name + ".bos_token_id").c_str());
    if (bos_token_id_key == -1) {
        TTS_ABORT("The '%s' key must be set in order to support BPE tokenization.", (base_name + ".bos_token_id").c_str());
    }

    uint32_t bos_token_id = gguf_get_val_u32(meta, bos_token_id_key);
    uint32_t eos_token_id = gguf_get_val_u32(meta, eos_token_id_key);

    std::unordered_map<std::string, uint32_t> vocab;
    int token_count = gguf_get_arr_n(meta, vocab_key);
    for (int i = 0; i < token_count; i++) {
        vocab[gguf_get_arr_str(meta, vocab_key, i)] = (uint32_t) i;
    }

    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> ranks;
    int merge_count = gguf_get_arr_n(meta, merges_key);

    for (int i = 0; i < merge_count; i++) {
        auto raw_merge = gguf_get_arr_str(meta, merges_key, i);
        std::vector<std::string> pair = split(raw_merge, " ");
        if (pair.size() != 2) {
            TTS_ABORT("Invalid pair, '%s', found in BPE merges, '%s', at index %d.", raw_merge, (base_name + ".merges").c_str(), i);
        }
        ranks[std::make_pair<>(pair[0], pair[1])] = i;
    }

    return new bpe_tokenizer(vocab, ranks, bos_token_id, eos_token_id);
}