mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
added MPT support
This commit is contained in:
parent
9839259b63
commit
6f82e17b7a
10 changed files with 983 additions and 48 deletions
|
@ -1,7 +1,12 @@
|
|||
#include "utils.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <regex>
|
||||
#include <locale>
|
||||
#include <codecvt>
|
||||
#include <sstream>
|
||||
|
||||
|
||||
|
||||
|
@ -109,24 +114,16 @@ void gpt_vocab::add_special_token(const std::string & token) {
|
|||
special_tokens.push_back(token);
|
||||
}
|
||||
|
||||
static void append_utf8(char32_t ch, std::string & out) {
|
||||
if (ch <= 0x7F) {
|
||||
out.push_back(static_cast<unsigned char>(ch));
|
||||
} else if (ch <= 0x7FF) {
|
||||
out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
|
||||
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
|
||||
} else if (ch <= 0xFFFF) {
|
||||
out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
|
||||
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
|
||||
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
|
||||
} else if (ch <= 0x10FFFF) {
|
||||
out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
|
||||
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
|
||||
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
|
||||
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
|
||||
} else {
|
||||
printf("Invalid Unicode code point\n");
|
||||
}
|
||||
|
||||
std::string convert_to_utf8(const std::wstring & input) {
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
return converter.to_bytes(input);
|
||||
}
|
||||
|
||||
|
||||
std::wstring convert_to_wstring(const std::string & input) {
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
return converter.from_bytes(input);
|
||||
}
|
||||
|
||||
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
||||
|
@ -162,40 +159,27 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
|||
}
|
||||
}
|
||||
|
||||
// find the longest tokens that form the words:
|
||||
// find the longest token that forms each word in words:
|
||||
std::vector<gpt_vocab::id> tokens;
|
||||
for (const auto & word : words) {
|
||||
if (word.size() == 0) continue;
|
||||
|
||||
int i = 0;
|
||||
int n = word.size();
|
||||
while (i < n) {
|
||||
int j = n;
|
||||
while (j > i) {
|
||||
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
||||
if (it != vocab.token_to_id.end()) {
|
||||
for (int i = 0; i < word.size(); ){
|
||||
for (int j = word.size() - 1; j >= i; j--){
|
||||
auto cand = word.substr(i, j-i+1);
|
||||
auto it = vocab.token_to_id.find(cand);
|
||||
if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
|
||||
tokens.push_back(it->second);
|
||||
i = j;
|
||||
j = n;
|
||||
continue;
|
||||
i = j + 1;
|
||||
break;
|
||||
}
|
||||
--j;
|
||||
}
|
||||
if (i == n) {
|
||||
break;
|
||||
}
|
||||
if (j == i) {
|
||||
auto sub = word.substr(i, 1);
|
||||
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
||||
tokens.push_back(vocab.token_to_id.at(sub));
|
||||
} else {
|
||||
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
||||
else if (j == i){ // word.substr(i, 1) has no matching
|
||||
fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
|
||||
i++;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue