Merge branch 'master' into concedo_experimental

# Conflicts:
#	README.md
#	ci/run.sh
#	llama.cpp
#	scripts/sync-ggml.last
Concedo 2024-03-01 17:41:37 +08:00
commit 55af5446ad
23 changed files with 1069 additions and 743 deletions

llama.cpp (237 changes)

@@ -73,10 +73,12 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
 #include <initializer_list>
+#include <locale>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -7964,9 +7966,9 @@ static int llama_decode_internal(
     const auto n_batch = cparams.n_batch;

     GGML_ASSERT(n_tokens <= n_batch);
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

     int n_threads = n_tokens < 32 ? cparams.n_threads : cparams.n_threads_batch;
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

     const int64_t t_start_us = ggml_time_us();
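
Note on the hunk above: the assert that exactly one of batch.token / batch.embd is set moved ahead of the thread selection, and the thread count itself switches on batch size. A minimal self-contained sketch of that heuristic (the cparams values here are hypothetical; the cutoff mirrors the line shown):

    #include <cassert>
    #include <cstdint>

    struct sketch_cparams { uint32_t n_threads; uint32_t n_threads_batch; };

    // small decodes use the interactive thread count,
    // large prompt batches use the batch thread count
    static uint32_t pick_n_threads(const sketch_cparams & cp, uint32_t n_tokens) {
        return n_tokens < 32 ? cp.n_threads : cp.n_threads_batch;
    }

    int main() {
        sketch_cparams cp = {8, 16};
        assert(pick_n_threads(cp, 1)   == 8);  // token-by-token generation
        assert(pick_n_threads(cp, 512) == 16); // prompt processing
        return 0;
    }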
@@ -9237,37 +9239,46 @@ struct llm_tokenizer_wpm {
     }

     std::vector<std::string> preprocess(const std::string & text) {
-        std::string ori_str = normalize(text);
-        uint64_t ori_size = ori_str.size();
+        // normalization form D
+        std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
+        std::vector<uint32_t> nfd_codepoints;
+        for (uint32_t code : codepoints) {
+            auto it = nfd_map.find(code);
+            if (it != nfd_map.end()) {
+                for (uint32_t c : it->second) {
+                    nfd_codepoints.push_back(c);
+                }
+            } else {
+                nfd_codepoints.push_back(code);
+            }
+        }

-        // single punct / single symbol / single digit
-        // baseline: add whitespace on the left and right of punct and chinese characters
-        std::vector<std::string> words;
+        // strip accents, strip control, uniformize whitespace,
+        // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
-        uint64_t i = 0;
-        while (i < ori_size) {
-            int utf_char_len = utf8_len(ori_str[i]);
-            if ((utf_char_len == 1) && ispunct(ori_str[i])) {
-                new_str += " ";
-                new_str += ori_str[i];
-                new_str += " ";
-                i += 1;
-            }
-            else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
-                new_str += " ";
-                new_str += ori_str.substr(i, 3);
-                new_str += " ";
-                i += 3;
-            }
-            else {
-                new_str += ori_str[i];
-                i += 1;
-            }
-        }
+        for (uint32_t code : nfd_codepoints) {
+            int type = codepoint_type(code);
+            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+                continue;
+            }
+            code = to_lower(code);
+            if (type == CODEPOINT_TYPE_WHITESPACE) {
+                code = ' ';
+            }
+            std::string s = codepoint_to_utf8(code);
+            if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
+                new_str += " ";
+                new_str += s;
+                new_str += " ";
+            } else {
+                new_str += s;
+            }
+        }

         // split by whitespace
         uint64_t l = 0;
         uint64_t r = 0;
+        std::vector<std::string> words;
         while (r < new_str.size()) {
             // if is whitespace
             if (isspace(new_str[r])) {
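
The rewritten preprocess is easier to follow outside the tokenizer. Below is a minimal, self-contained sketch of the same pipeline (NFD-decompose via a lookup map, drop combining marks, lowercase, pad punctuation, split on whitespace), using a one-entry stand-in for nfd_map and plain ASCII checks in place of the unicode.h helpers codepoint_type and codepoint_to_utf8:

    #include <cctype>
    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
        // stand-in NFD table: U+00E9 (é) -> 'e' + U+0301 (combining acute)
        std::map<uint32_t, std::vector<uint32_t>> nfd_map = {{0x00E9, {'e', 0x0301}}};
        std::vector<uint32_t> input = {'H', 'i', ',', 0x00E9, '!'};

        std::string padded;
        for (uint32_t code : input) {
            std::vector<uint32_t> decomposed = {code};
            auto it = nfd_map.find(code);
            if (it != nfd_map.end()) decomposed = it->second; // NFD decompose
            for (uint32_t c : decomposed) {
                if (c == 0x0301) continue;           // drop the combining accent mark
                c = std::tolower((int) c);           // ASCII lowercase for the sketch
                if (std::ispunct((int) c)) {         // pad punctuation with spaces
                    padded += ' '; padded += (char) c; padded += ' ';
                } else {
                    padded += (char) c;
                }
            }
        }

        std::vector<std::string> words;              // split on whitespace
        std::string cur;
        for (char ch : padded) {
            if (std::isspace((unsigned char) ch)) {
                if (!cur.empty()) { words.push_back(cur); cur.clear(); }
            } else {
                cur += ch;
            }
        }
        if (!cur.empty()) words.push_back(cur);

        for (auto & w : words) printf("[%s] ", w.c_str()); // [hi] [,] [e] [!]
        return 0;
    }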
@@ -9285,47 +9296,20 @@ struct llm_tokenizer_wpm {
         return words;
     }

-    std::string normalize(const std::string & text) {
-        // TODO: handle chinese characters? https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98
-        std::string text2 = strip_accents(text);
-        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
-            char c = text2[i];
-            if (c >= 'A' && c <= 'Z') {
-                text2[i] = c - 'A' + 'a';
-            }
-        }
-        return text2;
-    }
+    uint32_t to_lower(uint32_t code) {
+#if defined(_WIN32)
+        if (code > 0xFFFF) {
+            return code;
+        }
+#endif
+        return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
+    }

-    bool is_chinese_char(const std::string & str) {
-        int len = str.length();
-        unsigned int codepoint = 0;
-        int num_bytes = 0;
-        int i = 0;
-        unsigned char ch = static_cast<unsigned char>(str[i]);
-        if (ch <= 0x7f) {
-            codepoint = ch;
-            num_bytes = 1;
-        } else if ((ch >> 5) == 0x06) {
-            codepoint = ch & 0x1f;
-            num_bytes = 2;
-        } else if ((ch >> 4) == 0x0e) {
-            codepoint = ch & 0x0f;
-            num_bytes = 3;
-        } else if ((ch >> 3) == 0x1e) {
-            codepoint = ch & 0x07;
-            num_bytes = 4;
-        }
-        for (int j = 1; j < num_bytes; ++j) {
-            if (i + j >= len) {
-                return false; // incomplete UTF-8 character
-            }
-            unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
-            if ((next_ch >> 6) != 0x02) {
-                return false; // invalid trailing byte
-            }
-            codepoint = (codepoint << 6) | (next_ch & 0x3f);
-        }
+    bool is_ascii_punct(uint32_t code) {
+        return code < 256 && ispunct(code);
+    }

+    bool is_chinese_char(uint32_t codepoint) {
         if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
             (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
             (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
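
The new is_chinese_char classifies by codepoint range directly, with no UTF-8 byte decoding. A self-contained sketch using just the three CJK ranges visible above (the full function checks several more):

    #include <cstdint>
    #include <cstdio>

    static bool is_cjk(uint32_t cp) {
        return (cp >= 0x4E00  && cp <= 0x9FFF)  ||  // CJK Unified Ideographs
               (cp >= 0x3400  && cp <= 0x4DBF)  ||  // Extension A
               (cp >= 0x20000 && cp <= 0x2A6DF);    // Extension B
    }

    int main() {
        printf("%d\n", is_cjk(0x4F60));  // U+4F60 (你) -> 1
        printf("%d\n", is_cjk(0x0041));  // 'A'         -> 0
        printf("%d\n", is_cjk(0x20001)); // U+20001     -> 1
        return 0;
    }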
@@ -9341,41 +9325,6 @@ struct llm_tokenizer_wpm {
         return false;
     }

-    std::string strip_accents(const std::string & input_string) {
-        std::string resultString;
-        std::map<std::string, char> accent_map = {
-            {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
-            {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
-            {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
-            {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
-            {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
-            {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
-            {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
-            {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
-            {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
-        };
-
-        for (size_t i = 0; i < input_string.length();) {
-            int len = utf8_len(input_string[i]);
-            std::string curChar = input_string.substr(i, len);
-            auto iter = accent_map.find(curChar);
-            if (iter != accent_map.end()) {
-                resultString += iter->second;
-            } else {
-                resultString += curChar;
-            }
-            i += len;
-        }
-
-        return resultString;
-    }
-
-    static size_t utf8_len(char src) {
-        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
-        uint8_t highbits = static_cast<uint8_t>(src) >> 4;
-        return lookup[highbits];
-    }

     const llama_vocab & vocab;
 };
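
For reference, the removed utf8_len read the sequence length off the top four bits of the lead byte (0xxx -> 1, 110x -> 2, 1110 -> 3, 1111 -> 4). A self-contained sketch with spot checks:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    static size_t utf8_len(char src) {
        // index = high nibble of the lead byte
        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
        uint8_t highbits = static_cast<uint8_t>(src) >> 4;
        return lookup[highbits];
    }

    int main() {
        printf("%zu\n", utf8_len('A'));         // 0x41, nibble 0x4 -> 1
        printf("%zu\n", utf8_len(char(0xC3)));  // nibble 0xC -> 2 (é lead byte)
        printf("%zu\n", utf8_len(char(0xE4)));  // nibble 0xE -> 3 (CJK lead byte)
        printf("%zu\n", utf8_len(char(0xF0)));  // nibble 0xF -> 4 (supplementary plane)
        return 0;
    }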
@@ -10433,13 +10382,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
     }
 }

-void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp, float smoothing_factor) {
-    llama_sample_temp(ctx, candidates_p, temp, smoothing_factor);
-}

 // The llama.cpp repetition penalty code goes unused in kobold's API
 void llama_sample_repetition_penalties(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
@@ -10566,38 +10508,6 @@ void llama_sample_apply_guidance(
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }

-void llama_sample_classifier_free_guidance(
-          struct llama_context * ctx,
-        llama_token_data_array * candidates,
-          struct llama_context * guidance_ctx,
-                         float   scale) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us;
-
-    t_start_sample_us = ggml_time_us();
-    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    GGML_ASSERT(n_vocab == candidates->size);
-    GGML_ASSERT(!candidates->sorted);
-
-    std::vector<float> logits_base(n_vocab);
-    for (size_t i = 0; i < n_vocab; ++i) {
-        logits_base[i] = candidates->data[i].logit;
-    }
-
-    float * logits_guidance = llama_get_logits(guidance_ctx);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
-    t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < n_vocab; ++i) {
-        candidates->data[i].logit = logits_base[i];
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}

 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);
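
Callers of the removed wrapper get the same behavior from llama_sample_apply_guidance, which operates on a raw logits array. This fragment mirrors the deleted body (minus the timing bookkeeping); ctx, guidance_ctx, candidates, and scale are assumed to be what the wrapper received:

    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));

    std::vector<float> logits_base(n_vocab);
    for (size_t i = 0; i < n_vocab; ++i) {
        logits_base[i] = candidates->data[i].logit;  // snapshot the candidate logits
    }

    float * logits_guidance = llama_get_logits(guidance_ctx);
    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);

    for (size_t i = 0; i < n_vocab; ++i) {
        candidates->data[i].logit = logits_base[i];  // write the guided logits back
    }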
@@ -11536,7 +11446,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= !params->only_copy;

         // do not quantize expert gating tensors
-        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;

         // do not quantize positional embeddings and token types (BERT)
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
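
A small self-contained sketch of that substring filter; the names follow llama.cpp's blk.<n>.<tensor> convention, which is exactly why LLM_TN (which needs the layer number) cannot build the comparison string here:

    #include <cstdio>
    #include <string>

    int main() {
        const std::string names[] = {
            "blk.0.ffn_gate_inp.weight",  // expert gating tensor: keep unquantized
            "blk.0.ffn_up.weight",        // ordinary tensor: quantize
        };
        for (const std::string & name : names) {
            bool quantize = name.find("ffn_gate_inp.weight") == std::string::npos;
            printf("%-28s quantize=%d\n", name.c_str(), quantize);
        }
        return 0;
    }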
@@ -12087,15 +11998,6 @@ bool llama_supports_gpu_offload(void) {
 #endif
 }

-// deprecated:
-bool llama_mmap_supported(void) {
-    return llama_supports_mmap();
-}
-
-bool llama_mlock_supported(void) {
-    return llama_supports_mlock();
-}

 void llama_backend_init(void) {
     ggml_time_init();
@@ -12607,15 +12509,6 @@ uint32_t llama_model_quantize(
     }
 }

-int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-    try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
-    }
-}

 int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
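
With the context-level entry point gone, callers move to the model-level function kept above. A hedged sketch of the migration (the adapter path and thread count are hypothetical):

    const struct llama_model * model = llama_get_model(ctx);
    const int32_t err = llama_model_apply_lora_from_file(
        model, "adapter.bin", 1.0f /*scale*/, nullptr /*path_base_model*/, 4 /*n_threads*/);
    if (err != 0) {
        fprintf(stderr, "failed to apply lora adapter\n");
    }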
@@ -12962,8 +12855,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }

 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
-    uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * inp = src;

     // set rng
     {
@@ -12972,7 +12865,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);

-        std::string rng_str((char *)inp, rng_size); inp += rng_size;
+        std::string rng_str((const char *)inp, rng_size); inp += rng_size;

         std::istringstream rng_ss(rng_str);
         rng_ss >> ctx->rng;
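
The rng_ss >> ctx->rng line works because std::mt19937 defines stream operators for saving and restoring its full state. A self-contained round-trip sketch:

    #include <cassert>
    #include <random>
    #include <sstream>

    int main() {
        std::mt19937 rng(1234);

        std::ostringstream out;
        out << rng;                     // serialize (as in llama_copy_state_data)
        (void) rng();                   // advance the original past the snapshot

        std::mt19937 restored;
        std::istringstream in(out.str());
        in >> restored;                 // deserialize (as in llama_set_state_data)

        std::mt19937 fresh(1234);
        assert(restored() == fresh());  // restored state matches the snapshot
        return 0;
    }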
@@ -13175,38 +13068,6 @@ void printcache(struct llama_context * ctx)
     printf("%s",vals.c_str());
 }

-int llama_eval(
-        struct llama_context * ctx,
-                 llama_token * tokens,
-                     int32_t   n_tokens,
-                     int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
-int llama_eval_embd(
-        struct llama_context * ctx,
-                       float * embd,
-                     int32_t   n_tokens,
-                     int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
-
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}

 void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
     ctx->cparams.n_threads = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
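
Code that still calls the removed llama_eval migrates to llama_decode with a batch from llama_batch_get_one. A hedged fragment with ctx, tokens, n_tokens, and n_past as before; the explicit cache trim reproduces what the wrapper did internally:

    llama_kv_cache_seq_rm(ctx, -1, n_past, -1);  // drop cache entries past n_past

    if (llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)) != 0) {
        fprintf(stderr, "llama_decode failed\n");
    }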