Clean up indentation, fix cuBLAS build

Concedo 2023-06-08 22:40:53 +08:00
parent a979e71ddc
commit a6a0fa338a
9 changed files with 933 additions and 127 deletions


@@ -97,14 +97,14 @@ inline bool LogitsDuplicated(std::vector<float> & arr1, std::vector<float> & arr
}
llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng)
{
llama_sample_softmax(nullptr, candidates);
std::vector<float> probs;
probs.reserve(candidates->size);
top_picks.clear();
for (size_t i = 0; i < candidates->size; ++i) {
probs.push_back(candidates->data[i].p);
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
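Note: the sampling step above copies the softmaxed probabilities into a flat vector and lets std::discrete_distribution pick an index. A minimal standalone sketch of the same idea, using a stand-in Candidate struct rather than the real llama_token_data type:
#include <cstdio>
#include <random>
#include <vector>
// Stand-in for llama_token_data: token id plus its (already softmaxed) probability.
struct Candidate { int id; float p; };
// Draw one token id, weighted by probability, mirroring sample_token above.
int pick_token(const std::vector<Candidate> &cands, std::mt19937 &rng) {
    std::vector<float> probs;
    probs.reserve(cands.size());
    for (const auto &c : cands) probs.push_back(c.p);
    std::discrete_distribution<> dist(probs.begin(), probs.end());
    return cands[dist(rng)].id;
}
int main() {
    std::mt19937 rng(1337);
    std::vector<Candidate> cands = {{10, 0.7f}, {11, 0.2f}, {12, 0.1f}};
    std::printf("sampled id: %d\n", pick_token(cands, rng));
}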
@@ -113,21 +113,21 @@ llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng
if(debugmode)
{
top_picks.push_back(candidates->data[idx]);
for (size_t i = 0; (i < candidates->size && i<4); ++i)
{
if(i!=idx)
{
top_picks.push_back(candidates->data[i]);
}
}
}
}
llama_token result = candidates->data[idx].id;
return result;
}
llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, int m, float * mu)
{
float N = float(n_vocab);
llama_sample_softmax(nullptr, candidates);
// Estimate s_hat using the most probable m tokens
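Note: the mirostat sampler targets a fixed surprise value tau; after estimating s_hat from the top m probabilities it picks a token and nudges mu by the sampling error. A hedged sketch of just that feedback step (the s_hat/k estimation is omitted):
#include <cmath>
// Mirostat feedback: move mu toward the target surprise tau.
// sampled_p is the probability of the token that was just drawn,
// eta is the learning rate; mu typically starts at 2*tau.
void mirostat_update(float sampled_p, float tau, float eta, float *mu) {
    float observed_surprise = -std::log2(sampled_p);
    float error = observed_surprise - tau;
    *mu -= eta * error;
}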
@@ -157,7 +157,7 @@ llama_token sample_token_mirostat(int n_vocab, llama_token_data_array * candidat
return X;
}
llama_token sample_token_mirostat_v2(llama_token_data_array * candidates, std::mt19937 & rng, float tau, float eta, float * mu)
{
llama_sample_softmax(nullptr, candidates);
// Truncate the words with surprise values greater than mu
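Note: mirostat v2 skips the s_hat estimate and simply discards every candidate whose surprise (-log2 p) exceeds mu before renormalizing and sampling. A simplified sketch of that truncation over a probability-sorted list:
#include <algorithm>
#include <cmath>
#include <vector>
struct Candidate { int id; float p; };
// Keep only candidates with surprise <= mu; always retain the top token.
// Assumes the list is sorted by probability in descending order.
void truncate_by_surprise(std::vector<Candidate> &cands, float mu) {
    if (cands.empty()) return;
    auto cut = std::find_if(cands.begin(), cands.end(),
        [mu](const Candidate &c) { return -std::log2(c.p) > mu; });
    if (cut == cands.begin()) ++cut;
    cands.erase(cut, cands.end());
}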
@@ -191,11 +191,11 @@ void sample_top_a(llama_token_data_array * candidates, float a, size_t min_keep)
// Compute the cumulative probabilities
float maxprob = candidates->data[0].p;
float threshold = a * maxprob * maxprob; //tokens with probs less than this are removed
size_t last_idx = candidates->size;
for (size_t i = 0; i < candidates->size; ++i) {
// Go until we reach a value under the threshold
float checkprob = candidates->data[i].p;
if (checkprob < threshold && i >= min_keep) {
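Note: sample_top_a derives its cutoff from the square of the best probability, so a peaked distribution prunes hard while a flat one keeps most tokens. The same filter as a standalone sketch:
#include <cstddef>
#include <vector>
struct Candidate { int id; float p; };
// Top-a: drop tokens whose probability falls below a * maxprob^2,
// but never drop below min_keep entries. List is sorted by p descending.
void top_a_filter(std::vector<Candidate> &cands, float a, size_t min_keep) {
    if (cands.empty() || a <= 0.0f) return;
    float threshold = a * cands[0].p * cands[0].p;
    size_t last_idx = cands.size();
    for (size_t i = 0; i < cands.size(); ++i) {
        if (cands[i].p < threshold && i >= min_keep) { last_idx = i; break; }
    }
    cands.resize(last_idx);
}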
@@ -223,11 +223,11 @@ int mirostat, float mirostat_tau, float mirostat_eta)
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// Apply penalties
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), rep_pen_range), n_ctx);
llama_sample_repetition_penalty(nullptr, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, rep_pen);
// llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
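Note: the penalty window above is the smallest of the history length, rep_pen_range and n_ctx; every token inside it gets its logit damped. A rough sketch of that window logic, assuming the common divide-positive/multiply-negative convention (the library call may differ in detail):
#include <algorithm>
#include <unordered_set>
#include <vector>
// Penalize logits of tokens seen in the recent window.
void apply_rep_penalty(std::vector<float> &logits,
                       const std::vector<int> &last_n_tokens,
                       int rep_pen_range, int n_ctx, float rep_pen) {
    int window = std::min(std::min((int)last_n_tokens.size(), rep_pen_range), n_ctx);
    if (window <= 0) return;
    std::unordered_set<int> recent(last_n_tokens.end() - window, last_n_tokens.end());
    for (int tok : recent) {
        if (tok < 0 || (size_t)tok >= logits.size()) continue;
        float &l = logits[tok];
        l = (l > 0.0f) ? l / rep_pen : l * rep_pen;
    }
}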
@@ -300,15 +300,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
params.memory_f16 = inputs.f16_kv;
params.n_ctx = inputs.max_context_length;
neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx
= neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
printf("System Info: %s\n", llama_print_system_info());
SetQuantsUnshuffled(false);
if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
{
//newer format has bit unshuffling
SetQuantsUnshuffled(file_format == FileFormat::GGJT_2);
llama_ctx_params_v2 = llama_v2_context_default_params();
llama_ctx_params_v2.n_ctx = inputs.max_context_length;
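Note: the SetQuantsUnshuffled calls above flip a global flag based on which legacy file-format revision was detected, since only the newest revisions store their quant blocks unshuffled. A small illustrative sketch of that dispatch (FileVersion is a made-up enum, not the project's FileFormat):
// Illustrative only: select the quant layout from the detected format revision.
enum class FileVersion { GGML, GGHF, GGJT, GGJT_2 };
static bool g_quants_unshuffled = false;
void configure_quants(FileVersion v) {
    // only the newest legacy revision uses the unshuffled layout
    g_quants_unshuffled = (v == FileVersion::GGJT_2);
}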
@@ -319,21 +319,21 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
llama_ctx_params_v2.use_mmap = inputs.use_mmap;
llama_ctx_params_v2.use_mlock = inputs.use_mlock;
llama_ctx_params_v2.n_gpu_layers = inputs.gpulayers;
llama_ctx_v2 = llama_v2_init_from_file(modelname.c_str(), llama_ctx_params_v2);
if (llama_ctx_v2 == NULL)
{
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
return ModelLoadResult::FAIL;
}
printf("\n---\nWarning: Your model may be an OUTDATED format (ver %d). Please reconvert it for better results!\n---\n", file_format);
if (lora_filename != "")
{
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
int err = llama_v2_apply_lora_from_file(llama_ctx_v2,
lora_filename.c_str(),
NULL,
@@ -361,9 +361,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
llama_ctx_params.use_mmap = inputs.use_mmap;
llama_ctx_params.use_mlock = inputs.use_mlock;
llama_ctx_params.n_gpu_layers = inputs.gpulayers;
llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
if (llama_ctx_v3 == NULL)
{
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
@@ -372,7 +372,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
if (lora_filename != "")
{
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
int err = llama_apply_lora_from_file(llama_ctx_v3,
lora_filename.c_str(),
NULL,
@@ -479,8 +479,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
return res;
}
// determine the required inference memory per token:
legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
return ModelLoadResult::SUCCESS;
}
else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3 || file_format==FileFormat::GPT2_4)
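Note: each loader branch ends with the same pattern: push a short dummy batch through eval once so mem_per_token is measured before real prompts arrive. A generic sketch of that warm-up, with eval_fn standing in for the format-specific eval calls:
#include <cstddef>
#include <functional>
#include <vector>
// Warm-up eval: run a few dummy tokens so the backend reports its
// per-token scratch requirement before real generation starts.
size_t warmup_mem_per_token(
    const std::function<void(const std::vector<int>&, std::vector<float>&, size_t&)> &eval_fn) {
    std::vector<float> logits;
    size_t mem_per_token = 0;
    eval_fn({0, 1, 2, 3}, logits, mem_per_token);
    return mem_per_token;
}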
@@ -492,34 +492,34 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return res;
}
else if(res==ModelLoadResult::RETRY_LOAD)
{
printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
return res;
}
// determine the required inference memory per token:
gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
return ModelLoadResult::SUCCESS;
}
else
{
//newer format has bit unshuffling
SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
ModelLoadResult res = gpt2_v2_model_load(params.model, gpt2_ctx_v2, vocab, file_format, inputs.gpulayers);
if(res==ModelLoadResult::FAIL)
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return res;
}
else if(res==ModelLoadResult::RETRY_LOAD)
{
printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
return res;
}
// determine the required inference memory per token:
gpt2_v2_eval(gpt2_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
return ModelLoadResult::SUCCESS;
}
}
@@ -536,9 +536,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
return res;
}
// determine the required inference memory per token:
legacy_gptj_eval(gptj_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
//if the logits are NAN or duplicated, it means the model is incompatible
if(logits.size()>0 && IsNanCheck(logits[0]))
{
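Note: a NaN in the very first logit after the warm-up eval is taken as a sign the file was parsed with the wrong loader, which triggers the retry path. IsNanCheck itself is defined elsewhere in this file; a trivial equivalent for illustration:
#include <cmath>
#include <vector>
// True when the warm-up produced an unusable first logit.
bool first_logit_is_nan(const std::vector<float> &logits) {
    return !logits.empty() && std::isnan(logits[0]);
}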
@@ -565,16 +565,16 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
return loadresult;
}
// determine the required inference memory per token:
gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
//if the logits are NAN or duplicated, it means the model is incompatible
std::vector<float> oldlogits(logits);
//this is another hack because they change the library - we run the eval through the model
//twice and compare logits. if they give the same logits for different inputs, model is broken
gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
{
printf("\nBad Logits detected! Retrying GPT-J model loading...");
@@ -587,7 +587,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
else
{
//newer format has bit unshuffling
SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
ModelLoadResult loadresult = gptj_v2_model_load(params.model, gptj_ctx_v2, vocab, inputs.gpulayers);
if (loadresult == ModelLoadResult::FAIL)
@@ -601,16 +601,16 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
return loadresult;
}
// determine the required inference memory per token:
gptj_v2_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
//if the logits are NAN or duplicated, it means the model is incompatible
std::vector<float> oldlogits(logits);
//this is another hack because they change the library - we run the eval through the model
//twice and compare logits. if they give the same logits for different inputs, model is broken
gptj_v2_eval(gptj_ctx_v2, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
{
printf("\nBad Logits detected! Retrying GPT-J model loading...");
@@ -624,8 +624,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5|| file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
{
if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
{
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
if(res==ModelLoadResult::FAIL)
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -637,7 +637,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
return res;
}
// determine the required inference memory per token:
gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
return ModelLoadResult::SUCCESS;
@@ -645,9 +645,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
else
{
//newer format has bit unshuffling
SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
ModelLoadResult res = gpt_neox_v2_model_load(params.model, neox_ctx_v2, vocab, file_format);
if(res==ModelLoadResult::FAIL)
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -659,7 +659,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
return res;
}
// determine the required inference memory per token:
gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
@@ -669,7 +669,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
auto orig_par_res = neox_ctx_v2.hparams.par_res;
neox_ctx_v2.hparams.par_res = 0; //test with residual false
gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, 0, test_embd, logits, mem_per_token);
neox_ctx_v2.hparams.par_res = orig_par_res;
int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
std::string predicted = vocab.id_to_token[topid].c_str();
auto findresult = predicted.find("8");
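Note: the par_res check above re-runs the probe with parallel residual disabled and looks at whether the arg-max token decodes to the expected "8"; if it does, the non-parallel wiring is kept. The core of that check, with id_to_token standing in for the real vocab map:
#include <algorithm>
#include <map>
#include <string>
#include <vector>
// True when the highest-scoring logit decodes to a token containing "8".
bool probe_predicts_eight(const std::vector<float> &logits,
                          const std::map<int, std::string> &id_to_token) {
    if (logits.empty()) return false;
    int topid = (int)(std::max_element(logits.begin(), logits.end()) - logits.begin());
    auto it = id_to_token.find(topid);
    return it != id_to_token.end() && it->second.find("8") != std::string::npos;
}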
@@ -683,7 +683,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
return ModelLoadResult::SUCCESS;
}
}
else if(file_format==FileFormat::MPT_1)
{
@@ -692,10 +692,10 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return ModelLoadResult::FAIL;
}
// determine the required inference memory per token:
mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token);
return ModelLoadResult::SUCCESS;
}
else
@@ -703,7 +703,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
printf("\nUnknown Model, cannot load.\n");
return ModelLoadResult::FAIL;
}
}
@@ -802,10 +802,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
//if using BLAS and prompt is big enough, switch to single thread and use a huge batch
bool approved_format = !(file_format == FileFormat::BADFORMAT ||
file_format == FileFormat::GPT2_1 ||
file_format == FileFormat::GPTJ_1 ||
file_format == FileFormat::GPTJ_2 ||
file_format == FileFormat::RWKV_1 ||
file_format==FileFormat::RWKV_2);
bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && blasbatchsize!=-1);
// bool blasmode = false;
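Note: the blasmode decision above only kicks in for formats known to work with large batches, and only when the prompt is at least 32 tokens, BLAS is compiled in, and the batch size was not disabled with -1. The same predicate, isolated (has_blas stands in for ggml_cpu_has_blas()):
#include <cstddef>
// Single-threaded big-batch prompt processing is only worth it when all of
// these hold; otherwise the normal multi-threaded path is used.
bool use_blas_mode(bool approved_format, size_t prompt_tokens,
                   bool has_blas, int blasbatchsize) {
    return approved_format && prompt_tokens >= 32 && has_blas && blasbatchsize != -1;
}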
@@ -856,7 +856,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
{
n_vocab = gptj_ctx_v1.hparams.n_vocab;
}
else if(file_format == FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
{
n_vocab = gptj_ctx_v2.hparams.n_vocab;
@@ -963,7 +963,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
if (!startedsampling)
{
printf("\rProcessing Prompt%s (%d / %d tokens)", (blasmode ? " [BLAS]" : ""), input_consumed, embd_inp.size());
}
fflush(stdout);
if (embdsize > 0)
@@ -1081,7 +1081,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
}
eosID = llama_token_eos();
if (!unbanTokens)
{
// set the logit of the eos token (2) to zero to avoid sampling it
@@ -1112,7 +1112,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
}
}
// set the logit of the eos token (0) to minimum to avoid sampling it
if (file_format == FileFormat::RWKV_1 ||
file_format == FileFormat::RWKV_2 ||
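Note: in the EOS handling shown here, when token unbanning is off the EOS logit is clamped so it can never win the sampling step: it is pulled down to the current best logit if that is negative, or to zero otherwise. A standalone sketch of that clamp:
#include <algorithm>
#include <cstddef>
#include <vector>
// Soft-ban EOS: make its logit no better than min(0, best logit).
void ban_eos(std::vector<float> &logits, size_t eosID) {
    if (logits.empty() || eosID >= logits.size()) return;
    size_t topid = std::max_element(logits.begin(), logits.end()) - logits.begin();
    logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
}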
@@ -1130,13 +1130,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
logits[eosID] = (logits[topid] < 0 ? logits[topid] : 0);
}
}
}
id = SampleLogits(logitsPtr, nctx, n_vocab, last_n_size, repeat_penalty,
top_k, top_a, top_p, typical_p, tfs_z, temp, rng,
params.mirostat,params.mirostat_tau,params.mirostat_eta);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
current_context_tokens.push_back(id);
@@ -1151,11 +1151,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
{
concat_output += FileFormatTokenizeID(id,file_format);
}
if (startedsampling)
{
printf("\rGenerating (%d / %d tokens)", (params.n_predict - remaining_tokens), params.n_predict);
}
if(debugmode && top_picks.size()>0)
{
printf(" [");
@@ -1163,11 +1163,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
for (auto & pick : top_picks)
{
if (!firstloop)
{
printf(" ");
}
firstloop = false;
std::string tokenizedstr = FileFormatTokenizeID(pick.id, file_format);
::utreplace(tokenizedstr, "\n", "\\n");
printf("(%s %.2f%%)", tokenizedstr.c_str(), pick.p*100);
}
@@ -1178,7 +1178,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
{
printf("\n(EOS token triggered!)");
remaining_tokens = 0;
}
for (const auto &matched : stop_sequence)
{
@@ -1199,7 +1199,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
{
embd.push_back(embd_inp[input_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[input_consumed]);
current_context_tokens.push_back(embd_inp[input_consumed]);
++input_consumed;
if ((int)embd.size() >= params.n_batch)