mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00

refactor some old code with batching

This commit is contained in:
parent 38863a34d2
commit 35c32fd0f2

2 changed files with 252 additions and 88 deletions
@@ -77,13 +77,8 @@ static llama_context * llama_ctx_v4;
 static gpt_params * kcpp_params = nullptr;
 static int max_context_limit_at_load = 0;
 static int n_past = 0;
-static int n_threads = 4;
-static int n_blasthreads = 4;
-static int n_batch = 8;
 static bool useSmartContext = false;
 static bool useContextShift = false;
-static int blasbatchsize = 512;
-static int smallbatchsize = 16;
 static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
 static std::string modelname;
 static std::vector<gpt_vocab::id> last_n_tokens;
@@ -686,26 +681,38 @@ void PurgeMissingTokens(llama_context * ctx, std::vector<int> &current_context_t

 }

+static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
+{
+    if(desiredBlasBatchSize<=0)
+    {
+        desiredBlasBatchSize = 16;
+    }
+    if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
+    {
+        desiredBlasBatchSize = (desiredBlasBatchSize > 256 ? 256 : desiredBlasBatchSize);
+    }
+    if (file_format == FileFormat::RWKV_1 || file_format==FileFormat::RWKV_2)
+    {
+        desiredBlasBatchSize = 1;
+    }
+    return desiredBlasBatchSize;
+}
+
 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta file_format_meta)
 {
     ggml_time_init();
     kcpp_params = new gpt_params(); //allocate on heap to avoid linux segfault. yes this leaks memory.

     file_format = in_file_format;
-    n_threads = kcpp_params->n_threads = inputs.threads;
-    n_blasthreads = kcpp_params->n_threads_batch = inputs.blasthreads;
+    kcpp_params->n_threads = inputs.threads;
+    kcpp_params->n_threads_batch = inputs.blasthreads;
     bool isGguf = (file_format == FileFormat::GGUF_GENERIC);
-    n_batch = kcpp_params->n_batch = smallbatchsize;
+    kcpp_params->n_batch = GetBatchSize(inputs.blasbatchsize, in_file_format);
     modelname = kcpp_params->model = inputs.model_filename;
     useSmartContext = inputs.use_smartcontext;
     useContextShift = inputs.use_contextshift;
     debugmode = inputs.debugmode;
-    blasbatchsize = inputs.blasbatchsize;
-    if(blasbatchsize<=0)
-    {
-        blasbatchsize = smallbatchsize;
-    }

     auto clamped_max_context_length = inputs.max_context_length;

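Note: the new GetBatchSize helper centralizes the batch-size policy that the generate path previously applied inline: default non-positive sizes to a small batch, cap non-llama formats at 256, and force RWKV to single-token batches. A minimal standalone sketch of the same policy, assuming a simplified stand-in enum (FmtKind) since the real FileFormat and surrounding globals live elsewhere in the repo:

// batch_sketch.cpp -- illustrative only; FmtKind is a hypothetical stand-in for FileFormat
#include <cstdio>

enum class FmtKind { LLAMA_FAMILY, RWKV, OTHER }; // hypothetical simplification

static int get_batch_size(int desired, FmtKind fmt)
{
    if (desired <= 0) { desired = 16; }        // default small batch
    if (fmt == FmtKind::OTHER) {               // non-llama formats: cap at 256
        desired = (desired > 256 ? 256 : desired);
    }
    if (fmt == FmtKind::RWKV) { desired = 1; } // RWKV evaluates token by token
    return desired;
}

int main()
{
    std::printf("%d %d %d\n",
        get_batch_size(512, FmtKind::LLAMA_FAMILY), // 512: llama formats keep the request
        get_batch_size(512, FmtKind::OTHER),        // 256: capped
        get_batch_size(0, FmtKind::RWKV));          // 1: RWKV override wins
    return 0;
}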
@@ -796,7 +803,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     SetQuantsUnshuffled(file_format == FileFormat::GGJT_2);
     llama_v2_context_params llama_ctx_params_v2 = llama_v2_context_default_params();
     llama_ctx_params_v2.n_ctx = clamped_max_context_length;
-    //llama_ctx_params.n_parts = -1;
     llama_ctx_params_v2.seed = -1;
     llama_ctx_params_v2.f16_kv = true;
     llama_ctx_params_v2.logits_all = false;
@@ -827,7 +833,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     int err = llama_v2_apply_lora_from_file(llama_ctx_v2,
                                             lora_filename.c_str(),
                                             lora_base_arg,
-                                            n_threads);
+                                            kcpp_params->n_threads);
     if (err != 0)
     {
         fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
@@ -846,7 +852,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 {
     llama_v3_context_params llama_ctx_params = llama_v3_context_default_params();
     llama_ctx_params.n_ctx = clamped_max_context_length;
-    //llama_ctx_paran_parts = -1;
     llama_ctx_params.seed = -1;
     llama_ctx_params.f16_kv = true;
     llama_ctx_params.low_vram = inputs.low_vram;
@@ -858,7 +863,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
     llama_ctx_params.rope_freq_base = rope_freq_base;
     llama_ctx_params.rope_freq_scale = rope_freq_scale;
-    llama_ctx_params.n_batch = blasbatchsize;
+    llama_ctx_params.n_batch = kcpp_params->n_batch;

     #if defined(GGML_USE_CUBLAS)
     bool ts_all_zero = true;
@@ -894,7 +899,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     int err = llama_v3_apply_lora_from_file(llama_ctx_v3,
                                             lora_filename.c_str(),
                                             lora_base_arg,
-                                            n_threads);
+                                            kcpp_params->n_threads);
     if (err != 0)
     {
         fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
@@ -915,6 +920,8 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 }
 else if(file_format==FileFormat::GGUF_GENERIC)
 {
+    llama_backend_init(false);
+
     llama_model_params model_params = llama_model_default_params();
     llama_context_params llama_ctx_params = llama_context_default_params();
     llama_ctx_params.n_ctx = clamped_max_context_length;
@@ -955,9 +962,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     model_params.main_gpu = cu_parseinfo_maindevice;
     model_params.split_mode = llama_split_mode::LLAMA_SPLIT_ROW;

-    llama_ctx_params.n_batch = blasbatchsize;
-    llama_ctx_params.n_threads = n_threads;
-    llama_ctx_params.n_threads_batch = n_blasthreads;
+    llama_ctx_params.n_batch = kcpp_params->n_batch;
+    llama_ctx_params.n_threads = kcpp_params->n_threads;
+    llama_ctx_params.n_threads_batch = kcpp_params->n_threads_batch;

     #if defined(GGML_USE_CUBLAS)
     bool ts_all_zero = true;
@@ -994,20 +1001,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llamamodel->hparams.rope_freq_scale_train!=1.0f ||
         llamamodel->hparams.rope_scaling_type_train==2)
     {
-        // float ropemultiplier = 1.0f;
-        // if(llamamodel->hparams.rope_scaling_type_train!=2 &&
-        // llamamodel->hparams.n_ctx_train > 2048 && clamped_max_context_length > llamamodel->hparams.n_ctx_train &&
-        // llamamodel->hparams.rope_freq_scale_train==1.0f)
-        // {
-        //     ropemultiplier = (float)llamamodel->hparams.n_ctx_train / (float)clamped_max_context_length;
-        //     llama_ctx_params.rope_freq_base = rope_freq_base = llamamodel->hparams.rope_freq_base_train;
-        //     llama_ctx_params.rope_freq_scale = rope_freq_scale = ropemultiplier * llamamodel->hparams.rope_freq_scale_train;
-        //     printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
-        // }
-        // else
-        // {
         printf("Automatic RoPE Scaling: Using model internal value.\n");
-        // }
     }
     else
     {
@@ -1038,7 +1032,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
                                          lora_filename.c_str(),
                                          1.0f,
                                          lora_base_arg,
-                                         n_threads);
+                                         kcpp_params->n_threads);
     if (err != 0)
     {
         fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
@@ -1064,11 +1058,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     bool useWorldTokenizer = false;
     if (file_format == FileFormat::RWKV_1)
     {
-        rwkv_ctx_v2 = rwkv_v2_init_from_file(modelname.c_str(), n_threads);
+        rwkv_ctx_v2 = rwkv_v2_init_from_file(modelname.c_str(), kcpp_params->n_threads);
     }
     else //rwkv_2
     {
-        rwkv_ctx_v3 = rwkv_init_from_file(modelname.c_str(), n_threads);
+        rwkv_ctx_v3 = rwkv_init_from_file(modelname.c_str(), kcpp_params->n_threads);

         if(inputs.gpulayers>0)
         {
@@ -1110,7 +1104,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in

     if (file_format == FileFormat::RWKV_1)
     {
-        n_batch = 1;

         //setup buffers for rwkv state
         auto padding = 512u;
@@ -1138,8 +1131,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
-        n_batch = 1; //do not use sequence mode to speedup until it is fixed
-
         //setup buffers for rwkv state
         auto padding = 512u;
         auto statebufsiz = rwkv_get_state_buffer_element_count(rwkv_ctx_v3) * sizeof(float) + padding;
@@ -1472,6 +1463,22 @@ const std::string & gpttype_get_pending_output()
     return concat_output_reader_copy;
 }

+int GetThreadsToUse(bool blasmode)
+{
+    if (blasmode)
+    {
+        if(!ggml_cpu_has_gpublas())
+        {
+            return 1;
+        }
+        else
+        {
+            return kcpp_params->n_threads_batch;
+        }
+    }
+    return kcpp_params->n_threads;
+}
+
 generation_outputs gpttype_generate(const generation_inputs inputs, generation_outputs &output)
 {
     if(kcpp_params==nullptr)
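Note: GetThreadsToUse replaces the old pattern of saving, overwriting, and later restoring kcpp_params->n_threads around BLAS-batched prompt processing; every eval call now picks its thread count at the call site. A minimal standalone sketch of the selection logic, assuming a hypothetical Config struct and a stubbed GPU check in place of ggml_cpu_has_gpublas():

// threads_sketch.cpp -- illustrative only; Config and has_gpu_blas() are stand-ins
#include <cstdio>

struct Config { int n_threads; int n_threads_batch; }; // stand-in for gpt_params

static bool has_gpu_blas() { return false; } // stub for ggml_cpu_has_gpublas()

static int threads_to_use(const Config &cfg, bool blasmode)
{
    if (blasmode) {
        // CPU BLAS does its own internal threading, so extra caller threads only
        // fight it; with GPU-accelerated BLAS the batch thread count applies.
        return has_gpu_blas() ? cfg.n_threads_batch : 1;
    }
    return cfg.n_threads; // normal single-token generation
}

int main()
{
    Config cfg{8, 12};
    std::printf("gen: %d, blas prompt: %d\n",
                threads_to_use(cfg, false),  // 8
                threads_to_use(cfg, true));  // 1 (CPU BLAS stub)
    return 0;
}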
@@ -1482,6 +1489,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         generation_finished = true;
         return output;
     }
+
+    if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
+    {
+        llama_reset_timings(llama_ctx_v4);
+    }
+
     concat_output_mtx.lock();
     concat_output = "";
     concat_output_reader_copy = "";
@@ -1528,9 +1541,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     kcpp_params->dynatemp_range = inputs.dynatemp_range;
     kcpp_params->dynatemp_exponent = inputs.dynatemp_exponent;
     kcpp_params->n_ctx = inputs.max_context_length;
-    kcpp_params->n_batch = n_batch;
-    kcpp_params->n_threads = n_threads;
-    kcpp_params->n_threads_batch = n_blasthreads;
     kcpp_params->smoothing_factor = inputs.smoothing_factor;

     bool stream_sse = inputs.stream_sse;
@@ -1674,33 +1684,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         file_format == FileFormat::GPTJ_2 ||
         file_format == FileFormat::RWKV_1 ||
         file_format==FileFormat::RWKV_2);
-    bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && blasbatchsize>=32);
+    bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && kcpp_params->n_batch>=32);
-    // bool blasmode = false;
-    int original_batch = kcpp_params->n_batch;
-    int original_threads = kcpp_params->n_threads;
-    if (blasmode)
-    {
-        //for non llama, limit to 256
-        int bbs = blasbatchsize;
-        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
-        {
-            bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
-        }
-
-        kcpp_params->n_batch = bbs; //received reports of 1024 and above crashing on some models
-        if(!ggml_cpu_has_gpublas())
-        {
-            //does not limit here for gguf anymore. this is kept for older models.
-            //new models will override threads inside decode fn.
-            kcpp_params->n_threads = 1;
-            kcpp_params->n_threads_batch = 1;
-        }
-        else
-        {
-            kcpp_params->n_threads = n_blasthreads;
-            kcpp_params->n_threads_batch = n_blasthreads;
-        }
-    }

     current_context_tokens.resize(n_past);

@@ -1828,11 +1812,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o

             if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
             {
-                evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, kcpp_params->n_threads)==0);
+                evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, GetThreadsToUse(blasmode))==0);
             }
             else if(file_format == FileFormat::GGJT_3)
             {
-                evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, kcpp_params->n_threads)==0);
+                evalres = (llama_v3_eval(llama_ctx_v3, embd.data(), embdsize, n_past, GetThreadsToUse(blasmode))==0);
             }
             else if(file_format == FileFormat::GGUF_GENERIC)
             {
@@ -1850,12 +1834,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             {
                 if(embd.size()>1)
                 {
-                    evalres = rwkv_eval_sequence(rwkv_ctx_v3, kcpp_params->n_threads, (uint32_t*)embd.data(), embd.size(), rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out);
+                    evalres = rwkv_eval_sequence(rwkv_ctx_v3, GetThreadsToUse(blasmode), (uint32_t*)embd.data(), embd.size(), rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out);
                 }
                 else
                 {
                     bool ignoreLogits = (!startedsampling && ((int)embd_inp.size() > input_consumed + 2));
-                    evalres = rwkv_eval(rwkv_ctx_v3, kcpp_params->n_threads, embd[0], rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, ignoreLogits?nullptr:rwkv_ctx_v3->logits_out);
+                    evalres = rwkv_eval(rwkv_ctx_v3, GetThreadsToUse(blasmode), embd[0], rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, ignoreLogits?nullptr:rwkv_ctx_v3->logits_out);
                 }

                 memcpy(logits.data(), rwkv_ctx_v3->logits_out, sizeof(float) * rwkv_vocab.size());
@@ -1864,39 +1848,39 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             }
             else if(file_format==FileFormat::GPT2_1)
             {
-                evalres = legacy_gpt2_eval(gpt2_ctx_v1, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, file_format);
+                evalres = legacy_gpt2_eval(gpt2_ctx_v1, GetThreadsToUse(blasmode), n_past, embd, logits, mem_per_token, file_format);
             }
             else if(file_format==FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
             {
-                evalres = gpt2_v2_eval(gpt2_ctx_v2, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, file_format);
+                evalres = gpt2_v2_eval(gpt2_ctx_v2, GetThreadsToUse(blasmode), n_past, embd, logits, mem_per_token, file_format);
             }
             else if(file_format==FileFormat::GPT2_4)
             {
-                evalres = gpt2_eval(gpt2_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, v3_use_scratch);
+                evalres = gpt2_eval(gpt2_ctx_v3, GetThreadsToUse(blasmode), n_past, embd, logits, mem_per_token, v3_use_scratch);
             }
             else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
             {
-                evalres = gpt_neox_v2_eval(neox_ctx_v2, kcpp_params->n_threads, n_past, embd, logits, mem_per_token);
+                evalres = gpt_neox_v2_eval(neox_ctx_v2, GetThreadsToUse(blasmode), n_past, embd, logits, mem_per_token);
             }
             else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
             {
-                evalres = gpt_neox_eval(neox_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, v3_use_scratch);
+                evalres = gpt_neox_eval(neox_ctx_v3, GetThreadsToUse(blasmode), n_past, embd, logits, mem_per_token, v3_use_scratch);
             }
             else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
             {
-                evalres = legacy_gptj_eval(gptj_ctx_v1, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, file_format);
+                evalres = legacy_gptj_eval(gptj_ctx_v1, GetThreadsToUse(blasmode), n_past, embd, logits, mem_per_token, file_format);
             }
             else if(file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
             {
-                evalres = gptj_v2_eval(gptj_ctx_v2, kcpp_params->n_threads, n_past, embd, logits, mem_per_token);
+                evalres = gptj_v2_eval(gptj_ctx_v2, GetThreadsToUse(blasmode), n_past, embd, logits, mem_per_token);
             }
             else if(file_format==FileFormat::GPTJ_5)
             {
-                evalres = gptj_eval(gptj_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, mem_per_token, v3_use_scratch);
+                evalres = gptj_eval(gptj_ctx_v3, GetThreadsToUse(blasmode), n_past, embd, logits, mem_per_token, v3_use_scratch);
             }
             else if(file_format==FileFormat::MPT_1)
             {
-                evalres = mpt_eval(mpt_ctx_v3, kcpp_params->n_threads, n_past, embd, logits, false, mem_per_token, v3_use_scratch);
+                evalres = mpt_eval(mpt_ctx_v3, GetThreadsToUse(blasmode), n_past, embd, logits, false, mem_per_token, v3_use_scratch);
             }
             else
             {
@@ -1934,8 +1918,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             if (!startedsampling)
             {
                 startedsampling = true;
-                kcpp_params->n_batch = original_batch;
-                kcpp_params->n_threads = original_threads;
                 time1 = timer_check();
                 timer_start();
                 if(allow_regular_prints)
@@ -2081,6 +2063,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             }
         }
     }
+
+    if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
+    {
+        llama_print_timings(llama_ctx_v4);
+    }
+
     time2 = timer_check();
     float pt1 = (time1*1000.0/(embd_inp.size()==0?1:embd_inp.size()));
     float ts1 = (1000.0/pt1);
otherarch/tools/unused/mainfn.txt: 176 additions (new file)

@@ -0,0 +1,176 @@
+int mainfn() {
+
+    kcpp_params = new gpt_params();
+    int argc = 11;
+    char* argv[11] = {
+        "E:\\LLaMA\\llamacpp\\main.exe",
+        "-ngl",
+        "99",
+        "-n",
+        "32",
+        "-m",
+        "E:\\LLaMA\\models\\airoboros-mistral2.2-7b.Q4_K_S.gguf",
+        "-c",
+        "2128",
+        "-p",
+        "Niko the kobold stalked carefully down the alley,"
+    };
+
+    if (!gpt_params_parse(argc, argv, *kcpp_params)) {
+        return 1;
+    }
+    llama_sampling_params & sparams = kcpp_params->sparams;
+
+
+    if (kcpp_params->seed == LLAMA_DEFAULT_SEED) {
+        kcpp_params->seed = time(NULL);
+    }
+
+    LOG_TEE("%s: seed = %u\n", __func__, kcpp_params->seed);
+
+    std::mt19937 rng(kcpp_params->seed);
+
+    LOG("%s: llama backend init\n", __func__);
+    llama_backend_init(kcpp_params->numa);
+
+    llama_model * model;
+
+    // load the model and apply lora adapter, if any
+    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
+    std::tie(model, llama_ctx_v4) = llama_init_from_gpt_params(*kcpp_params);
+    llama_reset_timings(llama_ctx_v4);
+
+    if (model == NULL) {
+        LOG_TEE("%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+
+    const int n_ctx = llama_n_ctx(llama_ctx_v4);
+    const bool add_bos = true;
+    std::vector<llama_token> embd_inp;
+
+    embd_inp = ::llama_tokenize(llama_ctx_v4, kcpp_params->prompt, add_bos, true);
+
+    // Should not run without any tokens
+    if (embd_inp.empty()) {
+        embd_inp.push_back(llama_token_bos(model));
+    }
+
+    // number of tokens to keep when resetting context
+    if (kcpp_params->n_keep < 0 || kcpp_params->n_keep > (int) embd_inp.size() || kcpp_params->instruct || kcpp_params->chatml) {
+        kcpp_params->n_keep = (int)embd_inp.size();
+    }
+
+    int n_past = 0;
+    int n_remain = kcpp_params->n_predict;
+    bool startedpred = false;
+    int predamt = 0;
+    int n_consumed = 0;
+
+    std::vector<int> input_tokens;
+    std::vector<int> output_tokens;
+    std::ostringstream output_ss;
+
+
+    std::vector<llama_token> embd;
+    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+
+    while (n_remain != 0) {
+        // predict
+        if (!embd.empty()) {
+            int max_embd_size = n_ctx - 4;
+
+            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+            if ((int) embd.size() > max_embd_size) {
+                const int skipped_tokens = (int) embd.size() - max_embd_size;
+                embd.resize(max_embd_size);
+            }
+
+            {
+                if (n_past + (int) embd.size() > n_ctx) {
+                    if (kcpp_params->n_predict == -2) {
+                        break;
+                    }
+                    const int n_left    = n_past - kcpp_params->n_keep - 1;
+                    const int n_discard = n_left/2;
+                    llama_kv_cache_seq_rm   (llama_ctx_v4, 0, kcpp_params->n_keep + 1            , kcpp_params->n_keep + n_discard + 1);
+                    llama_kv_cache_seq_shift(llama_ctx_v4, 0, kcpp_params->n_keep + 1 + n_discard, n_past, -n_discard);
+
+                    n_past -= n_discard;
+                }
+            }
+
+
+            for (int i = 0; i < (int) embd.size(); i += kcpp_params->n_batch) {
+                int n_eval = (int) embd.size() - i;
+                if (n_eval > kcpp_params->n_batch) {
+                    n_eval = kcpp_params->n_batch;
+                }
+
+                if (llama_decode(llama_ctx_v4, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+                    LOG_TEE("%s : failed to eval\n", __func__);
+                    return 1;
+                }
+                n_past += n_eval;
+            }
+        }
+
+        embd.clear();
+
+        if ((int) embd_inp.size() <= n_consumed) {
+            const llama_token id = llama_sampling_sample(ctx_sampling, llama_ctx_v4, nullptr);
+            llama_sampling_accept(ctx_sampling, llama_ctx_v4, id, true);
+            embd.push_back(id);
+            // decrement remaining sampling budget
+            --n_remain;
+            if(!startedpred)
+            {
+                startedpred = true;
+                timer_start();
+                predamt += 1;
+            }else
+            {
+                predamt += 1;
+            }
+        } else {
+            while ((int) embd_inp.size() > n_consumed) {
+                embd.push_back(embd_inp[n_consumed]);
+                llama_sampling_accept(ctx_sampling, llama_ctx_v4, embd_inp[n_consumed], false);
+                ++n_consumed;
+
+                if ((int) embd.size() >= kcpp_params->n_batch) {
+                    break;
+                }
+            }
+        }
+
+        // display text
+        {
+            for (auto id : embd) {
+                const std::string token_str = llama_token_to_piece(llama_ctx_v4, id);
+                printf("%s", token_str.c_str());
+                if (embd.size() > 1) {
+                    input_tokens.push_back(id);
+                } else {
+                    output_tokens.push_back(id);
+                    output_ss << token_str;
+                }
+            }
+            fflush(stdout);
+        }
+    }
+    auto tt = timer_check();
+    float pt1 = (tt*1000.0/(predamt));
+    float ts1 = (1000.0/pt1);
+    printf("\n\n Time:%.2fs (%.1fms/T = %.2fT/s) tokens: %d",tt,pt1,ts1,predamt);
+
+    llama_print_timings(llama_ctx_v4);
+
+    llama_free(llama_ctx_v4);
+    llama_free_model(model);
+
+    llama_sampling_free(ctx_sampling);
+    llama_backend_free();
+
+    return 0;
+}
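Note: the block in mainfn.txt that calls llama_kv_cache_seq_rm and llama_kv_cache_seq_shift is the usual llama.cpp recipe for generating past the context limit: keep the first n_keep tokens, discard half of what remains, and slide the surviving cache entries back. A self-contained sketch of just the arithmetic, with illustrative numbers and no llama.cpp dependency:

// kv_shift_sketch.cpp -- illustrative arithmetic only; all values are made up
#include <cstdio>

int main()
{
    int n_ctx = 2128, n_keep = 11, n_past = 2120, n_incoming = 16;

    if (n_past + n_incoming > n_ctx) {
        const int n_left    = n_past - n_keep - 1;   // tokens eligible for discard
        const int n_discard = n_left / 2;            // drop half of them
        // cells [n_keep+1, n_keep+n_discard+1) would be removed from the KV cache,
        // and [n_keep+1+n_discard, n_past) slide back by n_discard positions
        std::printf("discard %d tokens: remove [%d,%d), shift [%d,%d) by -%d\n",
                    n_discard,
                    n_keep + 1, n_keep + n_discard + 1,
                    n_keep + 1 + n_discard, n_past, n_discard);
        n_past -= n_discard;
    }
    std::printf("n_past is now %d, room for %d new tokens\n", n_past, n_ctx - n_past);
    return 0;
}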