Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-10 14:14:41 +00:00)

Merge branch 'dev' into feat/auto-exit

Commit: 7b0ededd24
10 changed files with 488 additions and 87 deletions
@@ -1582,6 +1582,12 @@ static bool tune_layer_allocation(
//

struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

#if !(defined(GGML_USE_METAL) || defined(GGML_USE_CUDA))
    // reset n_gpu_layers to 0 if GPU is not used
    params.n_gpu_layers = 0;
#endif

    llama_init_result iparams;
    auto mparams = llama_model_params_from_gpt_params(params);

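A minimal standalone sketch of the same compile-time guard pattern: when neither GGML_USE_METAL nor GGML_USE_CUDA is defined, any requested -ngl value is forced back to 0 so no layers are offloaded. The gpt_params_stub struct below is a stand-in for illustration, not the real gpt_params.

#include <cstdio>
#include <cstdint>

// stand-in for the real gpt_params; only the field used here
struct gpt_params_stub {
    int32_t n_gpu_layers = 99;  // e.g. the user passed -ngl 99
};

int main() {
    gpt_params_stub params;
#if !(defined(GGML_USE_METAL) || defined(GGML_USE_CUDA))
    // CPU-only build: ignore any requested GPU offload
    params.n_gpu_layers = 0;
#endif
    std::printf("effective n_gpu_layers = %d\n", params.n_gpu_layers);
    return 0;
}
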
@@ -1637,10 +1643,16 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

    if (n_world == 1) {
        uint32_t n_layers = llama_model_n_layers(model);
        // assign all layers to this device
        params.n_layer_window[0] = n_layers;
        cparams.n_layer_window[0] = n_layers;
        mparams.n_layer_window[0] = n_layers;
        llama_context_n_layer_window(lctx)[0] = n_layers;

#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
        params.n_gpu_layers = std::min((int32_t)n_layers, params.n_gpu_layers);
#endif

    } else {
        uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};

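In the n_world == 1 branch the layer window is the whole model: the single device owns every layer, and on a GPU build the offload count is clamped so it never exceeds the model's layer count. A small sketch under the assumption that n_layers would come from llama_model_n_layers(model) (hard-coded here):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layers = 32;        // would come from llama_model_n_layers(model)
    uint32_t n_layer_window[32] = {0};   // per-rank layer windows

    // single device: it owns every layer
    n_layer_window[0] = n_layers;

    // clamp a user-requested -ngl so it cannot exceed the model's layer count
    int32_t n_gpu_layers = 99;           // e.g. -ngl 99
    n_gpu_layers = std::min((int32_t) n_layers, n_gpu_layers);

    std::printf("window[0] = %u, n_gpu_layers = %d\n", n_layer_window[0], n_gpu_layers);
    return 0;
}
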
@@ -1649,12 +1661,20 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

        // broadcast startup args
        struct startup_args args;
-       if (my_rank==0){
+       if (my_rank == 0){
            args.should_profile = auto_schedule;
            args.n_ctx = params.n_ctx;
        }

        llama_bcast_startup_args(lctx, my_rank, &args);

        auto_schedule = args.should_profile;
        if (my_rank > 0) {
            // receive startup args
            auto_schedule = args.should_profile;
            params.n_ctx = args.n_ctx;
            cparams.n_ctx = args.n_ctx;
        }

        // if n_world > 1 and auto schedule is needed, then profile
        if (auto_schedule){
            // get device profile
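The broadcast step follows a head-node pattern: rank 0 fills in startup_args (whether to profile, and the context size), llama_bcast_startup_args distributes the struct, and every other rank adopts the received values. The sketch below mimics that flow with a hypothetical in-process bcast_startup_args helper rather than prima.cpp's real transport, and the startup_args definition only mirrors the fields visible in the hunk above.

#include <cstdint>
#include <cstdio>

// mirrors the fields used in the hunk above; not the real definition
struct startup_args {
    bool     should_profile = false;
    uint32_t n_ctx          = 0;
};

// hypothetical broadcast: rank 0's copy is taken as the source of truth
static void bcast_startup_args(uint32_t my_rank, startup_args & args, const startup_args & rank0_args) {
    if (my_rank > 0) {
        args = rank0_args;
    }
}

int main() {
    startup_args rank0_args;
    rank0_args.should_profile = true;   // rank 0 decides: auto schedule -> profile
    rank0_args.n_ctx          = 4096;   // rank 0's context size wins for everyone

    for (uint32_t my_rank = 0; my_rank < 3; my_rank++) {
        startup_args args = (my_rank == 0) ? rank0_args : startup_args{};
        bcast_startup_args(my_rank, args, rank0_args);

        bool     auto_schedule = args.should_profile;  // every rank follows rank 0
        uint32_t n_ctx         = args.n_ctx;
        std::printf("rank %u: auto_schedule=%d n_ctx=%u\n", my_rank, auto_schedule, n_ctx);
    }
    return 0;
}
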
@@ -1751,6 +1771,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
            cparams.n_gpu_layers = n_gpu_layers[my_rank];
            mparams.n_gpu_layers = n_gpu_layers[my_rank];
            llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]);
        } else { // -ngl is set
            params.n_gpu_layers = std::min(params.n_gpu_layers, (int32_t)n_layer_window[my_rank]);
            cparams.n_gpu_layers = params.n_gpu_layers;
            mparams.n_gpu_layers = params.n_gpu_layers;
            llama_model_set_n_gpu_layers(model, params.n_gpu_layers);
        }
    }

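Per rank, the GPU offload count comes from one of two places: the auto-scheduled n_gpu_layers[my_rank] when no -ngl was given, or the user's -ngl clamped to that rank's layer window so it never offloads more layers than the rank actually holds. A standalone sketch of that decision with made-up numbers (ngl_set and the arrays here are illustrative, not prima.cpp state):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t my_rank               = 1;
    const uint32_t n_layer_window[4]     = {16, 8, 4, 4};  // layers assigned to each rank
    const uint32_t n_gpu_layers_sched[4] = {16, 6, 0, 0};  // what the auto scheduler picked

    const bool ngl_set   = true;  // did the user pass -ngl?
    int32_t n_gpu_layers = 12;    // the user's -ngl value (ignored if ngl_set == false)

    if (!ngl_set) {
        // take the scheduler's answer for this rank
        n_gpu_layers = (int32_t) n_gpu_layers_sched[my_rank];
    } else {
        // -ngl is set: clamp it to this rank's layer window
        n_gpu_layers = std::min(n_gpu_layers, (int32_t) n_layer_window[my_rank]);
    }

    std::printf("rank %u offloads %d of %u layers\n", my_rank, n_gpu_layers, n_layer_window[my_rank]);
    return 0;
}
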
@@ -1820,7 +1845,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
    }

    if (params.warmup) {
-       LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+       LOG_WRN("%s: warming up the model with an empty run - please wait ...\n", __func__);

        const uint32_t my_rank = cparams.rank;
        std::vector<llama_token> tmp;
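The warmup block that follows runs one throwaway decode so the weights are paged in before real requests arrive; cparams.rank is captured because in a multi-rank setup each device only runs its own slice of the pipeline. A rough, self-contained sketch of the gating logic, with a hypothetical run_warmup_decode standing in for the real decode call:

#include <cstdint>
#include <cstdio>
#include <vector>

using llama_token = int32_t;  // same underlying type as in llama.cpp

// hypothetical stand-in for the real warmup decode over a tiny token batch
static void run_warmup_decode(uint32_t my_rank, const std::vector<llama_token> & tmp) {
    std::printf("rank %u: warmup decode over %zu token(s)\n", my_rank, tmp.size());
}

int main() {
    const bool     warmup  = true;  // params.warmup
    const uint32_t my_rank = 0;     // cparams.rank

    if (warmup) {
        std::printf("warming up the model with an empty run - please wait ...\n");
        std::vector<llama_token> tmp = {1};  // e.g. just a BOS-like token
        run_warmup_decode(my_rank, tmp);
    }
    return 0;
}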