Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-08 17:19:04 +00:00
fix manually set layer window and gpu layers
This commit is contained in:
parent 70bd9db008
commit f9d16fbf71
1 changed file with 40 additions and 40 deletions
@@ -1408,23 +1408,26 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        }
    }

    device_info dev_info;
    uint32_t n_world = params.n_world;
    uint32_t my_rank = params.rank;
    bool auto_schedule = n_world == 1 || params.n_layer_window[0] == 0;

    if (auto_schedule) {
        // get device profile
        LOG_INF("Start profiling this device, this may take some seconds ...\n");

        device_info dev_info;
        dev_info.rank = params.rank;
        llama_profile_device(&dev_info, model, ml, params.cuda_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
    }

    // create llama context
    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
    llama_context * lctx = llama_new_context_with_model(model, cparams);

    uint32_t n_world = cparams.n_world;
    uint32_t my_rank = cparams.rank;

    // initialize sockets
    llama_init_sockets(lctx, n_world, my_rank);

    if (auto_schedule) {
        // synchronize device profile to the master node
        struct device_info * dev_info_set = nullptr;
        if (my_rank == 0) {
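The main change in this hunk is that device profiling is now gated on auto_schedule: llama_profile_device only runs when there is a single node or when no layer window was set manually (the first entry of params.n_layer_window is still 0). A minimal standalone sketch of that predicate, with a hypothetical helper name that is not part of the commit:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper mirroring the condition in the hunk above: schedule
    // automatically when running on a single node, or when the user left the
    // per-rank layer window unset (first entry still 0).
    static bool should_auto_schedule(uint32_t n_world, const uint32_t * n_layer_window) {
        return n_world == 1 || n_layer_window[0] == 0;
    }

    int main() {
        uint32_t manual_window[32] = {16, 16};  // e.g. two ranks, 16 layers each
        uint32_t unset_window[32]  = {0};       // nothing set manually
        printf("manual, 2 ranks -> %d\n", should_auto_schedule(2, manual_window));  // prints 0
        printf("unset,  2 ranks -> %d\n", should_auto_schedule(2, unset_window));   // prints 1
        return 0;
    }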
@@ -1437,14 +1440,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

    uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};
    if (my_rank == 0) {
        if (n_world == 1 || params.n_layer_window[0] == 0) {
            // automatically determine n_layer_window and n_gpu_layers
            assign_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams);
        } else {
            // use manually set n_layer_window
            std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), n_layer_window);
        }

        // synchronize the new n_layer_window and n_gpu_layers to other nodes
        llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
    } else {
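On rank 0 this hunk either calls assign_device to compute the split automatically or copies the user-supplied params.n_layer_window, and then broadcasts the result with llama_bcast_layer_setup. A hedged sketch of a sanity check one could run on the manual path; the helper and the layer count below are assumptions, not part of the commit:

    #include <cstdint>
    #include <cstdio>
    #include <numeric>

    // Hypothetical sanity check, not part of the commit: the manually set
    // per-rank windows should add up to the model's total layer count before
    // they are broadcast; otherwise automatic assignment is the safer fallback.
    static bool layer_window_is_valid(const uint32_t * n_layer_window, uint32_t n_world, uint32_t n_layers) {
        uint32_t total = std::accumulate(n_layer_window, n_layer_window + n_world, 0u);
        return total == n_layers;
    }

    int main() {
        uint32_t window[32] = {20, 12};   // two ranks: 20 + 12 layers
        const uint32_t n_world  = 2;
        const uint32_t n_layers = 32;     // assumed model layer count
        printf("window valid: %d\n", layer_window_is_valid(window, n_world, n_layers));  // prints 1
        return 0;
    }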
@@ -1465,6 +1462,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

#ifdef LLAMA_DEBUG
        device_print_props(dev_info_set, n_world, model, cparams);
#endif
    }

    LOG_INF("\nUsing window size: %d, GPU layers: %d\n\n", cparams.n_layer_window[my_rank], cparams.n_gpu_layers);

    if (!mparams.vocab_only && llm_load_tensors(ml, model, mparams) < 0) {
        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
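After the broadcast each rank reads its own slot of the shared window array, which is what the LOG_INF line above prints. A small illustration with plain arrays standing in for cparams; the helper name and the example values are hypothetical:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical illustration, not from the commit: after the layer setup is
    // broadcast, each rank indexes the shared array with its own rank id to
    // find the slice of layers it is responsible for.
    static void print_rank_setup(const uint32_t * n_layer_window, uint32_t my_rank, uint32_t n_gpu_layers) {
        printf("rank %u: window size %u, GPU layers %u\n",
               my_rank, n_layer_window[my_rank], n_gpu_layers);
    }

    int main() {
        uint32_t n_layer_window[32] = {20, 12};  // example broadcast result for two ranks
        print_rank_setup(n_layer_window, /*my_rank=*/1, /*n_gpu_layers=*/8);  // rank 1: window size 12, GPU layers 8
        return 0;
    }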