Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-06 22:59:02 +00:00
Commit: show device props
parent f9d16fbf71, commit 6761ca5358
2 changed files with 10 additions and 14 deletions
@@ -1434,6 +1434,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
         llama_gather_device_info(lctx, dev_info_set);
+        device_print_props(dev_info_set, n_world, model, cparams);
     } else {
         llama_send_device_info(lctx, &dev_info);
     }
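After this hunk, the head node (my_rank == 0) prints the gathered device table right after llama_gather_device_info() returns, instead of only under LLAMA_DEBUG (that guarded call is dropped in the next hunk). Below is a minimal, self-contained toy analogue of the flow; the types and helpers are made up and only stand in for prima.cpp's struct device_info, llama_gather_device_info(), llama_send_device_info() and device_print_props():

// toy_gather_props.cpp -- illustrative sketch only, not prima.cpp code
#include <cstdio>
#include <cstdlib>
#include <vector>

struct toy_device_info {       // stand-in for struct device_info
    int   rank;
    float memory_gb;
    int   gpu_layers;
};

// stand-in for llama_gather_device_info(): rank 0 fills one slot per device
static void toy_gather(std::vector<toy_device_info> & set) {
    for (size_t r = 1; r < set.size(); ++r) {
        set[r] = { (int) r, 8.0f, 0 };      // pretend these arrived from rank r
    }
}

// stand-in for device_print_props(): dump one row per device
static void toy_print_props(const std::vector<toy_device_info> & set) {
    for (const auto & d : set) {
        std::printf("| rank %-2d | mem %5.1f GiB | gpu layers %-3d |\n",
                    d.rank, d.memory_gb, d.gpu_layers);
    }
}

int main(int argc, char ** argv) {
    const int n_world = 4;
    const int my_rank = (argc > 1) ? std::atoi(argv[1]) : 0;   // 0 = head node
    toy_device_info dev_info = { my_rank, 16.0f, 32 };

    if (my_rank == 0) {
        std::vector<toy_device_info> dev_info_set(n_world);
        dev_info_set[0] = dev_info;                 // slot 0 is the head node itself
        toy_gather(dev_info_set);                   // collect props from ranks 1..n_world-1
        toy_print_props(dev_info_set);              // now printed unconditionally
    } else {
        // a non-head rank would send dev_info to rank 0 here (llama_send_device_info)
    }
    return 0;
}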
@@ -1458,10 +1459,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         cparams.n_gpu_layers = n_gpu_layers[my_rank];
         mparams.n_gpu_layers = n_gpu_layers[my_rank];
         llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]);
-
-#ifdef LLAMA_DEBUG
-        device_print_props(dev_info_set, n_world, model, cparams);
-#endif
     }

     LOG_INF("\nUsing window size: %d, GPU layers: %d\n\n", cparams.n_layer_window[my_rank], cparams.n_gpu_layers);
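The dropped #ifdef LLAMA_DEBUG block is exactly the print that now runs at gather time in the previous hunk. The surrounding context keeps the per-rank configuration: each rank picks its own entry out of arrays computed on the head node (a layer window and a GPU-offload count per device). A tiny sketch of that indexing, with hypothetical numbers rather than anything prima.cpp actually computes:

// per_rank_window.cpp -- illustrative sketch only; the arrays and values are invented
#include <cstdio>

int main() {
    const int n_world          = 4;
    const int n_layer_window[] = {10, 8, 8, 6};   // hypothetical split of a 32-layer model
    const int n_gpu_layers[]   = {10, 0, 8, 0};   // hypothetical per-device GPU offload

    for (int my_rank = 0; my_rank < n_world; ++my_rank) {
        // the real code does this once, for its own rank only
        std::printf("Using window size: %d, GPU layers: %d\n",
                    n_layer_window[my_rank], n_gpu_layers[my_rank]);
    }
    return 0;
}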
@@ -1966,17 +1966,16 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output);
     LOG_INF("\n");

-    // todo: calculate for each device, not only master
-    float latency = 0.0f;
-    int n_layers = llama_model_n_layers (model);
-    latency += device_compute_delay (dev_info_set[0], n_layers, cparams);
-    latency += device_memory_access_delay(dev_info_set[0], model, cparams, n_layers);
-    latency += device_disk_access_delay (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later
-    latency += device_mem_copy_delay (dev_info_set[0], model, cparams); // memory copy delay in kvcache
+    // float latency = 0.0f;
+    // int n_layers = llama_model_n_layers (model);
+    // latency += device_compute_delay (dev_info_set[0], n_layers, cparams);
+    // latency += device_memory_access_delay(dev_info_set[0], model, cparams, n_layers);
+    // latency += device_disk_access_delay (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later
+    // latency += device_mem_copy_delay (dev_info_set[0], model, cparams); // memory copy delay in kvcache

-    LOG_INF("| Token latency (ms)      ");
-    LOG_INF("| %-10.2f   ", latency);
-    LOG_INF("\n");
+    // LOG_INF("| Token latency (ms)      ");
+    // LOG_INF("| %-10.2f   ", latency);
+    // LOG_INF("\n");

     LOG_INF("-------------------------------------------------------------------------------------------\n\n");
 }
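The code commented out in this hunk was a rough per-token latency estimate for the head device only (hence the dropped "todo: calculate for each device, not only master"): it summed a compute delay, a memory-access delay, a disk-access delay (re-reading mmap'd weights when physical memory is short) and a KV-cache memory-copy delay. A sketch of that additive model with made-up component values, kept only to show the shape of the estimate:

// token_latency_sketch.cpp -- illustrative sketch only; the component values are invented,
// the real terms come from device_compute_delay(), device_memory_access_delay(),
// device_disk_access_delay() and device_mem_copy_delay() in prima.cpp
#include <cstdio>

int main() {
    float compute_ms    = 35.0f;  // hypothetical: time to run one token's worth of compute
    float mem_access_ms = 12.0f;  // hypothetical: reading weights/activations from memory
    float disk_ms       =  0.0f;  // hypothetical: reloading evicted mmap'd weights, if any
    float kv_copy_ms    =  1.5f;  // hypothetical: memory copies for the KV cache

    float latency = compute_ms + mem_access_ms + disk_ms + kv_copy_ms;
    std::printf("| Token latency (ms)      | %-10.2f |\n", latency);
    return 0;
}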