Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-05 16:59:03 +00:00
remove unnecessary profiling when --lw is specified
parent 9cb87f7923
commit 168c14f4e8
4 changed files with 73 additions and 15 deletions
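With --lw the user pins the layer window by hand, so n_layer_window is already known at startup and the per-device profiling pass is wasted work. After this change, rank 0 derives a should_profile flag (automatic scheduling is needed exactly when the first n_layer_window entry is still 0), broadcasts it to all ranks through the new llama_bcast_startup_args, and the profiling plus device-info exchange run only when automatic scheduling is actually required.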
@@ -1576,13 +1576,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint32_t my_rank = params.rank;
     bool auto_schedule = params.n_layer_window[0] == 0;
 
-    // get device profile
-    LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
-    dev_info.rank = params.rank;
-    if (n_world > 1) {
-        llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
-    }
-
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -1599,16 +1592,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     // initialize sockets
     llama_init_sockets(lctx, n_world, my_rank);
 
+    // broadcast startup args
+    struct startup_args args;
+    if (my_rank == 0) {
+        args.should_profile = auto_schedule;
+    }
+    llama_bcast_startup_args(lctx, my_rank, &args);
+
+    auto_schedule = args.should_profile;
+    // if n_world > 1 and auto schedule is needed, then profile
+    if (auto_schedule) {
+        // get device profile
+        LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
+        dev_info.rank = params.rank;
+        if (n_world > 1) {
+            llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+        }
+    }
 
     // synchronize device profile to the master node
-    struct device_info * dev_info_set = nullptr;
     if (my_rank == 0) {
-        dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
-        dev_info_set[0] = dev_info;
-
-        llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world, model, cparams);
-
+        if (auto_schedule) {
+            struct device_info * dev_info_set = nullptr;
+            dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
+            dev_info_set[0] = dev_info;
+
+            llama_gather_device_info(lctx, dev_info_set);
+            device_print_props(dev_info_set, n_world, model, cparams);
+
         // automatically determine n_layer_window and n_gpu_layers
         if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
             LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
@@ -1623,7 +1634,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
         }
     } else {
-        llama_send_device_info(lctx, &dev_info);
+        if (auto_schedule) {
+            llama_send_device_info(lctx, &dev_info);
+        }
        llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
     }
 
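Read together, the hunks above replace the old unconditional profiling with the flow sketched below. This is a condensed paraphrase of the new code path, not a verbatim excerpt; my_rank, lctx, dev_info and the params fields are declared elsewhere in llama_init_from_gpt_params:

    // --lw fills n_layer_window up front, so a zero first entry means
    // the user asked for automatic scheduling
    bool auto_schedule = params.n_layer_window[0] == 0;

    struct startup_args args;
    if (my_rank == 0) {
        args.should_profile = auto_schedule;   // rank 0 decides for everyone
    }
    llama_bcast_startup_args(lctx, my_rank, &args);
    auto_schedule = args.should_profile;       // every rank adopts the decision

    if (auto_schedule) {
        // profile only when the layer split still has to be solved for
        llama_profile_device(&dev_info, model, ml, params.gpu_mem,
                             params.n_predict, params.n_ctx,
                             params.cpuparams.n_threads, params.flash_attn);
        // ... rank 0 then gathers device info, runs the HiGHS solver,
        // and broadcasts the resulting layer setup
    }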
@@ -312,6 +312,10 @@ struct disk_props {
         write_rnd_bw(0.0f) {}
 };
 
+struct startup_args {
+    bool should_profile;
+};
+
 struct device_info {
     uint32_t rank;
     const char * device_name;
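llama_bcast_startup_args (below) does not ship this struct as a single blob: it sends the should_profile field as its own raw message frame after a string tag. Any field later added to startup_args therefore needs a matching frame on the wire.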
@@ -453,6 +453,7 @@ extern "C" {
     LLAMA_API void llama_free_sockets      (struct llama_context * ctx, char ** msg);
     LLAMA_API int  llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
     LLAMA_API int  llama_send_device_info  (struct llama_context * ctx, struct device_info * dev_info);
+    LLAMA_API int  llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
     LLAMA_API int  llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
     LLAMA_API int  llama_recv_layer_setup  (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
 
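Like the other socket helpers in this list, llama_bcast_startup_args is collective: rank 0 sends, every other rank receives, and all but the last rank forward. Each rank must therefore call it at the same point in startup, otherwise the next rank blocks in its receive.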
@@ -20262,6 +20262,46 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_info) {
     return 0;
 }
 
+LLAMA_API int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) {
+    int32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    if (rank == 0) {
+        // rank 0 sends the flag to its successor
+        try {
+            std::vector<zmq::message_t> send_msgs;
+            send_msgs.emplace_back("should_profile", strlen("should_profile"));
+            send_msgs.emplace_back(&args->should_profile, sizeof(args->should_profile));
+            zmq::send_multipart(*ctx->send_socket, send_msgs);
+        } catch (const zmq::error_t & e) {
+            LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+            return -1;
+        }
+    } else {
+        // other ranks receive the flag ...
+        std::vector<zmq::message_t> recv_msgs;
+        if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+            return -1;
+        }
+        GGML_ASSERT(recv_msgs[0].to_string() == "should_profile");
+        GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
+        bool should_profile = *static_cast<bool *>(recv_msgs[1].data());
+        args->should_profile = should_profile;
+        if (rank != n_world - 1) {
+            // ... and forward it down the chain, except on the last rank
+            try {
+                zmq::send_multipart(*ctx->send_socket, recv_msgs);
+            } catch (const zmq::error_t & e) {
+                LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
 int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
     uint32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
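For readers unfamiliar with the cppzmq calls used above, the following minimal, self-contained program demonstrates the same two-frame relay pattern between a sender and one receiver. The PAIR sockets, the inproc endpoint and the thread setup are illustrative choices for a single-process demo, not part of prima.cpp:

    // build: g++ -std=c++17 demo.cpp -lzmq -pthread
    #include <zmq.hpp>
    #include <zmq_addon.hpp>

    #include <cstdio>
    #include <cstring>
    #include <iterator>
    #include <thread>
    #include <vector>

    int main() {
        zmq::context_t zmq_ctx(1);

        // "rank 0": bind before the peer thread connects
        zmq::socket_t send_socket(zmq_ctx, zmq::socket_type::pair);
        send_socket.bind("inproc://startup_args");

        std::thread next_rank([&zmq_ctx] {
            zmq::socket_t recv_socket(zmq_ctx, zmq::socket_type::pair);
            recv_socket.connect("inproc://startup_args");

            // receive both frames: a string tag and a raw bool payload
            std::vector<zmq::message_t> recv_msgs;
            if (!zmq::recv_multipart(recv_socket, std::back_inserter(recv_msgs))) {
                return;
            }
            bool should_profile = *static_cast<bool *>(recv_msgs[1].data());
            std::printf("%s = %d\n", recv_msgs[0].to_string().c_str(), should_profile);
            // a middle rank would now send recv_msgs onward on its own send socket
        });

        // send the decision as one multipart message, as rank 0 does above
        bool should_profile = true;  // i.e. --lw was not given
        std::vector<zmq::message_t> send_msgs;
        send_msgs.emplace_back("should_profile", std::strlen("should_profile"));
        send_msgs.emplace_back(&should_profile, sizeof(should_profile));
        zmq::send_multipart(send_socket, send_msgs);

        next_rank.join();
        return 0;
    }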