diff --git a/common/common.cpp b/common/common.cpp
index 5c972c90..55807f78 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1576,13 +1576,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint32_t my_rank = params.rank;
     bool auto_schedule = params.n_layer_window[0] == 0;
 
-    // get device profile
-    LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
-    dev_info.rank = params.rank;
-    if (n_world > 1) {
-        llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
-    }
-
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -1599,16 +1592,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     // initialize sockets
     llama_init_sockets(lctx, n_world, my_rank);
 
+    // broadcast startup args
+    struct startup_args args;
+    if (my_rank == 0) {
+        args.should_profile = auto_schedule;
+    }
+    llama_bcast_startup_args(lctx, my_rank, &args);
+
+    auto_schedule = args.should_profile;
+    // if n_world > 1 and auto scheduling is needed, then profile
+    if (auto_schedule) {
+        // get device profile
+        LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
+        dev_info.rank = params.rank;
+        if (n_world > 1) {
+            llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+        }
+    }
+
     // sychronize device profile to the master node
-    struct device_info * dev_info_set = nullptr;
     if (my_rank == 0) {
-        dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
-        dev_info_set[0] = dev_info;
-
-        llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world, model, cparams);
-
         if (auto_schedule) {
+            struct device_info * dev_info_set = nullptr;
+            dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
+            dev_info_set[0] = dev_info;
+
+            llama_gather_device_info(lctx, dev_info_set);
+            device_print_props(dev_info_set, n_world, model, cparams);
+
             // automatically determine n_layer_window and n_gpu_layers
             if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
                 LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
@@ -1623,7 +1634,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
         }
     } else {
-        llama_send_device_info(lctx, &dev_info);
+        if (auto_schedule) {
+            llama_send_device_info(lctx, &dev_info);
+        }
         llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
     }
 
diff --git a/common/profiler.h b/common/profiler.h
index b8fff0d1..a685ff8c 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -312,6 +312,10 @@ struct disk_props {
         write_rnd_bw(0.0f) {}
 };
 
+struct startup_args {
+    bool should_profile;
+};
+
 struct device_info {
     uint32_t rank;
     const char * device_name;
diff --git a/include/llama.h b/include/llama.h
index 7d7392fe..9f3da708 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -453,6 +453,7 @@ extern "C" {
     LLAMA_API void llama_free_sockets      (struct llama_context * ctx, char ** msg);
     LLAMA_API int  llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
     LLAMA_API int  llama_send_device_info  (struct llama_context * ctx, struct device_info * dev_info);
+    LLAMA_API int  llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
     LLAMA_API int  llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
     LLAMA_API int  llama_recv_layer_setup  (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 1aedb6a4..87ae83ac 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20262,6 +20262,46 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
     return 0;
 }
 
+LLAMA_API int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) {
+    int32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    if (rank == 0) {
+        // send
+        try {
+            std::vector<zmq::const_buffer> send_msgs;
+            send_msgs.emplace_back("should_profile", strlen("should_profile"));
+            send_msgs.emplace_back(&args->should_profile, sizeof(args->should_profile));
+            zmq::send_multipart(*ctx->send_socket, send_msgs);
+        } catch (const zmq::error_t & e) {
+            LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+            return -1;
+        }
+    } else {
+        // receive
+        std::vector<zmq::message_t> recv_msgs;
+        if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+            return -1;
+        }
+        GGML_ASSERT(recv_msgs[0].to_string() == "should_profile");
+        GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
+        bool should_profile = *static_cast<bool *>(recv_msgs[1].data());
+        args->should_profile = should_profile;
+        if (rank != n_world - 1) {
+            // forward to the next rank
+            try {
+                zmq::send_multipart(*ctx->send_socket, recv_msgs);
+            } catch (const zmq::error_t & e) {
+                LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
 int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
     uint32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
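
The new llama_bcast_startup_args is not a true broadcast but a relay along the node chain: rank 0 sends the two-part message ["should_profile", <bool>], every intermediate rank receives it from the previous rank and forwards the parts unchanged, and the last rank only receives. The following is a minimal, self-contained sketch of that relay pattern using cppzmq outside of the llama_context; the inproc endpoints, the single-process loop standing in for "ranks", and the socket names are illustrative assumptions, not part of the patch.

// relay_sketch.cpp -- sketch of the should_profile relay pattern (assumed setup, not the patch itself)
#include <zmq.hpp>
#include <zmq_addon.hpp>   // zmq::send_multipart / zmq::recv_multipart
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

int main() {
    const int n_world = 3;               // illustrative 3-node chain
    bool should_profile = true;          // what rank 0 decides (auto_schedule in the patch)

    zmq::context_t zctx(1);

    // pre-bind the receiving end of every hop: rank r reads from inproc://to_rank<r>
    std::vector<zmq::socket_t> recv_sockets;
    for (int r = 1; r < n_world; ++r) {
        recv_sockets.emplace_back(zctx, zmq::socket_type::pull);
        recv_sockets.back().bind("inproc://to_rank" + std::to_string(r));
    }

    for (int rank = 0; rank < n_world; ++rank) {
        // socket used to forward to the next rank (unused by the last rank)
        zmq::socket_t send_socket(zctx, zmq::socket_type::push);
        if (rank != n_world - 1) {
            send_socket.connect("inproc://to_rank" + std::to_string(rank + 1));
        }

        if (rank == 0) {
            // rank 0: send ["should_profile", <bool>] as a multipart message
            std::vector<zmq::const_buffer> msgs;
            msgs.emplace_back("should_profile", std::strlen("should_profile"));
            msgs.emplace_back(&should_profile, sizeof(should_profile));
            zmq::send_multipart(send_socket, msgs);
        } else {
            // other ranks: receive from the previous rank, then forward unless last
            std::vector<zmq::message_t> msgs;
            zmq::recv_multipart(recv_sockets[rank - 1], std::back_inserter(msgs));
            bool flag = *static_cast<bool *>(msgs[1].data());
            std::cout << "rank " << rank << " received should_profile=" << flag << "\n";
            if (rank != n_world - 1) {
                zmq::send_multipart(send_socket, msgs);   // forward the parts unchanged
            }
        }
    }
    return 0;
}

This mirrors the patch's structure: the master decides the flag, each follower learns it before deciding whether to run llama_profile_device, and only the last node stops forwarding.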