diff --git a/common/common.cpp b/common/common.cpp
index 5c972c90..55807f78 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1576,13 +1576,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint32_t my_rank = params.rank;
     bool auto_schedule = params.n_layer_window[0] == 0;
 
-    // get device profile
-    LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
-    dev_info.rank = params.rank;
-    if (n_world > 1) {
-        llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
-    }
-
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -1599,16 +1592,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     // initialize sockets
     llama_init_sockets(lctx, n_world, my_rank);
 
+    // broadcast startup args
+    struct startup_args args;
+    if (my_rank == 0) {
+        args.should_profile = auto_schedule;
+    }
+    llama_bcast_startup_args(lctx, my_rank, &args);
+
+    auto_schedule = args.should_profile;
+    // if n_world > 1 and auto scheduling is needed, then profile
+    if (auto_schedule) {
+        // get device profile
+        LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
+        dev_info.rank = params.rank;
+        if (n_world > 1) {
+            llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+        }
+    }
+
     // sychronize device profile to the master node
-    struct device_info * dev_info_set = nullptr;
     if (my_rank == 0) {
-        dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
-        dev_info_set[0] = dev_info;
-
-        llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world, model, cparams);
-
         if (auto_schedule) {
+            struct device_info * dev_info_set = nullptr;
+            dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
+            dev_info_set[0] = dev_info;
+
+            llama_gather_device_info(lctx, dev_info_set);
+            device_print_props(dev_info_set, n_world, model, cparams);
+
             // automatically determine n_layer_window and n_gpu_layers
             if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
                 LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
@@ -1623,7 +1634,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
         }
     } else {
-        llama_send_device_info(lctx, &dev_info);
+        if (auto_schedule) {
+            llama_send_device_info(lctx, &dev_info);
+        }
         llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
     }
 
diff --git a/common/profiler.h b/common/profiler.h
index b8fff0d1..a685ff8c 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -312,6 +312,10 @@ struct disk_props {
         write_rnd_bw(0.0f) {}
 };
 
+struct startup_args {
+    bool should_profile;
+};
+
 struct device_info {
     uint32_t rank;
     const char * device_name;
diff --git a/include/llama.h b/include/llama.h
index 7d7392fe..9f3da708 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -453,6 +453,7 @@ extern "C" {
     LLAMA_API void llama_free_sockets      (struct llama_context * ctx, char ** msg);
     LLAMA_API int  llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
     LLAMA_API int  llama_send_device_info  (struct llama_context * ctx, struct device_info * dev_info);
+    LLAMA_API int  llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
     LLAMA_API int  llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
     LLAMA_API int  llama_recv_layer_setup  (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 1aedb6a4..87ae83ac 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20262,6 +20262,46 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
     return 0;
 }
 
+LLAMA_API int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) {
+    int32_t n_world = ctx->cparams.n_world;
+    if (n_world == 1) {
+        return 0;
+    }
+    GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
+    if (rank == 0) {
+        // send
+        try {
+            std::vector<zmq::const_buffer> send_msgs;
+            send_msgs.emplace_back("should_profile", strlen("should_profile"));
+            send_msgs.emplace_back(&args->should_profile, sizeof(args->should_profile));
+            zmq::send_multipart(*ctx->send_socket, send_msgs);
+        } catch (const zmq::error_t & e) {
+            LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+            return -1;
+        }
+    } else {
+        // receive
+        std::vector<zmq::message_t> recv_msgs;
+        if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
+            return -1;
+        }
+        GGML_ASSERT(recv_msgs[0].to_string() == "should_profile");
+        GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
+        bool should_profile = *static_cast<bool *>(recv_msgs[1].data());
+        args->should_profile = should_profile;
+        if (rank != n_world - 1) {
+            // forward to the next rank
+            try {
+                zmq::send_multipart(*ctx->send_socket, recv_msgs);
+            } catch (const zmq::error_t & e) {
+                LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
 int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
     uint32_t n_world = ctx->cparams.n_world;
     if (n_world == 1) {
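
The new llama_bcast_startup_args is not a true broadcast but a relay along the node chain: rank 0 sends the two-part message ["should_profile", <bool>], every intermediate rank receives it from the previous rank and forwards the parts unchanged, and the last rank only receives. The following is a minimal, self-contained sketch of that relay pattern using cppzmq outside of the llama_context; the inproc endpoints, the single-process loop standing in for "ranks", and the socket names are illustrative assumptions, not part of the patch.

// relay_sketch.cpp -- sketch of the should_profile relay pattern (assumed setup, not the patch itself)
#include <zmq.hpp>
#include <zmq_addon.hpp>   // zmq::send_multipart / zmq::recv_multipart
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

int main() {
    const int n_world = 3;               // illustrative 3-node chain
    bool should_profile = true;          // what rank 0 decides (auto_schedule in the patch)

    zmq::context_t zctx(1);

    // pre-bind the receiving end of every hop: rank r reads from inproc://to_rank<r>
    std::vector<zmq::socket_t> recv_sockets;
    for (int r = 1; r < n_world; ++r) {
        recv_sockets.emplace_back(zctx, zmq::socket_type::pull);
        recv_sockets.back().bind("inproc://to_rank" + std::to_string(r));
    }

    for (int rank = 0; rank < n_world; ++rank) {
        // socket used to forward to the next rank (unused by the last rank)
        zmq::socket_t send_socket(zctx, zmq::socket_type::push);
        if (rank != n_world - 1) {
            send_socket.connect("inproc://to_rank" + std::to_string(rank + 1));
        }

        if (rank == 0) {
            // rank 0: send ["should_profile", <bool>] as a multipart message
            std::vector<zmq::const_buffer> msgs;
            msgs.emplace_back("should_profile", std::strlen("should_profile"));
            msgs.emplace_back(&should_profile, sizeof(should_profile));
            zmq::send_multipart(send_socket, msgs);
        } else {
            // other ranks: receive from the previous rank, then forward unless last
            std::vector<zmq::message_t> msgs;
            zmq::recv_multipart(recv_sockets[rank - 1], std::back_inserter(msgs));
            bool flag = *static_cast<bool *>(msgs[1].data());
            std::cout << "rank " << rank << " received should_profile=" << flag << "\n";
            if (rank != n_world - 1) {
                zmq::send_multipart(send_socket, msgs);   // forward the parts unchanged
            }
        }
    }
    return 0;
}

This mirrors the patch's structure: the master decides the flag, each follower learns it before deciding whether to run llama_profile_device, and only the last node stops forwarding.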