remove unnecessary profile when --lw is specified

2025-09-10 10:24:34 +00:00 · 2025-04-17 13:49:09 +00:00 · 2025-04-17 13:49:09 +00:00 · 168c14f4e8
commit 168c14f4e8
parent 9cb87f7923
4 changed files with 73 additions and 15 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1576,13 +1576,6 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
    uint32_t my_rank   = params.rank;
    bool auto_schedule = params.n_layer_window[0] == 0;
    
-    // get device profile
-    LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
-    dev_info.rank = params.rank;
-    if (n_world > 1) {
-        llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
-    }
-
    // create llama context
    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
    llama_context * lctx                = llama_new_context_with_model(model, cparams);
@ -1599,16 +1592,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
        // initialize sockets
        llama_init_sockets(lctx, n_world, my_rank);

+        // broadcast startup args
+        struct startup_args args;
+        if (my_rank==0){
+            args.should_profile = auto_schedule;
+        }
+        llama_bcast_startup_args(lctx, my_rank, &args);
+
+        auto_schedule = args.should_profile;
+        // if n_world > 1 and need auto schdule, then prifile
+        if (auto_schedule){
+            // get device profile
+            LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
+            dev_info.rank = params.rank;
+            if (n_world > 1) {
+                llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+            }
+        }
+
        // sychronize device profile to the master node
-        struct device_info * dev_info_set = nullptr;
        if (my_rank == 0) {
-            dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
-            dev_info_set[0] = dev_info;
-
-            llama_gather_device_info(lctx, dev_info_set);
-            device_print_props(dev_info_set, n_world, model, cparams);
-
            if (auto_schedule) {
+                struct device_info * dev_info_set = nullptr;
+                dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
+                dev_info_set[0] = dev_info;
+
+                llama_gather_device_info(lctx, dev_info_set);
+                device_print_props(dev_info_set, n_world, model, cparams);
+
                // automatically determine n_layer_window and n_gpu_layers
                if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
                    LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
@ -1623,7 +1634,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
                llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
            }
        } else {
-            llama_send_device_info(lctx, &dev_info);
+            if (auto_schedule){
+                llama_send_device_info(lctx, &dev_info);
+            }
            llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
        }