diff --git a/common/common.cpp b/common/common.cpp
index 88b00075..a98337d3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1547,7 +1547,7 @@ static bool tune_layer_allocation(
         dev_infos_temp.clear();
         n_layer_windows_temp.clear();
         n_gpu_layers_temp.clear();
-        for(auto i=0; i<n_world; i++) {
+        for(uint32_t i=0; i<n_world; i++) {
             if(n_layer_windows_[i] > 1 || i==0 ) {
                 dev_infos_temp.push_back(dev_infos_[i]);
                 n_layer_windows_temp.push_back(n_layer_windows_[i]);
@@ -1561,7 +1561,7 @@
         n_world = dev_infos_temp.size();
     }

-    int i =0 , j =0;
+    uint32_t i =0 , j =0;
     while(j < n_world) {
         if(dev_infos[i].rank == dev_infos_temp[j].rank){
             n_layer_window[i] = n_layer_windows_temp[j];
@@ -1701,13 +1701,19 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
         }
     }
+    if(n_layer_window[my_rank]<=0){
+        LOG_INF("%s: info: rank %d has no layers to run, skipping\n", __func__, my_rank);
+        llama_free(lctx);
+        llama_free_model(model);
+        exit(0);
+    }
     //update rank and n_world for consistency
     uint32_t update_rank = 0;
     uint32_t update_n_world = 1;
     std::vector<uint32_t> n_layer_window_temp = {n_layer_window[0]};
     std::vector<uint32_t> n_gpu_layers_temp = {n_gpu_layers[0]};
-    for(auto i=1; i<n_world; i++) {
+    for(uint32_t i=1; i<n_world; i++) {
diff --git a/src/llama.cpp b/src/llama.cpp
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ ... @@
-    uint32_t n_world = ctx->cparams.n_world;
+    auto n_world = ctx->cparams.n_world;
     if (n_world == 1) {
         return 0;
     }
@@ -20343,14 +20343,14 @@ LLAMA_API int llama_rebuild_topo(llama_context *ctx,
         }
         dev_info_ptr = new device_info[n_world];
         for (size_t i = 0; i < msgs.size(); i++) {
-            deserialize((const char *)msgs[i].data(), &dev_info_set[i]);
+            deserialize((const char *)msgs[i].data(), &dev_info_ptr[i]);
         }
     }else{
         char * buffer = nullptr;
         for(size_t i = 0; i < n_world; i++) {
             size_t buffer_size = serialize(&dev_info_set[i], &buffer);
             msgs.emplace_back(buffer, buffer_size);
-
+            free(buffer);
         }
         dev_info_ptr = dev_info_set;

@@ -20361,9 +20361,9 @@

     // notify next rank
     auto next_rank = (my_rank + 1) % n_world;
-    if(n_layer_window[next_rank] <= 0){
+    if(n_layer_window[next_rank] <= 0 && next_rank != 0){
         try {
-            ctx->send_socket->setsockopt(ZMQ_LINGER, 3500);
+            ctx->send_socket->set(zmq::sockopt::linger, 3500);
             zmq::send_multipart(*ctx->send_socket, msgs);
         } catch (const zmq::error_t& e) {
             LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
@@ -20382,7 +20382,7 @@
         auto current_rank = my_rank;
         while(next_rank!=my_rank){
             if(n_layer_window[next_rank] > 0){
-                next_ip = dev_info_ptr[next_rank].next_ip;
+                next_ip = dev_info_ptr[current_rank].next_ip;
                 break;
             }
             next_rank = (next_rank + 1) % n_world;
@@ -20402,6 +20402,9 @@
                 }
                 return -1;
             }
+        }else{
+            // only one node
+            ctx->next_node_ip = "";
         }
     }
     if(!dev_info_set){
@@ ... @@
     }
     socket_to_close->close();
     delete socket_to_close;
-    if(n_layer_window[my_rank]<=0){
-        exit(0);
-    }
-    return true;
+    return 0;
 }

 int llama_recv_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
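Side note on the socket-option change above: ctx->send_socket->setsockopt(ZMQ_LINGER, 3500) uses the deprecated cppzmq overload, and the patch moves to the typed set(zmq::sockopt::linger, ...) API. The snippet below is a minimal standalone sketch of that API only, assuming cppzmq >= 4.7; the PUSH socket, endpoint, and payload are placeholders and are not taken from the patch.

#include <cstdio>
#include <string>
#include <vector>
#include <zmq.hpp>
#include <zmq_addon.hpp>   // zmq::send_multipart

int main() {
    zmq::context_t zctx;
    zmq::socket_t sock(zctx, zmq::socket_type::push);
    sock.connect("tcp://127.0.0.1:5555");   // placeholder endpoint

    // Typed option setter used in the patch: cap close() at 3500 ms of
    // lingering on unsent messages instead of blocking indefinitely.
    sock.set(zmq::sockopt::linger, 3500);

    const std::string payload = "hello";
    std::vector<zmq::message_t> msgs;
    msgs.emplace_back(payload.data(), payload.size());

    try {
        zmq::send_multipart(sock, msgs);    // same helper the patch calls
    } catch (const zmq::error_t & e) {
        std::fprintf(stderr, "failed to send data: %s\n", e.what());
        return 1;
    }
    sock.close();
    return 0;
}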