diff --git a/common/common.cpp b/common/common.cpp
index 226958d7..f449a501 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -832,7 +832,7 @@ std::string fs_get_cache_file(const std::string & filename) {
     return cache_directory + filename;
 }
 
-static void assign_device(
+static bool assign_device(
     uint32_t n_world,
     uint32_t my_rank,
     const device_info * dev_info_set,
@@ -849,7 +849,7 @@ static void assign_device(
     const uint32_t n_layer = llama_model_n_layers(model);
     if (n_world == 1) {
         n_layer_window[0] = n_layer;
-        return;
+        return true;
     }
 
     const device_info &master = dev_info_set[0];
@@ -958,6 +958,11 @@ static void assign_device(
         w[m] = std::round(mem_budget[m] / total_mem_budget * n_layer);
         n[m] = 0;
     }
+    // adjust w[m] to ensure L mod W = 0
+    int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
+    auto device = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
+                             : std::min_element(mem_budget.begin(), mem_budget.end());
+    w[std::distance(mem_budget.begin(), device)] += diff;
 
 #if defined(USE_HIGHS)
     // stores the actual read bandwidth (GB/s) for each device
@@ -1066,7 +1071,13 @@ static void assign_device(
     while (true) {
         int W = std::accumulate(w.begin(), w.end(), 0);
         int cur_k = (int)n_layer / W;
-        GGML_ASSERT(W > 1 && (int)n_layer % W == 0 && "Constraint: L = k * W must hold\n");
+
+        if (W <= 1 || (int)n_layer % W != 0) {
+            LOG_INF("Constraint: L = k * W must hold, but W = %d, L = %d\n", W, n_layer);
+            fflush(stdout);
+            fflush(stderr);
+            return false;
+        }
 
         if (!assign_sets(cur_k)) break;
 
@@ -1380,6 +1391,8 @@ static void assign_device(
     // copy value from w and n to n_layer_window and n_gpu_layers, respectively
     std::copy(w.begin(), w.end(), n_layer_window);
     std::copy(n.begin(), n.end(), n_gpu_layers);
+
+    return true;
 }
 
 //
@@ -1465,7 +1478,13 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};
     if (my_rank == 0) {
         // automatically determine n_layer_window and n_gpu_layers
-        assign_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams);
+        if (!assign_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
+            LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
+            llama_free(lctx);
+            llama_free_model(model);
+            return iparams;
+        }
+
         // synchronize the new n_layer_window and n_gpu_layers to other nodes
         llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
     } else {
@@ -1494,6 +1513,8 @@
 
     if (!mparams.vocab_only && llm_load_tensors(ml, model, mparams) < 0) {
         LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        llama_free(lctx);
+        llama_free_model(model);
         return iparams;
     }
 
@@ -1501,6 +1522,7 @@
 
     if (llama_context_setup_backend(model, cparams, lctx) == nullptr) {
         LOG_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str());
+        llama_free(lctx);
         llama_free_model(model);
         return iparams;
     }
@@ -1513,7 +1535,6 @@
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-
             return iparams;
         }
 
@@ -1526,7 +1547,6 @@
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-
             return iparams;
         }
     }
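
For reference, here is a minimal standalone sketch of what the new pre-adjustment hunk in assign_device (common.cpp:958) does: split n_layer across devices in proportion to each device's memory budget, then fold the per-device rounding error into a single device so the window total exactly divides the layer count. The main harness and the budget values are hypothetical, not taken from the patch; only the adjustment idiom mirrors the diff above.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <iterator>
#include <numeric>
#include <vector>

int main() {
    const int n_layer = 32;                           // L: total model layers
    std::vector<double> mem_budget = {7.5, 3.2, 5.1}; // hypothetical GiB per device

    const double total_mem_budget =
        std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0);

    // proportional split, rounded per device; sum(w) can miss n_layer by a few
    std::vector<int> w(mem_budget.size());
    for (size_t m = 0; m < w.size(); ++m) {
        w[m] = (int)std::round(mem_budget[m] / total_mem_budget * n_layer);
    }

    // adjust w to ensure L mod W == 0: give any surplus to the device with the
    // largest budget, take any deficit from the one with the smallest
    int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
    auto device = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
                             : std::min_element(mem_budget.begin(), mem_budget.end());
    w[std::distance(mem_budget.begin(), device)] += diff;

    assert(std::accumulate(w.begin(), w.end(), 0) == n_layer); // L = 1 * W holds
    for (size_t m = 0; m < w.size(); ++m) {
        std::printf("device %zu: %d layers\n", m, w[m]);
    }
    return 0;
}

Pushing a positive remainder onto the device with the largest budget (and taking a deficit from the smallest) keeps the allocation closest to the proportional ideal while making the solver's L = k * W precondition hold trivially with k = 1, so the replaced GGML_ASSERT path is no longer reachable from the initial split.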