From 07a397360b420b6c08c7cf26ef1da6aab0c291c4 Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Wed, 19 Feb 2025 16:30:18 +0400 Subject: [PATCH] fix gpu underutilization --- common/common.cpp | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 35da8f2c..deaffc71 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1362,17 +1362,22 @@ static bool assign_layers_to_device( } // check the solution - bool is_set_suboptimal = false; + bool has_free_gpu_memory = false, has_overload = false; for (uint32_t m = 0; m < n_world; ++m) { uint32_t w_m = best_solution[m], n_m = best_solution[m + n_world]; - // if w[m] > n[m] and there is still free VRAM, the GPU is not fully utilized, - // indicating that the memory constraints are too strict, and the set assignment is suboptimal. - if (w_m > n_m && n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) { - is_set_suboptimal = true; - } + + // if there is still free GPU memory + if (n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) { + has_free_gpu_memory = true; + } + + // if there is device overloaded + if (w_m > n_m) { + has_overload = true; + } } - if (is_set_suboptimal) { + if (has_free_gpu_memory && has_overload) { int worst_device = -1; float worst_speed = std::numeric_limits<float>::max(); @@ -1422,8 +1427,8 @@ static bool assign_layers_to_device( LOG_INF(" - N Layer Window : %d\n", w[m]); LOG_INF(" - N GPU Layers : %d\n", n[m]); } - LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective); - LOG_INF("------------------------------------------"); + // LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective); + // LOG_INF("------------------------------------------"); // copy value from w and n to n_layer_window and n_gpu_layers, respectively std::copy(w.begin(), w.end(), n_layer_window);