From 50807fd4e1ec79ebd820071bc718335ca28863a1 Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Thu, 26 Jun 2025 08:56:31 +0400
Subject: [PATCH] halda: handle infeasible solution with weak device

---
 common/common.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 7520b703..70979382 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1443,11 +1443,17 @@ static bool assign_layers_to_device(
         }

         // check the solution
-        bool has_free_gpu_memory = false, has_gpu_overload = false, has_cpu_overload = false;
+        bool has_free_gpu_memory = false, has_gpu_overload = false, has_cpu_overload = false, has_weak_device = false;
         for (uint32_t m = 0; m < n_world; ++m) {
             // if (!dev_gpu[m]) continue;
             uint32_t w_m = best_solution[m], n_m = best_solution[m + n_world];

+            if (w_m == 1 && n_m == 0) {
+                // if the device is weak
+                has_weak_device = true;
+                LOG_INF("Device %d is weak, need to be removed: w_m = %d, n_m = %d\n", m, w_m, n_m);
+            }
+
             if (dev_gpu[m]) {
                 if (n_m < static_cast<uint32_t>(std::floor(W * vec_z_gpu[m]))) {
                     // if there is still free GPU memory
@@ -1467,7 +1473,7 @@ static bool assign_layers_to_device(
             }
         }

-        if (has_free_gpu_memory && (has_gpu_overload || has_cpu_overload)) {
+        if (!has_weak_device && has_free_gpu_memory && (has_gpu_overload || has_cpu_overload)) {
             int worst_device = -1;
             float worst_speed = std::numeric_limits<float>::max();
