From 1e2b934d6908831c185fe47f906a807b9858f03e Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Mon, 27 Jan 2025 11:13:09 +0400
Subject: [PATCH] add bounds n[m]<=0 for devices without GPUs

---
 common/common.cpp | 97 +++++++++++++++++++++++++++++++----------------
 1 file changed, 65 insertions(+), 32 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index e737a4f3..942d513b 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -832,6 +832,20 @@ std::string fs_get_cache_file(const std::string & filename) {
     return cache_directory + filename;
 }
 
+template <typename T>
+static std::string vec_to_str(const std::vector<T> & vec) {
+    std::ostringstream oss;
+    oss << "[";
+    for (size_t i = 0; i < vec.size(); ++i) {
+        oss << vec[i];
+        if (i < vec.size() - 1) {
+            oss << ", ";
+        }
+    }
+    oss << "]";
+    return oss.str();
+}
+
 static bool assign_layers_to_device(
     uint32_t n_world,
     uint32_t my_rank,
@@ -840,7 +854,7 @@ static bool assign_layers_to_device(
     uint32_t * n_gpu_layers,
     struct llama_model * model,
     const struct llama_context_params cparams,
-    float min_disk_read_speed = 0.5f) { // minimum disk I/O speed: 500 MB/s
+    float min_disk_read_speed = 0.1f) { // minimum disk I/O speed: 100 MB/s
     GGML_ASSERT(dev_info_set != nullptr);
     GGML_ASSERT(n_layer_window != nullptr);
     GGML_ASSERT(my_rank == 0);
@@ -1082,6 +1096,9 @@ static bool assign_layers_to_device(
 
         if (!assign_sets(cur_k)) break;
 
+        LOG_INF("Set assignment: M1: %s, M2: %s, M3: %s, M4: %s\n",
+            vec_to_str(M1).c_str(), vec_to_str(M2).c_str(), vec_to_str(M3).c_str(), vec_to_str(M4).c_str());
+
         // update kappa
         for (uint32_t m = 0; m < n_world; ++m) {
             const device_info & dev = dev_info_set[m];
@@ -1109,6 +1126,14 @@ static bool assign_layers_to_device(
             }
         }
 
+        std::vector<int> dev_gpu(n_world, 0);
+        for (uint32_t m = 0; m < n_world; ++m) {
+            const device_info & dev = dev_info_set[m];
+            if (dev.gpu_support.cuda || dev.gpu_support.metal) {
+                dev_gpu[m] = 1;
+            }
+        }
+
         // -------------------------------------------------------------
         // Construct vectors va, vb, vc
         // -------------------------------------------------------------
@@ -1118,7 +1143,7 @@ static bool assign_layers_to_device(
         // - M3: a[m] = alpha[m] + b' / s_m^{disk}, b[m] = beta[m] - b'/ s_m^{disk}, c[m] = xi[m]
        // - M4: a[m] = alpha[m], b[m] = beta[m], c[m] = xi[m]
         std::vector<float> vec_a(n_world, 0.0f), vec_b(n_world, 0.0f), vec_c(n_world, 0.0f);
-        
+
         for (uint32_t m = 0; m < n_world; ++m) {
             if (in_set(m, M1)) {
                 vec_a[m] = alpha[m] + b / (disk_speed[m] * 1e9) * 1000; // in ms
@@ -1130,11 +1155,15 @@ static bool assign_layers_to_device(
                 vec_c[m] = xi[m];
             } else if (in_set(m, M3)) {
                 vec_a[m] = alpha[m] + b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
-                vec_b[m] = beta[m] - b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
+                if (dev_gpu[m]) {
+                    vec_b[m] = beta[m] - b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
+                }
                 vec_c[m] = xi[m];
             } else {
                 vec_a[m] = alpha[m];
-                vec_b[m] = beta[m];
+                if (dev_gpu[m]) {
+                    vec_b[m] = beta[m];
+                }
                 vec_c[m] = xi[m];
             }
         }
@@ -1143,7 +1172,6 @@ static bool assign_layers_to_device(
         // -------------------------------------------------------------
         // Construct vectors vz, vz_gpu
         // -------------------------------------------------------------
         std::vector<float> vec_z(n_world, 0.0f), vec_z_gpu(n_world, 0.0f);
-        std::vector<int> dev_gpu(n_world, 0);
         for (uint32_t m = 0; m < n_world; ++m) {
             const device_info & dev = dev_info_set[m];
@@ -1168,25 +1196,19 @@ static bool assign_layers_to_device(
                 } else if (is_macos && dev.gpu_support.metal) {
                     vec_z[m] = - (double)(dev.gpu_props.memory_free * GIGABYTE - b_cio - c_gpu[m]) / (double)(n_layer * b_prime);
                 } else {
-                    vec_z[m] = - (double)(dev.memory.available_physical * GIGABYTE + dev.memory.used_can_swap * GIGABYTE * int(is_android) - b_cio) / (double)(n_layer * b_prime);
+                    vec_z[m] = - (double)((dev.memory.available_physical + dev.memory.used_can_swap * int(is_android)) * GIGABYTE - b_cio) / (double)(n_layer * b_prime);
                 }
             }
 
-            if (dev.gpu_support.cuda || dev.gpu_support.metal) {
+            if (dev_gpu[m]) {
                 float reserved_mem = 0.1f; // reserved shared memory to avoid potential OOM, set to 100 MiB by default
                 vec_z_gpu[m] = (double)((dev.gpu_props.memory_free - reserved_mem) * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
                 if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
                     vec_z_gpu[m] -= (double)bo / (double)(n_layer * b_prime);
                 }
-                dev_gpu[m] = 1;
-            } else {
-                vec_z_gpu[m] = -(double)c_gpu[m] / (double)(n_layer * b_prime);
             }
         }
 
-        // count the number of cuda devices
-        int num_dev_gpu = std::accumulate(dev_gpu.begin(), dev_gpu.end(), 0);
-
         // -------------------------------------------------------------
         // Build and solve the optimization model
         // -------------------------------------------------------------
@@ -1203,7 +1225,7 @@ static bool assign_layers_to_device(
 
         // define the number of decision variables and constraints
         model.lp_.num_col_ = n_world * 2; // number of decision variables
-        model.lp_.num_row_ = 1 + 2 * n_world + num_dev_gpu; // number of constraints
+        model.lp_.num_row_ = 1 + 3 * n_world; // number of constraints
 
         // define the objective: k * sum(a[m] * w[m] + b[m] * n[m]) + kappa + k * sum(c[m])
         model.lp_.sense_ = ObjSense::kMinimize;
@@ -1246,10 +1268,8 @@ static bool assign_layers_to_device(
 
         // constraint bound 4: CUDA/shared memory constraint for CUDA/Metal devices
         for (uint32_t m = 0; m < n_world; ++m) {
-            if (dev_gpu[m]) {
-                model.lp_.row_upper_[constraint_idx] = W * vec_z_gpu[m];
-                constraint_idx++;
-            }
+            model.lp_.row_upper_[constraint_idx] = W * vec_z_gpu[m];
+            constraint_idx++;
         }
 
         // define the constraint matrix
@@ -1278,15 +1298,14 @@ static bool assign_layers_to_device(
 
             if (in_set(m, M1) || in_set(m, M2)) { // in sets M1 and M2
                 A[cons_row][m] = -1.0; // coefficient for w[m]
-                A[cons_row][m + n_world] = 0.0; // coefficient for n[m]
             } else if (in_set(m, M3)) { // in set M3
                 A[cons_row][m] = -1.0; // coefficient for w[m]
-                A[cons_row][m + n_world] = 1.0; // coefficient for n[m]
+                if (dev_gpu[m]) {
+                    A[cons_row][m + n_world] = 1.0; // coefficient for n[m]
+                }
             } else { // in set M4
                 A[cons_row][m] = 1.0; // coefficient for w[m]
-                if (is_macos) {
-                    A[cons_row][m + n_world] = 0.0; // coefficient for n[m]
-                } else {
+                if (!is_macos && dev_gpu[m]) {
                     A[cons_row][m + n_world] = -1.0; // coefficient for n[m]
                 }
             }
@@ -1295,11 +1314,8 @@ static bool assign_layers_to_device(
 
         // constraint coefficients 4: CUDA/shared memory constraint for CUDA/Metal devices
         for (uint32_t m = 0; m < n_world; ++m) {
-            if (dev_gpu[m]) {
-                A[constraint_idx][m] = 0.0; // coefficient for w[m]
-                A[constraint_idx][m + n_world] = 1.0; // coefficient for n[m]
-                constraint_idx++;
-            }
+            A[constraint_idx][m + n_world] = 1.0; // coefficient for n[m]
+            constraint_idx++;
         }
 
         // translate the constraint matrix A into the LP model
@@ -1353,6 +1369,13 @@ static bool assign_layers_to_device(
                 best_k = k;
                 best_solution = solution.col_value;
             }
+
+            LOG_INF("k = %2d, obj = %7.1f, solution: %s | best_k = %2d, best_obj = %7.1f, best_solution: %s\n",
+                k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str());
+        }
+
+        if (best_objective > final_objective) {
+            break; // avoid oscillation between two set assignments
         }
 
         // update w[m] and n[m]
@@ -1382,19 +1405,29 @@ static bool assign_layers_to_device(
     LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
     LOG_INF("------------------------------------------");
 
+    // copy value from w and n to n_layer_window and n_gpu_layers, respectively
+    std::copy(w.begin(), w.end(), n_layer_window);
+    std::copy(n.begin(), n.end(), n_gpu_layers);
+
 #else
 
     (void)bi;
     (void)bo;
     (void)kappa;
    (void)cparams;
     (void)min_disk_read_speed;
+    (void)n_vocab;
+    (void)GIGABYTE;
+
+    std::copy(w.begin(), w.end(), n_layer_window);
+    for (uint32_t m = 0; m < n_world; ++m) {
+        const device_info & dev = dev_info_set[m];
+        if (dev.gpu_support.cuda || dev.gpu_support.metal) {
+            n_gpu_layers[m] = w[m];
+        }
+    }
 #endif
 
-    // copy value from w and n to n_layer_window and n_gpu_layers, respectively
-    std::copy(w.begin(), w.end(), n_layer_window);
-    std::copy(n.begin(), n.end(), n_gpu_layers);
-
     return true;
 }
 
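A note on the LP mechanics above, with a sketch that is not part of the patch: after this commit every device contributes one GPU-memory row with upper bound W * vec_z_gpu[m]. Because the `else` branch that previously wrote vec_z_gpu[m] for non-GPU devices is removed, that entry keeps its default of 0, so the row degenerates to n[m] <= 0; combined with the column bound n[m] >= 0 this pins n[m] to zero, which is the bound named in the subject line. The standalone program below reproduces the pattern with the HiGHS C++ API on a made-up two-device problem (device 0 with a GPU, device 1 CPU-only); the costs, W, and vec_z_gpu values are invented for illustration and are not the quantities computed in assign_layers_to_device.

#include <cstdio>
#include <vector>
#include "Highs.h"

int main() {
    const int n_world = 2;                        // device 0 has a GPU, device 1 is CPU-only
    const double W = 32.0;                        // total number of model layers (made up)
    std::vector<double> vec_z_gpu = {0.5, 0.0};   // CPU-only device keeps the default 0

    HighsModel model;
    model.lp_.num_col_ = 2 * n_world;             // decision variables: w[0], w[1], n[0], n[1]
    model.lp_.num_row_ = 1 + n_world;             // sum(w) == W, plus one GPU-memory row per device
    model.lp_.sense_   = ObjSense::kMinimize;
    model.lp_.col_cost_  = {1.0, 2.0, -0.5, -0.5}; // toy latency coefficients, not the patch's a/b vectors
    model.lp_.col_lower_ = {0.0, 0.0, 0.0, 0.0};
    model.lp_.col_upper_ = {W, W, W, W};

    // row 0:     w[0] + w[1] == W
    // row 1 + m: n[m] <= W * vec_z_gpu[m]; for device 1 this reads n[1] <= 0,
    //            which together with the column bound n[1] >= 0 pins n[1] to zero
    model.lp_.row_lower_ = {W, -kHighsInf, -kHighsInf};
    model.lp_.row_upper_ = {W, W * vec_z_gpu[0], W * vec_z_gpu[1]};

    // constraint matrix, stored row-wise: row 0 touches w[0], w[1]; row 1 + m touches n[m]
    model.lp_.a_matrix_.format_ = MatrixFormat::kRowwise;
    model.lp_.a_matrix_.start_  = {0, 2, 3, 4};
    model.lp_.a_matrix_.index_  = {0, 1, 2, 3};
    model.lp_.a_matrix_.value_  = {1.0, 1.0, 1.0, 1.0};

    Highs highs;
    highs.setOptionValue("output_flag", false);   // silence solver logging
    if (highs.passModel(model) != HighsStatus::kOk) return 1;
    if (highs.run() != HighsStatus::kOk) return 1;

    const std::vector<double> & x = highs.getSolution().col_value;
    std::printf("w = [%.0f, %.0f], n = [%.0f, %.0f]\n", x[0], x[1], x[2], x[3]);
    return 0;
}

With these toy numbers the program should print w = [32, 0], n = [16, 0]: the solver pushes n[0] up to its row bound W * vec_z_gpu[0] = 16 because its cost is negative, while the degenerate row forces n[1] = 0 for the CPU-only device, with no per-device branching needed when the rows are built, which mirrors why the patch can drop the `if (dev_gpu[m])` guards around constraint bound 4 and size the model at a fixed 1 + 3 * n_world rows.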