use HiGHS to solve the allocation program

Lizonghang 2025-01-15 10:04:04 +04:00
parent b577c10d25
commit 5d9aadf3d5
6 changed files with 614 additions and 86 deletions

View file

@@ -264,11 +264,11 @@ MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11
ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -I/opt/homebrew/include
MK_LDFLAGS += -L/opt/homebrew/lib -lzmq
MK_CPPFLAGS += -isystem /opt/homebrew/include -isystem /opt/homebrew/include/highs
MK_LDFLAGS += -L/opt/homebrew/lib -lzmq -lhighs
else ifeq ($(UNAME_S),Linux)
MK_CPPFLAGS += -I/usr/local/include
MK_LDFLAGS += -L/usr/local/lib -lzmq
MK_CPPFLAGS += -isystem /usr/local/include -isystem /usr/local/include/highs
MK_LDFLAGS += -L/usr/local/lib -lzmq -lhighs
endif
ifdef LLAMA_NO_CCACHE
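A quick way to confirm that the new include and link flags actually resolve HiGHS is to build a one-variable LP against them. The snippet below is only an illustrative sketch (the file name and build line are assumptions, not part of this commit); it uses the same HiGHS entry points that the allocation code further down relies on.

// highs_check.cpp - minimal sanity check that "Highs.h" and -lhighs resolve (illustrative only)
// assumed build on macOS: c++ -std=c++11 -isystem /opt/homebrew/include/highs highs_check.cpp -L/opt/homebrew/lib -lhighs
#include "Highs.h"
#include <cstdio>

int main() {
    HighsModel model;
    model.lp_.num_col_ = 1;                                   // a single variable x
    model.lp_.num_row_ = 0;                                   // no constraints
    model.lp_.sense_   = ObjSense::kMinimize;                 // minimize x
    model.lp_.col_cost_  = {1.0};
    model.lp_.col_lower_ = {2.0};                             // 2 <= x <= 10, optimum at x = 2
    model.lp_.col_upper_ = {10.0};
    model.lp_.a_matrix_.format_ = MatrixFormat::kColwise;
    model.lp_.a_matrix_.start_  = {0, 0};                     // empty constraint matrix
    Highs highs;
    highs.setOptionValue("log_to_console", false);
    if (highs.passModel(model) != HighsStatus::kOk) return 1;
    if (highs.run() != HighsStatus::kOk) return 1;
    printf("objective = %.1f\n", highs.getInfo().objective_function_value); // expect 2.0
    return 0;
}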

View file

@@ -9,6 +9,7 @@
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "Highs.h"
#include <algorithm>
#include <cinttypes>
@@ -28,8 +29,6 @@
#include <vector>
#include <thread>
#define DEFAULT_N_LAYER_WINDOW 4
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#include <sys/sysctl.h>
@@ -72,6 +71,8 @@
using json = nlohmann::ordered_json;
constexpr int GIGABYTE = 1024 * 1024 * 1024;
//
// CPU utils
//
@@ -364,11 +365,6 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
return true;
}
template <size_t N>
void copy_n_layer_window(const uint32_t (&source)[N], uint32_t * destination) {
std::copy(std::begin(source), std::end(source), destination);
}
void gpt_init() {
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
@@ -822,30 +818,531 @@ std::string fs_get_cache_file(const std::string & filename) {
return cache_directory + filename;
}
//
// Model utils
//
static void llama_assign_n_layer_window(
static void assign_device(
uint32_t n_world,
uint32_t my_rank,
const device_info * dev_info_set,
uint32_t * n_layer_window,
struct llama_model * model) {
uint32_t * n_gpu_layers,
struct llama_model * model,
const struct llama_context_params cparams,
float min_disk_read_speed = 0.1f) { // minimum disk read speed in GB/s (0.1 GB/s = 100 MB/s)
GGML_ASSERT(dev_info_set != nullptr);
GGML_ASSERT(n_layer_window != nullptr);
GGML_ASSERT(my_rank == 0);
uint32_t n_layer = llama_model_n_layers(model);
// if only 1 device, it is assigned all layers
const uint32_t n_layer = llama_model_n_layers(model);
if (n_world == 1) {
n_layer_window[0] = n_layer;
return;
}
(void)my_rank;
const device_info &master = dev_info_set[0];
std::fill_n(n_layer_window, n_world, DEFAULT_N_LAYER_WINDOW);
// model-specific constants
const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
const int n_kv = 16;
const int64_t b = dev_info_set[0].model_bytes.nb_layer;
const int64_t bi = dev_info_set[0].model_bytes.nb_input;
const int64_t bo = dev_info_set[0].model_bytes.nb_output;
const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;
// device-specific constants
std::vector<float> alpha(n_world, 0.0f);
std::vector<float> beta(n_world, 0.0f);
std::vector<float> xi(n_world, 0.0f);
float kappa = 0.0f;
std::vector<int> w(n_world, 0);
std::vector<int> n(n_world, 0);
std::vector<float> mem_budget(n_world, 0.0f);
// -------- Compute alpha[m], beta[m], xi[m] --------
for (uint32_t m = 0; m < n_world; ++m) {
// alpha[m]
const device_info &dev = dev_info_set[m];
float t_calc_cpu = (
master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms
float t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
alpha[m] = t_calc_cpu + t_kv_cpy_cpu + t_read_ram_cpu; // in ms
// beta[m]
float t_calc_gpu = 0.0;
float t_kv_cpy_gpu = 0.0;
float t_read_ram_gpu = 0.0;
if (dev.gpu_support.metal) {
t_calc_gpu = (
master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
} else if (dev.gpu_support.cuda) {
t_calc_gpu = (
master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
}
beta[m] = t_calc_gpu - t_calc_cpu + t_kv_cpy_gpu - t_kv_cpy_cpu + t_read_ram_gpu - t_read_ram_cpu; // in ms
// xi[m]
// the RAM<->VRAM transfer time and the communication time are each below 1 ms, so xi[m] is treated as negligible
xi[m] = 0.0;
}
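// (illustrative reading of the quantities above, not additional logic)
// alpha[m] approximates the per-layer latency on device m when a layer runs on the CPU,
// beta[m] is the change in that latency if the layer is offloaded to the GPU instead, and
// xi[m] collects the remaining per-device overhead. A window of w[m] layers with n[m] of
// them offloaded therefore costs roughly alpha[m] * w[m] + beta[m] * n[m] + xi[m] per pass,
// which is the linear objective handed to HiGHS below.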
// we adopt an iterative optimization approach. Initially, $w_m$ is set proportionally
// based on the available memory budget
// - $d_m^{\text{avail}}$ for macOS without Metal and Linux
// - $d_m^{\text{total}}$ for macOS with Metal
// - $d_m^{\text{avail}}+d_m^{\text{swapout}}$ for Android
// and $n_m$ is initialized to 0.
for (uint32_t m = 0; m < n_world; ++m) {
const device_info &dev = dev_info_set[m];
GGML_ASSERT(dev.device_os != nullptr);
bool is_macos = strcmp(dev.device_os, "macOS") == 0;
bool is_linux = strcmp(dev.device_os, "Linux") == 0;
bool is_android = strcmp(dev.device_os, "Android") == 0;
bool is_windows = strcmp(dev.device_os, "Windows") == 0;
GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
if ((is_macos && !dev.gpu_support.metal) || is_linux) {
mem_budget[m] = dev.memory.available_physical;
} else if (is_macos && dev.gpu_support.metal) {
mem_budget[m] = dev.memory.total_physical;
} else if (is_android) {
mem_budget[m] = dev.memory.available_physical + dev.memory.used_can_swap;
} else {
// todo: add support for other OS such as Windows
GGML_ASSERT(false && "Unsupported OS\n");
}
}
// initialize w_m proportionally to memory budget and n_m to 0
float total_mem_budget = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
for (uint32_t m = 0; m < n_world; ++m) {
w[m] = std::round(mem_budget[m] / total_mem_budget * n_layer);
n[m] = 0;
}
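// e.g. (hypothetical numbers): with n_layer = 32 and mem_budget = {16, 8, 8} GiB,
// total_mem_budget = 32 and w is initialized to {16, 8, 8} with n = {0, 0, 0};
// the solver below then refines both under the memory and factor constraints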
// stores the actual disk read bandwidth (GB/s) for each device
std::vector<float> disk_speed(n_world, 0.0f);
for (uint32_t m = 0; m < n_world; ++m) {
const device_info &dev = dev_info_set[m];
GGML_ASSERT(dev.device_os != nullptr);
bool is_linux = strcmp(dev.device_os, "Linux") == 0;
if (is_linux) {
disk_speed[m] = dev.disk.read_seq_bw;
} else {
disk_speed[m] = dev.disk.read_rnd_bw;
}
}
// helper function to find valid factors for a given n_layers
auto find_factors = [&](int n_layers) {
std::vector<int> factors;
for (int k = 1; k <= n_layers / 2; ++k) {
if (n_layers % k == 0) {
factors.push_back(k);
}
}
return factors;
};
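// e.g. find_factors(32) returns {1, 2, 4, 8, 16}, the candidate values of k for a 32-layer model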
// get valid factors
std::vector<int> valid_k = find_factors(n_layer);
// assign devices to sets M1, M2, M3, and M4
// M1: devices running on macOS without Metal, and with insufficient memory
// M2: devices running on macOS with Metal and insufficient memory
// M3: devices running on Linux or Android and with insufficient memory
// M4: devices with sufficient memory or very slow disk I/O (slower than min_disk_read_speed)
std::vector<uint32_t> M1, M2, M3, M4, M1_prev, M2_prev, M3_prev, M4_prev;
std::vector<int64_t> c_cpu(n_world, 0), c_gpu(n_world, 0);
// helper function to check if a device is in a specific set
auto in_set = [&](uint32_t m, const std::vector<uint32_t> & M) {
return (std::find(M.begin(), M.end(), m) != M.end());
};
auto assign_sets = [&](int k) -> bool {
M1.clear(), M2.clear(), M3.clear(), M4.clear();
for (uint32_t m = 0; m < n_world; ++m) {
const device_info &dev = dev_info_set[m];
GGML_ASSERT(dev.device_os != nullptr);
bool is_macos = strcmp(dev.device_os, "macOS") == 0;
bool is_linux = strcmp(dev.device_os, "Linux") == 0;
bool is_android = strcmp(dev.device_os, "Android") == 0;
bool is_windows = strcmp(dev.device_os, "Windows") == 0;
GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, dev.gpu_support.metal, m == 0, w[m] * k, n[m] * k);
int l_m = w[m] * k; // total number of layers assigned to device m
int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
bool condition1 = l_m * b + (bi + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] > mem_budget[m] * GIGABYTE;
bool condition2 = l_m * b + (bi + bo) * int(m == 0) + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv * l_m + c_cpu[m] + c_gpu[m] > mem_budget[m] * GIGABYTE;
bool condition3 = (l_m - l_m_gpu) * b_prime + (bi + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
bool is_slow_disk = disk_speed[m] < min_disk_read_speed;
if (is_macos && !dev.gpu_support.metal && condition1 && !is_slow_disk) {
// case 1: macOS without Metal, and with insufficient memory
M1.push_back(m);
} else if (is_macos && dev.gpu_support.metal && condition2 && !is_slow_disk) {
// case 2: macOS with Metal, and with insufficient memory
M2.push_back(m);
} else if ((is_linux || is_android) && condition3 && !is_slow_disk) {
// case 3: Linux or Android with insufficient memory
M3.push_back(m);
} else {
// case 4: otherwise, assigned to M4
M4.push_back(m);
}
}
// check whether the sets have changed
bool sets_changed = (M1 != M1_prev || M2 != M2_prev || M3 != M3_prev || M4 != M4_prev);
// update the previous sets
M1_prev = M1, M2_prev = M2, M3_prev = M3, M4_prev = M4;
return sets_changed;
};
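// note: the outer loop below re-solves the allocation until this partition stabilizes,
// i.e. until assign_sets() reports that M1-M4 are unchanged between two iterations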
// helper function to print a matrix
auto print_matrix = [](const std::vector<std::vector<double>>& matrix) {
for (const auto& row : matrix) {
for (const auto& elem : row) {
printf("%.3f ", elem);
}
printf("\n");
}
};
double final_objective = 1.0e30;
std::vector<double> final_solution;
int final_k = -1;
// iterative optimization to find a valid set assignment (M1, M2, M3, M4)
while (true) {
int W = std::accumulate(w.begin(), w.end(), 0);
int cur_k = (int)n_layer / W;
GGML_ASSERT(W > 1 && (int)n_layer % W == 0 && "Constraint: L = k * W must hold\n");
if (!assign_sets(cur_k)) break;
// update kappa
for (uint32_t m = 0; m < n_world; ++m) {
const device_info &dev = dev_info_set[m];
GGML_ASSERT(dev.device_os != nullptr);
bool is_android = strcmp(dev.device_os, "Android") == 0;
if (m == 0) {
kappa = (bi + bo) / (disk_speed[m] * 1e9) * 1000; // in ms
}
if (in_set(m, M3)) {
kappa += (c_cpu[m] - dev.memory.available_physical * GIGABYTE - dev.memory.used_can_swap * GIGABYTE * int(is_android)) / (disk_speed[m] * 1e9) * 1000; // in ms
}
}
// -------------------------------------------------------------
// Construct vectors va, vb, vc
// -------------------------------------------------------------
// a[m], b[m], c[m] are computed based on the device sets M1, M2, M3, and M4:
// - M1: a[m] = alpha[m] + b / s_m^{disk}, b[m] = 0, c[m] = xi[m]
// - M2: a[m] = alpha[m] + b / s_m^{disk}, b[m] = beta[m], c[m] = xi[m]
// - M3: a[m] = alpha[m] + b' / s_m^{disk}, b[m] = beta[m] - b'/ s_m^{disk}, c[m] = xi[m]
// - M4: a[m] = alpha[m], b[m] = beta[m], c[m] = xi[m]
std::vector<float> vec_a(n_world, 0.0f), vec_b(n_world, 0.0f), vec_c(n_world, 0.0f);
for (uint32_t m = 0; m < n_world; ++m) {
if (in_set(m, M1)) {
vec_a[m] = alpha[m] + b / (disk_speed[m] * 1e9) * 1000; // in ms
vec_b[m] = 0.0f;
vec_c[m] = xi[m];
} else if (in_set(m, M2)) {
vec_a[m] = alpha[m] + b / (disk_speed[m] * 1e9) * 1000; // in ms
vec_b[m] = beta[m];
vec_c[m] = xi[m];
} else if (in_set(m, M3)) {
vec_a[m] = alpha[m] + b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
vec_b[m] = beta[m] - b_prime / (disk_speed[m] * 1e9) * 1000; // in ms
vec_c[m] = xi[m];
} else {
vec_a[m] = alpha[m];
vec_b[m] = beta[m];
vec_c[m] = xi[m];
}
}
// -------------------------------------------------------------
// Construct vectors vz, vz_cuda
// -------------------------------------------------------------
// z and z_cuda are used to express memory constraints:
// for z:
// - M1: (d_m^{avail} - b_cio) / (L*b')
// - M2: (d_m^{total} - b_cio - c_gpu) / (L*b')
// - M3: (d_m^{avail}+d_m^{swapout} - b_cio) / (L*b')
// - M4: - (d_m^{avail} - b_cio) / (L*b') on macOS without Metal,
// or - (d_m^{total} - b_cio - c_gpu) / (L*b') on macOS with Metal,
// or - (d_m^{avail}+d_m^{swapout} - b_cio) / (L*b') on Linux or Android
//
// for z_cuda:
// - M1: (d_{m,cuda}^{avail} - c_gpu) / (L*b'),
// d_{m,cuda}^{avail} is non-zero only if the device supports CUDA
std::vector<float> vec_z(n_world, 0.0f), vec_z_cuda(n_world, 0.0f);
std::vector<int> dev_cuda(n_world, 0);
for (uint32_t m = 0; m < n_world; ++m) {
const device_info &dev = dev_info_set[m];
GGML_ASSERT(dev.device_os != nullptr);
bool is_macos = strcmp(dev.device_os, "macOS") == 0;
bool is_android = strcmp(dev.device_os, "Android") == 0;
bool is_windows = strcmp(dev.device_os, "Windows") == 0;
GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
int64_t b_cio = (bi + bo) * int(m == 0) + c_cpu[m];
if (in_set(m, M1)) {
vec_z[m] = (double)(dev.memory.available_physical * GIGABYTE - b_cio) / (double)(n_layer * b_prime);
} else if (in_set(m, M2)) {
vec_z[m] = (double)(dev.memory.total_physical * GIGABYTE - b_cio - c_gpu[m]) / (double)(n_layer * b_prime);
} else if (in_set(m, M3)) {
vec_z[m] = (double)(dev.memory.available_physical * GIGABYTE + dev.memory.used_can_swap * GIGABYTE * int(is_android) - b_cio) / (double)(n_layer * b_prime);
} else {
if (is_macos && !dev.gpu_support.metal) {
vec_z[m] = - (double)(dev.memory.available_physical * GIGABYTE - b_cio) / (double)(n_layer * b_prime);
} else if (is_macos && dev.gpu_support.metal) {
vec_z[m] = - (double)(dev.memory.total_physical * GIGABYTE - b_cio - c_gpu[m]) / (double)(n_layer * b_prime);
} else {
vec_z[m] = - (double)(dev.memory.available_physical * GIGABYTE + dev.memory.used_can_swap * GIGABYTE * int(is_android) - b_cio) / (double)(n_layer * b_prime);
}
}
if (dev.gpu_support.cuda) {
vec_z_cuda[m] = (double)(dev.gpu_props.memory_free * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
dev_cuda[m] = 1;
} else {
vec_z_cuda[m] = -(double)c_gpu[m] / (double)(n_layer * b_prime);
}
}
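// note: vec_z[m] normalizes each device's remaining memory (or its negation, for devices in M4)
// by L * b', so the RAM rows below can be written as linear constraints on w[m] and n[m]
// with right-hand side -W * vec_z[m]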
// count the number of cuda devices
int num_dev_cuda = std::accumulate(dev_cuda.begin(), dev_cuda.end(), 0);
// -------------------------------------------------------------
// Build and solve the optimization model
// -------------------------------------------------------------
double best_objective = 1.0e30;
std::vector<double> best_solution;
int best_k = -1;
// iterate over all possible values of k to find the best solution
for (int k : valid_k) {
GGML_ASSERT(n_layer % k == 0 && "Constraint: L = k * W must hold\n");
int W = n_layer / k;
HighsModel model;
// define the number of decision variables and constraints
model.lp_.num_col_ = n_world * 2; // number of decision variables
model.lp_.num_row_ = 1 + 2 * n_world + num_dev_cuda; // number of constraints
// define the objective: k * sum(a[m] * w[m] + b[m] * n[m]) + kappa + k * sum(c[m])
model.lp_.sense_ = ObjSense::kMinimize;
model.lp_.offset_ = k * std::accumulate(vec_c.begin(), vec_c.end(), 0.0f) + kappa;
model.lp_.col_cost_.clear();
std::copy(vec_a.begin(), vec_a.end(), std::back_inserter(model.lp_.col_cost_));
std::copy(vec_b.begin(), vec_b.end(), std::back_inserter(model.lp_.col_cost_));
std::transform(
model.lp_.col_cost_.begin(),
model.lp_.col_cost_.end(),
model.lp_.col_cost_.begin(), [k](double cost) {
return cost * k;
}
);
// define the variable bounds
model.lp_.col_lower_ = std::vector<double>(n_world * 2, 0.0);
std::fill(model.lp_.col_lower_.begin(), model.lp_.col_lower_.begin() + n_world, 1.0);
model.lp_.col_upper_ = std::vector<double>(n_world * 2, n_layer);
// define the constraint bounds
int constraint_idx = 0;
model.lp_.row_lower_ = std::vector<double>(model.lp_.num_row_, -1.0e30); // initialize to a large negative value
model.lp_.row_upper_ = std::vector<double>(model.lp_.num_row_, 1.0e30); // initialize to a large positive value
// constraint bound 1: sum(w[m]) = W
model.lp_.row_lower_[constraint_idx] = {(double)W};
model.lp_.row_upper_[constraint_idx] = {(double)W};
constraint_idx++;
// constraint bound 2: n[m] <= w[m], m = 1, 2, ..., n_world
std::fill_n(model.lp_.row_upper_.begin() + constraint_idx, n_world, 0.0); // constraint: -w[m] + n[m] <= 0.0
constraint_idx += n_world;
// constraint bound 3: RAM constraint for each device
for (uint32_t m = 0; m < n_world; ++m) {
model.lp_.row_upper_[constraint_idx + m] = -W * vec_z[m];
}
constraint_idx += n_world;
// constraint bound 4: CUDA memory constraint for CUDA devices
for (uint32_t m = 0; m < n_world; ++m) {
if (dev_cuda[m]) {
model.lp_.row_upper_[constraint_idx] = W * vec_z_cuda[m];
constraint_idx++;
}
}
// define the constraint matrix
const int n_rows = model.lp_.num_row_;
const int n_cols = model.lp_.num_col_;
std::vector<std::vector<double>> A(n_rows, std::vector<double>(n_cols, 0.0));
constraint_idx = 0;
// constraint coefficients 1: sum(w[m]) = W
std::fill_n(A[constraint_idx].begin(), n_world, 1.0);
constraint_idx++;
// constraint coefficients 2: n[m] <= w[m], m = 1, 2, ..., n_world
for (uint32_t m = 0; m < n_world; ++m) {
A[constraint_idx + m][m] = -1.0; // coefficient for w[m]
A[constraint_idx + m][m + n_world] = 1.0; // coefficient for n[m]
}
constraint_idx += n_world;
// constraint coefficients 3: RAM constraint for each device
for (uint32_t m = 0; m < n_world; ++m) {
const device_info &dev = dev_info_set[m];
GGML_ASSERT(dev.device_os != nullptr);
bool is_macos = strcmp(dev.device_os, "macOS") == 0;
int cons_row = constraint_idx + m;
if (in_set(m, M1) || in_set(m, M2)) { // in sets M1 and M2
A[cons_row][m] = -1.0; // coefficient for w[m]
A[cons_row][m + n_world] = 0.0; // coefficient for n[m]
} else if (in_set(m, M3)) { // in set M3
A[cons_row][m] = -1.0; // coefficient for w[m]
A[cons_row][m + n_world] = 1.0; // coefficient for n[m]
} else { // in set M4
A[cons_row][m] = 1.0; // coefficient for w[m]
if (is_macos) {
A[cons_row][m + n_world] = 0.0; // coefficient for n[m]
} else {
A[cons_row][m + n_world] = -1.0; // coefficient for n[m]
}
}
}
constraint_idx += n_world;
// constraint coefficients 4: CUDA memory constraint for CUDA devices
for (uint32_t m = 0; m < n_world; ++m) {
if (dev_cuda[m]) {
A[constraint_idx][m] = 0.0; // coefficient for w[m]
A[constraint_idx][m + n_world] = 1.0; // coefficient for n[m]
constraint_idx++;
}
}
// translate the constraint matrix A into the LP model
model.lp_.a_matrix_.format_ = MatrixFormat::kColwise;
model.lp_.a_matrix_.start_.resize(n_cols + 1);
model.lp_.a_matrix_.index_.clear();
model.lp_.a_matrix_.value_.clear();
int nnz_count = 0; // number of non-zero elements
for (int j = 0; j < n_cols; ++j) {
model.lp_.a_matrix_.start_[j] = nnz_count;
for (int i = 0; i < n_rows; ++i) {
if (A[i][j] != 0.0) {
model.lp_.a_matrix_.index_.push_back(i);
model.lp_.a_matrix_.value_.push_back(A[i][j]);
nnz_count++;
}
}
}
model.lp_.a_matrix_.start_[n_cols] = nnz_count;
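// start_/index_/value_ now hold A in compressed column-wise form, matching MatrixFormat::kColwise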
// integer constraints
model.lp_.integrality_ = std::vector<HighsVarType>(n_world * 2, HighsVarType::kInteger);
// solve the optimization problem
Highs highs;
highs.setOptionValue("log_to_console", false); // disable logging
HighsStatus return_status = highs.passModel(model);
GGML_ASSERT(return_status == HighsStatus::kOk && "Failed to pass model\n");
// run the solver
return_status = highs.run();
GGML_ASSERT(return_status == HighsStatus::kOk && "Failed to run the solver\n");
// get the solution
const HighsModelStatus& model_status = highs.getModelStatus();
if (model_status != HighsModelStatus::kOptimal) continue;
// record the best solution
const HighsSolution& solution = highs.getSolution();
double objective_value = highs.getInfo().objective_function_value;
if (objective_value < best_objective) {
best_objective = objective_value;
best_k = k;
best_solution = solution.col_value;
}
}
// update w[m] and n[m]
GGML_ASSERT(best_solution.size() == n_world * 2 && "Invalid solution\n");
std::copy(best_solution.begin(), best_solution.begin() + n_world, w.begin());
std::copy(best_solution.begin() + n_world, best_solution.end(), n.begin());
// update the global best solution
final_k = best_k;
final_objective = best_objective;
final_solution = best_solution;
}
LOG_INF("Global best solution found for k = %d\n", final_k);
for (uint32_t m = 0; m < n_world; ++m) {
const char * device_name = dev_info_set[m].device_name;
GGML_ASSERT(final_solution[m] == w[m] && final_solution[m + n_world] == n[m]);
LOG_INF("Device %s (m = %d): w = %d, n = %d\n", device_name, m, w[m], n[m]);
}
LOG_INF("Objective value: %.3f\n", final_objective);
// copy values from w and n into n_layer_window and n_gpu_layers, respectively
std::copy(w.begin(), w.end(), n_layer_window);
std::copy(n.begin(), n.end(), n_gpu_layers);
}
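For reference, the program assembled above for each candidate k can be written compactly as follows (a reconstruction from the code, using the symbols from the comments; $a_m$, $b_m$, $c_m$ and the RAM rows depend on which of M1-M4 each device falls into):

minimize    $k \sum_m (a_m w_m + b_m n_m) + k \sum_m c_m + \kappa$
subject to  $\sum_m w_m = W$
            $n_m \le w_m$ for every device m
            one RAM row per device, with set-dependent coefficients and bound $-W z_m$
            $n_m \le W z_m^{cuda}$ for every CUDA device m
            $1 \le w_m \le L$, $0 \le n_m \le L$, $w_m, n_m$ integer

The chosen k then fixes the layer counts via $l_m = k w_m$ layers on device m in total, of which $k n_m$ run on the GPU.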
//
// Model utils
//
struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
llama_init_result iparams;
auto mparams = llama_model_params_from_gpt_params(params);
@@ -914,30 +1411,40 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
dev_info_set[0] = dev_info;
llama_gather_device_info(lctx, dev_info_set);
device_print_props(dev_info_set, n_world, model, cparams);
} else {
llama_send_device_info(lctx, &dev_info);
}
uint32_t n_layer_window[32] = {0};
uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};
if (my_rank == 0) {
if (n_world == 1 || params.n_layer_window[0] == 0) {
llama_assign_n_layer_window(n_world, my_rank, dev_info_set, n_layer_window, model);
// automatically determine n_layer_window and n_gpu_layers
assign_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams);
} else {
copy_n_layer_window(params.n_layer_window, n_layer_window);
// use manually set n_layer_window
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), n_layer_window);
}
// synchronize the new n_layer_window to other nodes
llama_broadcast_n_layer_window(lctx, n_layer_window);
// synchronize the new n_layer_window and n_gpu_layers to other nodes
llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
} else {
llama_recv_n_layer_window(lctx, n_layer_window);
llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
}
// update n_layer_window
copy_n_layer_window(n_layer_window, params.n_layer_window);
copy_n_layer_window(n_layer_window, cparams.n_layer_window);
copy_n_layer_window(n_layer_window, mparams.n_layer_window);
copy_n_layer_window(n_layer_window, llama_context_n_layer_window(lctx));
// update n_layer_window and n_gpu_layers
std::copy(std::begin(n_layer_window), std::end(n_layer_window), params.n_layer_window);
std::copy(std::begin(n_layer_window), std::end(n_layer_window), cparams.n_layer_window);
std::copy(std::begin(n_layer_window), std::end(n_layer_window), mparams.n_layer_window);
std::copy(std::begin(n_layer_window), std::end(n_layer_window), llama_context_n_layer_window(lctx));
params.n_gpu_layers = n_gpu_layers[my_rank];
cparams.n_gpu_layers = n_gpu_layers[my_rank];
mparams.n_gpu_layers = n_gpu_layers[my_rank];
llama_context_n_gpu_layers(lctx)[my_rank] = n_gpu_layers[my_rank];
#ifdef LLAMA_DEBUG
device_print_props(dev_info_set, n_world, model, cparams);
#endif
if (!mparams.vocab_only && llm_load_tensors(ml, model, mparams) < 0) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());

View file

@@ -158,7 +158,7 @@ struct gpt_params {
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM (0 - no GPU offload by default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs

View file

@@ -1312,8 +1312,8 @@ static float device_memory_access_delay(struct device_info & dev_info, struct ll
auto n_bytes = dev_info.model_bytes;
int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
uint64_t cpu_kv_size;
uint64_t gpu_kv_size;
int64_t cpu_kv_size;
int64_t gpu_kv_size;
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
@@ -1428,17 +1428,17 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
cpu_total_bytes += n_bytes.nb_output;
uint64_t cpu_kv_size;
uint64_t gpu_kv_size;
uint64_t cpu_compute_buf;
uint64_t gpu_compute_buf;
int64_t cpu_kv_size;
int64_t gpu_kv_size;
int64_t cpu_compute_buf;
int64_t gpu_compute_buf;
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true);
llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_layers, n_gpu_layers);
#else
llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false);
llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_layers, n_gpu_layers);
#endif
double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB

View file

@@ -446,8 +446,8 @@ extern "C" {
LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
LLAMA_API int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
LLAMA_API int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
LLAMA_API int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window);
LLAMA_API int llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
LLAMA_API int llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
LLAMA_API int llm_load_tensors(
struct llama_model_loader * ml,
@@ -465,6 +465,8 @@ extern "C" {
LLAMA_API uint32_t * llama_context_n_layer_window(struct llama_context * ctx);
LLAMA_API uint32_t * llama_context_n_gpu_layers(struct llama_context * ctx);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
@@ -536,11 +538,14 @@ extern "C" {
// Return the size of compute buffer size, including input tensors and activations
LLAMA_API void llama_model_compute_buf_size(
uint64_t * cpu_buf,
uint64_t * gpu_buf,
int64_t * cpu_buf,
int64_t * gpu_buf,
const struct llama_model * model,
const struct llama_context_params cparams,
bool use_gpu);
bool use_gpu,
bool is_master,
int n_layers,
int n_gpu_layers);
// Return the size of KV cache in the model
LLAMA_API void llama_total_kv_size(
@@ -551,8 +556,8 @@ extern "C" {
bool use_gpu);
LLAMA_API void llama_kv_size(
uint64_t * cpu_cache,
uint64_t * gpu_cache,
int64_t * cpu_cache,
int64_t * gpu_cache,
const struct llama_model * model,
const struct llama_context_params cparams,
bool use_gpu);

View file

@@ -19960,7 +19960,7 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
return 0;
}
int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
uint32_t n_world = ctx->cparams.n_world;
if (n_world == 1) {
return 0;
@@ -19973,6 +19973,9 @@ int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_laye
send_msgs.emplace_back("n_layer_window", strlen("n_layer_window"));
send_msgs.emplace_back(n_layer_window, sizeof(uint32_t) * 32);
send_msgs.emplace_back("n_gpu_layers", strlen("n_gpu_layers"));
send_msgs.emplace_back(n_gpu_layers, sizeof(uint32_t) * 32);
zmq::send_multipart(*ctx->send_socket, send_msgs);
} catch (const zmq::error_t& e) {
LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
@@ -19982,7 +19985,7 @@ int llama_broadcast_n_layer_window(struct llama_context * ctx, uint32_t * n_laye
return 0;
}
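With this change the layer setup travels as a single four-frame multipart message, ["n_layer_window" | 32 x uint32_t | "n_gpu_layers" | 32 x uint32_t]; the receiving side below checks the frame count, both keys, and both payload sizes before copying the arrays out.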
int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_window) {
int llama_recv_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
uint32_t n_world = ctx->cparams.n_world;
uint32_t my_rank = ctx->cparams.rank;
@@ -19991,15 +19994,20 @@ int llama_recv_n_layer_window(struct llama_context * ctx, uint32_t * n_layer_win
return -1;
}
std::string key = recv_msgs[0].to_string();
if (key != "n_layer_window") {
LLAMA_LOG_INFO("Unexpected message received: %s\n", key.c_str());
if (recv_msgs.size() != 4) { // expecting n_layer_window and n_gpu_layers
LLAMA_LOG_INFO("Unexpected number of messages received: %zu\n", recv_msgs.size());
return -1;
}
zmq::message_t & data_msg = recv_msgs[1];
GGML_ASSERT(data_msg.size() == sizeof(uint32_t) * 32);
memcpy(n_layer_window, data_msg.data(), sizeof(uint32_t) * 32);
if (recv_msgs[0].to_string() != "n_layer_window" || recv_msgs[2].to_string() != "n_gpu_layers") {
LLAMA_LOG_INFO("Unexpected message received\n");
return -1;
}
GGML_ASSERT(recv_msgs[1].size() == sizeof(uint32_t) * 32);
GGML_ASSERT(recv_msgs[3].size() == sizeof(uint32_t) * 32);
memcpy(n_layer_window, recv_msgs[1].data(), sizeof(uint32_t) * 32);
memcpy(n_gpu_layers, recv_msgs[3].data(), sizeof(uint32_t) * 32);
if (my_rank != n_world - 1) {
try {
@@ -20511,6 +20519,10 @@ uint32_t * llama_context_n_layer_window(struct llama_context * ctx) {
return ctx->cparams.n_layer_window;
}
uint32_t * llama_context_n_gpu_layers(struct llama_context * ctx) {
return ctx->cparams.n_gpu_layers;
}
void llama_free(struct llama_context * ctx) {
delete ctx;
}
@@ -20909,47 +20921,51 @@ static void count_n_bytes(struct model_bytes * n_bytes, enum profiler_layer_type
}
void llama_model_compute_buf_size(
uint64_t * cpu_buf,
uint64_t * gpu_buf,
int64_t * cpu_buf,
int64_t * gpu_buf,
const struct llama_model * model,
const struct llama_context_params cparams,
bool use_gpu) {
bool use_gpu,
bool is_master,
int n_layers,
int n_gpu_layers) {
const llama_hparams hparams = model->hparams;
// input tensors
const uint64_t n_inp_toks = cparams.n_ubatch;
const uint64_t n_inp_embd = hparams.n_embd * cparams.n_ubatch;
const int64_t n_inp_toks = cparams.n_ubatch;
const int64_t n_inp_embd = hparams.n_embd * cparams.n_ubatch;
// activations (see figures/memory-allocation-map-for-activations.png for detailed allocation)
const uint64_t n_bak_embd = hparams.n_embd * cparams.n_ubatch;
const uint64_t n_inp_pos = cparams.n_ubatch;
const uint64_t n_kq_mask = cparams.n_ctx * cparams.n_ubatch;
const uint64_t n_inp_out_ids = cparams.n_ubatch;
const uint64_t n_norm = hparams.n_embd * cparams.n_ubatch;
const uint64_t n_qcur = hparams.n_embd * cparams.n_ubatch * 2;
const uint64_t n_kq = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
const int64_t n_bak_embd = hparams.n_embd * cparams.n_ubatch;
const int64_t n_inp_pos = cparams.n_ubatch;
const int64_t n_kq_mask = cparams.n_ctx * cparams.n_ubatch;
const int64_t n_inp_out_ids = cparams.n_ubatch;
const int64_t n_norm = hparams.n_embd * cparams.n_ubatch;
const int64_t n_qcur = hparams.n_embd * cparams.n_ubatch * 2;
const int64_t n_kq = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
// outputs
const uint64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
const int64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
const int64_t n_output = hparams.n_vocab * cparams.n_ubatch;
// compute buffer size for input, each layer, and output
const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
const int64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
const int64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
n_inp_out_ids + n_norm + n_qcur + n_kq
) * ggml_type_size(GGML_TYPE_F32);
const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
const int64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
*cpu_buf = 0;
*gpu_buf = 0;
if (is_master) *cpu_buf = n_buf_inp + n_buf_out;
if (use_gpu) {
*gpu_buf = n_buf_act;
if (llama_model_n_layers(model) > cparams.n_gpu_layers) {
*cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
} else {
*cpu_buf = n_buf_inp + n_buf_out;
*gpu_buf += n_buf_act;
if (n_layers > n_gpu_layers) {
*cpu_buf += n_buf_act;
}
} else {
*gpu_buf = 0;
*cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
*cpu_buf += n_buf_act;
}
}
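Net effect of the new parameters, as read from the added lines: the input and output buffers are charged only to the master's CPU budget, the activation buffer always counts against the GPU when one is used, and it additionally counts against the CPU whenever some layers stay off the GPU (n_layers > n_gpu_layers) or no GPU is used at all.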
@@ -20973,8 +20989,8 @@ void llama_total_kv_size(
}
void llama_kv_size(
uint64_t * cpu_cache,
uint64_t * gpu_cache,
int64_t * cpu_cache,
int64_t * gpu_cache,
const struct llama_model * model,
const struct llama_context_params cparams,
bool use_gpu) {