From 261c88f05852c45f86dede40333aa47abc1b4587 Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Tue, 11 Feb 2025 09:49:17 +0400
Subject: [PATCH 01/18] skip tensors on CUDA in manage_graph_tensors

---
 src/llama.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 30226968..0a9eabb5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17776,10 +17776,19 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
     for (int i = 0; i < ggml_graph_n_leafs(cgraph); i++) {
         struct ggml_tensor * cur = ggml_graph_leaf(cgraph, i);
+
         if (strstr(cur->name, "weight") == nullptr || cur->data == nullptr) { continue; }
+        const char * backend_name = ggml_backend_buffer_name(cur->buffer);
+        if (backend_name) {
+            std::string lower_name(backend_name);
+            std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+                           [](unsigned char c) { return std::tolower(c); });
+            if (lower_name.find("cuda") != std::string::npos) continue;
+        }
+
         size_t size  = ggml_nbytes(cur);
         size_t first = reinterpret_cast<size_t>(cur->data);
         size_t last  = first + size;

From 24974a488c9ef98f0551d6fbacb197a8f7e33b76 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 11 Feb 2025 11:06:33 +0400
Subject: [PATCH 02/18] assume 10% of active pages can be compressed on macOS UMA

---
 common/profiler.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 96a1c701..550092b8 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -521,9 +521,9 @@ static uint64_t device_host_physical_memory(bool available) {
         // active pages compression has higher priority than releasing the clean mmap-ed pages
         // some of the active pages can be compressed to save memory for our mmap-ed model weights
         if (is_uma_arch()) {
-            // assume 30% of active pages can be compressed on macOS UMA (an empirical value)
+            // assume 10% of active pages can be compressed on macOS UMA (an empirical value)
             // because GPU is more likely to use the inactive memory
-            memory += vm_stats.active_count * 0.3 * page_size;
+            memory += vm_stats.active_count * 0.1 * page_size;
         } else {
             // assume 50% of active pages can be compressed on macOS NUMA (an empirical value)
             memory += vm_stats.active_count * 0.5 * page_size;

From 3dd3138207378b98d5388940975735e311f704d3 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 11 Feb 2025 17:00:17 +0400
Subject: [PATCH 03/18] ignore tensors already in page cache when prefetching

---
 src/llama.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 0a9eabb5..5ceefb10 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17723,14 +17723,12 @@ static bool is_tensor_loaded(struct ggml_tensor * tensor) {
     // align addr
     llama_mmap::align_range(&first, &last, page_size);
     size_t len = std::max(last - first, static_cast<size_t>(page_size));
-
-    // calculate the number of pages to check
-    size_t page_count = (len + page_size - 1) / page_size;
+    size_t page_count = len / page_size;

 #ifdef __APPLE__
     char * mincore_res = new char[page_count];
 #else
-    unsigned char *mincore_res = new unsigned char[page_count]; // use 'unsigned char' for Linux
+    unsigned char * mincore_res = new unsigned char[page_count]; // use 'unsigned char' for Linux
 #endif

     // call mincore to check if pages are resident in memory
@@ -17759,6 +17757,13 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
         if (strstr(cur->name, "weight") == nullptr || cur->data == nullptr) { continue; }
+        const char * backend_name = ggml_backend_buffer_name(cur->buffer);
+        if (backend_name) {
+            std::string lower_name(backend_name);
+            std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+                           [](unsigned char c) { return std::tolower(c); });
+            if (lower_name.find("cuda") != std::string::npos) continue;
+        }
         if (is_tensor_loaded(cur)) n_loaded++;
         n_total++;
     }
@@ -17789,6 +17794,8 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
             if (lower_name.find("cuda") != std::string::npos) continue;
         }

+        if (is_tensor_loaded(cur)) continue;
+
         size_t size  = ggml_nbytes(cur);
         size_t first = reinterpret_cast<size_t>(cur->data);
         size_t last  = first + size;

From 65ad14140a3875fc9d8191cf6eb695687cc32fb0 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 11 Feb 2025 17:10:11 +0400
Subject: [PATCH 04/18] do not check loaded tensors due to increased latency

---
 src/llama.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 5ceefb10..70a7195b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17794,8 +17794,6 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
             if (lower_name.find("cuda") != std::string::npos) continue;
         }

-        if (is_tensor_loaded(cur)) continue;
-
         size_t size  = ggml_nbytes(cur);
         size_t first = reinterpret_cast<size_t>(cur->data);
         size_t last  = first + size;

From 6a50d494d298fe8d6568e8e9f6fb1da1c1f68b73 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 11 Feb 2025 17:25:06 +0400
Subject: [PATCH 05/18] increase prefetch density

---
 src/llama.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 70a7195b..9f613743 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17823,13 +17823,16 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
     }

     for (const auto & segment : merged_segments) {
+        size_t prefetch_dense = 4;
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
         // force to prefetch data
         if (force && advice == POSIX_MADV_WILLNEED) {
             volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
-            for (size_t off = 0; off < len; off += page_size) {
-                (void)ptr[off];
+            for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
+                for (size_t i = 0; i < prefetch_dense; i++) {
+                    if (off + i * page_size < len) (void)ptr[off + i * page_size];
+                }
             }
         }
     }

From b163918b46ed67db86a8346a6bd8a4c2f31b1e32 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 00:17:33 +0400
Subject: [PATCH 06/18] disable prefetch in standalone mode

---
 src/llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 9f613743..c41ebb8c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18108,9 +18108,9 @@ static int llama_decode_internal(
                 timer(manage_graph_tensors);
                 int next_gf_id = (i + 1) % gf.size();
-                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, true);
+                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, n_world > 1);
                 if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
-                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, true);
+                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, n_world > 1);
                 }

                 if (cparams.unload && n_world > 1) {
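The first six patches all revolve around page-cache management for mmap-ed weights: mincore() reports which pages of a tensor are resident, posix_madvise(POSIX_MADV_WILLNEED) asks the kernel to read ahead, and touching every few pages forces the read to actually happen. Below is a self-contained sketch of that pattern; the function names, the 4-page touch stride, and the "at least half resident" threshold are illustrative choices, not code from the tree.

    // Illustrative sketch (not from the tree): check residency of a mapped range
    // with mincore(), then ask the kernel to prefetch it and touch every Nth page.
    #include <sys/mman.h>
    #include <unistd.h>
    #include <cstddef>
    #include <vector>

    static bool range_mostly_resident(void * addr, size_t len) {
        const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
        size_t first = (size_t) addr & ~(page_size - 1);                  // align down to a page
        size_t last  = ((size_t) addr + len + page_size - 1) & ~(page_size - 1);
        size_t n_pages = (last - first) / page_size;
    #ifdef __APPLE__
        std::vector<char> res(n_pages);                                   // macOS expects char *
    #else
        std::vector<unsigned char> res(n_pages);                          // Linux expects unsigned char *
    #endif
        if (mincore((void *) first, last - first, res.data()) != 0) return false;
        size_t resident = 0;
        for (auto r : res) resident += (r & 1);                           // bit 0 = page is in core
        return resident * 2 >= n_pages;                                   // "mostly" = at least half
    }

    static void prefetch_range(void * addr, size_t len, size_t stride_pages = 4) {
        const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
        size_t first = (size_t) addr & ~(page_size - 1);
        size_t last  = ((size_t) addr + len + page_size - 1) & ~(page_size - 1);
        posix_madvise((void *) first, last - first, POSIX_MADV_WILLNEED); // async read-ahead hint
        volatile char * p = (volatile char *) first;
        for (size_t off = 0; off < last - first; off += stride_pages * page_size) {
            (void) p[off];                                                // touch to force the page in
        }
    }

The platform split mirrors the diff above: macOS declares the mincore() vector as plain char while Linux uses unsigned char, which is why the patched code keeps the #ifdef.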
From ea0e655a8b1a3cea883a77e4f156ab74e9612b2e Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 16:55:21 +0400
Subject: [PATCH 07/18] disable force fetching

---
 src/llama.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index c41ebb8c..99bb6bc3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17827,14 +17827,14 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
         // force to prefetch data
-        if (force && advice == POSIX_MADV_WILLNEED) {
-            volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
-            for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
-                for (size_t i = 0; i < prefetch_dense; i++) {
-                    if (off + i * page_size < len) (void)ptr[off + i * page_size];
-                }
-            }
-        }
+        // if (force && advice == POSIX_MADV_WILLNEED && false) {
+        //     volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
+        //     for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
+        //         for (size_t i = 0; i < prefetch_dense; i++) {
+        //             if (off + i * page_size < len) (void)ptr[off + i * page_size];
+        //         }
+        //     }
+        // }
     }
 }

From 708b1d8c897e83d6030227520387c995c173b5d9 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 16:55:44 +0400
Subject: [PATCH 08/18] disable force fetching

---
 src/llama.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 99bb6bc3..55df72c5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17827,14 +17827,14 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
         // force to prefetch data
-        // if (force && advice == POSIX_MADV_WILLNEED && false) {
-        //     volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
-        //     for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
-        //         for (size_t i = 0; i < prefetch_dense; i++) {
-        //             if (off + i * page_size < len) (void)ptr[off + i * page_size];
-        //         }
-        //     }
-        // }
+        if (force && advice == POSIX_MADV_WILLNEED && false) {
+            volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
+            for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
+                for (size_t i = 0; i < prefetch_dense; i++) {
+                    if (off + i * page_size < len) (void)ptr[off + i * page_size];
+                }
+            }
+        }
     }
 }

From c84f9d29fe5852b73c89244a148f1f1dae64b940 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 17:04:41 +0400
Subject: [PATCH 09/18] use arg prefetch and remove arg unload

---
 common/arg.cpp    |  6 +++---
 common/common.cpp |  2 +-
 common/common.h   |  2 +-
 include/llama.h   |  2 +-
 src/llama.cpp     | 22 +++++++++-------------
 5 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 12c7788c..0820dbe3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -724,10 +724,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         }
     ).set_env("LLAMA_ARG_NEXT_NODE_IP"));
     add_opt(llama_arg(
-        {"--unload", "--unload-weight"},
-        format("whether to unload layer weights after use (default: %s)", params.unload ? "true" : "false"),
+        {"--prefetch"},
+        format("whether to prefetch layer weights (default: %s)", params.prefetch ? "true" : "false"),
         [](gpt_params & params) {
-            params.unload = true;
+            params.prefetch = true;
         }
     ).set_env("LLAMA_ARG_UNLOAD"));
     add_opt(llama_arg(
diff --git a/common/common.cpp b/common/common.cpp
index 765b64c1..447de272 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1714,7 +1714,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_world           = params.n_world;
     cparams.rank              = params.rank;
-    cparams.unload            = params.unload;
+    cparams.prefetch          = params.prefetch;
     cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers      = params.n_gpu_layers;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
diff --git a/common/common.h b/common/common.h
index 9ac200c1..25424612 100644
--- a/common/common.h
+++ b/common/common.h
@@ -147,7 +147,7 @@ struct gpt_params {
     uint32_t n_layer_window[32] = {0};         // layer window size on each node
     std::string master_ip       = "localhost"; // ip address of the master node
     std::string next_node_ip    = "localhost"; // ip address of my next node
-    bool unload                 = false;       // unload layer weights after use or not
+    bool prefetch               = false;       // prefetch layer weights
     bool keep_out_in_metal      = true;        // whether to keep output weights in metal memory, true by default
     int32_t gpu_mem             = 999.0;       // gpu memory to use, in GiB
     int32_t n_predict           = -1;          // new tokens to predict
diff --git a/include/llama.h b/include/llama.h
index b7c170ab..259cb2ea 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -322,7 +322,7 @@ extern "C" {
         uint32_t rank;              // my rank
         uint32_t n_layer_window[32];// number of layers to process in each compute
         uint32_t n_gpu_layers;      // number of layers to process on GPU
-        bool unload;                // whether to unload layer weights after use
+        bool prefetch;              // whether to prefetch layer weights
         bool keep_out_in_metal;     // whether to keep output weights in metal memory
         char * master_ip;           // ip address of the master node
         char * next_node_ip;        // ip address of the next node
diff --git a/src/llama.cpp b/src/llama.cpp
index 55df72c5..1977f79b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2571,7 +2571,7 @@ struct llama_cparams {
     uint32_t n_world;
     uint32_t rank;
     uint32_t n_layer_window[32];
-    bool unload;
+    bool prefetch;
     uint32_t n_ctx;           // context size used during inference
     uint32_t n_batch;
     uint32_t n_ubatch;
@@ -17770,7 +17770,7 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
     return float(n_loaded) / float(n_total) * 100.0f;
 }

-static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool force = false) {
+static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
     long page_size = sysconf(_SC_PAGESIZE);

     struct Segment {
@@ -17826,8 +17826,8 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
         size_t prefetch_dense = 4;
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
-        // force to prefetch data
-        if (force && advice == POSIX_MADV_WILLNEED && false) {
+        // force to prefetch data, disabled by default
+        if (advice == POSIX_MADV_WILLNEED && false) {
             volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
             for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
                 for (size_t i = 0; i < prefetch_dense; i++) {
@@ -18104,17 +18104,13 @@ static int llama_decode_internal(
             // overlap memory scheduling with other nodes' communication and computing
-            {
+            if (cparams.prefetch && n_world > 1) {
                 timer(manage_graph_tensors);
                 int next_gf_id = (i + 1) % gf.size();
-                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, n_world > 1);
+                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED);
                 if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
-                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, n_world > 1);
-                }
-
-                if (cparams.unload && n_world > 1) {
-                    manage_graph_tensors(sub_gf, POSIX_MADV_DONTNEED);
+                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED);
                 }
             }
         }
@@ -19837,7 +19833,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rank                       =*/ 0,
         /*.n_layer_window             =*/ {32},
         /*.n_gpu_layers               =*/ 0,
-        /*.unload                     =*/ false,
+        /*.prefetch                   =*/ false,
         /*.keep_out_in_metal          =*/ true,
         /*.master_ip                  =*/ nullptr,
         /*.next_node_ip               =*/ nullptr,
@@ -20265,7 +20261,7 @@ void * llama_context_setup_backend(
     auto & cparams = ctx->cparams;

     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
-    cparams.unload          = params.unload;
+    cparams.prefetch        = params.prefetch;
     cparams.n_seq_max       = std::max(1u, params.n_seq_max);
     cparams.n_threads       = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch;

From fdfaaecd5e4fbdcc04cc4c93bd3f2b061cd2758a Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 17:12:30 +0400
Subject: [PATCH 10/18] disable device profiler in standalone mode

---
 common/common.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 447de272..ec6a0d98 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1473,8 +1473,10 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     // get device profile
     LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
-
+    if (n_world > 1) {
+        llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    }
+
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
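Patches 07-10 retire the old --unload flow and thread a single --prefetch switch from the CLI into llama_context_params, gating both prefetching and device profiling on distributed runs (n_world > 1). Note that the surrounding context in common/arg.cpp still registers the option under the LLAMA_ARG_UNLOAD environment name; a hedged sketch of a fully renamed registration is shown below (LLAMA_ARG_PREFETCH is an assumption, it does not exist in the tree):

    add_opt(llama_arg(
        {"--prefetch"},
        format("whether to prefetch layer weights (default: %s)", params.prefetch ? "true" : "false"),
        [](gpt_params & params) {
            params.prefetch = true;
        }
    ).set_env("LLAMA_ARG_PREFETCH")); // hypothetical env name; the patch itself keeps LLAMA_ARG_UNLOAD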
From 630556bc1660e44d92e10c352916c1a1df0134fc Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Sat, 15 Feb 2025 17:23:19 +0400
Subject: [PATCH 11/18] fix default allocation strategy to avoid OOM

---
 common/common.cpp | 58 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index ec6a0d98..40603517 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -866,6 +866,10 @@ static bool assign_layers_to_device(
         return true;
     }

+    std::vector<uint32_t> w(n_world, 0);
+    std::vector<uint32_t> n(n_world, 0);
+    std::vector<float> mem_budget(n_world, 0.0f);
+
     const device_info &master = dev_info_set[0];

     // model-specific constants
@@ -879,14 +883,12 @@ static bool assign_layers_to_device(
     const int64_t bo      = dev_info_set[0].model_bytes.nb_output;
     const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;

+#if defined(USE_HIGHS)
     // device-specific constants
     std::vector<float> alpha(n_world, 0.0f);
     std::vector<float> beta(n_world, 0.0f);
     std::vector<float> xi(n_world, 0.0f);
     float kappa = 0.0f;
-    std::vector<uint32_t> w(n_world, 0);
-    std::vector<uint32_t> n(n_world, 0);
-    std::vector<float> mem_budget(n_world, 0.0f);

     // -------- Compute alpha[m], beta[m], xi[m] --------
     for (uint32_t m = 0; m < n_world; ++m) {
@@ -977,7 +979,6 @@ static bool assign_layers_to_device(
                              : std::min_element(mem_budget.begin(), mem_budget.end());
     w[std::distance(mem_budget.begin(), device)] += diff;

-#if defined(USE_HIGHS)
     // stores the actual read bandwidth (GB/s) for each device
     std::vector<float> disk_speed(n_world, 0.0f);
     for (uint32_t m = 0; m < n_world; ++m) {
@@ -1032,7 +1033,8 @@ static bool assign_layers_to_device(
         bool is_windows = strcmp(dev.device_os, "Windows") == 0;
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");

-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, dev.gpu_support.metal, m == 0, w[m] * k, n[m] * k);
+        bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);

         int l_m     = w[m] * k; // total number of layers assigned to device m
         int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1395,19 +1397,45 @@ static bool assign_layers_to_device(
     std::copy(n.begin(), n.end(), n_gpu_layers);

 #else
-    (void)bi;
-    (void)bo;
-    (void)kappa;
-    (void)cparams;
-    (void)min_disk_read_speed;
-    (void)n_vocab;
-    (void)GIGABYTE;
-
-    std::copy(w.begin(), w.end(), n_layer_window);

+    // assign layers according to RAM/VRAM
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
+        if (dev.gpu_support.metal || dev.gpu_support.cuda) {
+            mem_budget[m] = dev.gpu_props.memory_free;
+        } else {
+            mem_budget[m] = dev.memory.available_physical;
+        }
+    }
+
+    // initialize w_m proportionally to memory budget and n_m to 0
+    float total_mem_budget = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
+    for (uint32_t m = 0; m < n_world; ++m) {
+        w[m] = std::round(mem_budget[m] / total_mem_budget * n_layer);
+        n[m] = 0;
+    }
+    // adjust w[m] to ensure L mod W = 0
+    int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
+    auto device = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
+                             : std::min_element(mem_budget.begin(), mem_budget.end());
+    w[std::distance(mem_budget.begin(), device)] += diff;
+    std::copy(w.begin(), w.end(), n_layer_window);
+
+    std::vector<float> vec_z_gpu(n_world, 0.0f);
+    std::vector<int64_t> c_cpu(n_world, 0), c_gpu(n_world, 0);
+
+    for (uint32_t m = 0; m < n_world; ++m) {
+        const device_info & dev = dev_info_set[m];
+        bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);
+
         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
-            n_gpu_layers[m] = w[m];
+            int64_t required_mem  = w[m] * b_prime;
+            int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
+            if (required_mem <= available_mem) {
+                n_gpu_layers[m] = w[m];
+            } else {
+                n_gpu_layers[m] = available_mem / b_prime;
+            }
         }
     }

From 64c4a479805a4c018c9ed606080ccfdb2bc6c79c Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Sat, 15 Feb 2025 17:33:03 +0400
Subject: [PATCH 12/18] fix bugs and warnings

---
 common/common.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 40603517..bcd3c49a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -870,20 +870,20 @@ static bool assign_layers_to_device(
     std::vector<uint32_t> n(n_world, 0);
     std::vector<float> mem_budget(n_world, 0.0f);

-    const device_info &master = dev_info_set[0];
-
     // model-specific constants
     const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
     const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
-    const int n_vocab      = llama_n_vocab(model);
     const int n_kv         = cparams.n_ctx;
     const int64_t b       = dev_info_set[0].model_bytes.nb_layer;
-    const int64_t bi      = dev_info_set[0].model_bytes.nb_input;
-    const int64_t bo      = dev_info_set[0].model_bytes.nb_output;
     const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;

 #if defined(USE_HIGHS)
+    const device_info &master = dev_info_set[0];
+    const int n_vocab = llama_n_vocab(model);
+    const int64_t bi  = dev_info_set[0].model_bytes.nb_input;
+    const int64_t bo  = dev_info_set[0].model_bytes.nb_output;
+
     // device-specific constants
     std::vector<float> alpha(n_world, 0.0f);
     std::vector<float> beta(n_world, 0.0f);
     std::vector<float> xi(n_world, 0.0f);
@@ -1397,6 +1397,8 @@ static bool assign_layers_to_device(
     std::copy(n.begin(), n.end(), n_gpu_layers);

 #else
+    (void)min_disk_read_speed;
+
     // assign layers according to RAM/VRAM
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
@@ -1426,7 +1428,7 @@ static bool assign_layers_to_device(
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m], n[m]);

         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
             int64_t required_mem  = w[m] * b_prime;
             int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
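Without HiGHS, patches 11 and 12 fall back to splitting the model's layers in proportion to each device's free RAM or VRAM and then nudging a single device so the counts sum back to the layer total. A standalone sketch of that rounding scheme (names and types are illustrative, not taken from the tree):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // Split n_layer layers proportionally to each device's memory budget, then fix
    // the rounding drift so the counts sum back to exactly n_layer.
    static std::vector<uint32_t> split_layers(const std::vector<float> & mem_budget, uint32_t n_layer) {
        const float total = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
        std::vector<uint32_t> w(mem_budget.size(), 0);
        for (size_t m = 0; m < mem_budget.size(); ++m) {
            w[m] = (uint32_t) std::round(mem_budget[m] / total * n_layer);
        }
        // give the remainder to the largest budget, or take it from the smallest
        const int diff = (int) n_layer - (int) std::accumulate(w.begin(), w.end(), 0u);
        auto it = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
                             : std::min_element(mem_budget.begin(), mem_budget.end());
        const size_t target = (size_t) std::distance(mem_budget.begin(), it);
        w[target] = (uint32_t) ((int) w[target] + diff);
        return w;
    }

For instance, a 32-layer model on budgets of 18 GiB and 6 GiB rounds to 24 and 8 layers, which already sums to 32, so no adjustment is applied.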
From e64f237e04a3fc5fbf14dab802325871769f7cdc Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Sat, 15 Feb 2025 17:43:03 +0400
Subject: [PATCH 13/18] fix bugs in available_mem calculation

---
 common/common.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index bcd3c49a..f13a2e12 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1432,7 +1432,11 @@ static bool assign_layers_to_device(
         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
             int64_t required_mem  = w[m] * b_prime;
-            int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
+            int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
+            if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
+                available_mem -= bo;
+            }
+
             if (required_mem <= available_mem) {
                 n_gpu_layers[m] = w[m];
             } else {

From 8532d030f324a7acc6429a2c449386cc421255c7 Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Sat, 15 Feb 2025 18:10:11 +0400
Subject: [PATCH 14/18] fix bugs in bo

---
 common/common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index f13a2e12..01022a26 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -876,13 +876,13 @@ static bool assign_layers_to_device(
     const int n_kv         = cparams.n_ctx;
     const int64_t b       = dev_info_set[0].model_bytes.nb_layer;
+    const int64_t bo      = dev_info_set[0].model_bytes.nb_output;
     const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;

 #if defined(USE_HIGHS)
     const device_info &master = dev_info_set[0];
     const int n_vocab = llama_n_vocab(model);
     const int64_t bi  = dev_info_set[0].model_bytes.nb_input;
-    const int64_t bo  = dev_info_set[0].model_bytes.nb_output;

     // device-specific constants
     std::vector<float> alpha(n_world, 0.0f);
     std::vector<float> beta(n_world, 0.0f);
     std::vector<float> xi(n_world, 0.0f);

From 863393554a36182ce33ca39fcbb679f8b49c24ab Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Mon, 17 Feb 2025 18:54:18 +0400
Subject: [PATCH 15/18] add gpu underutilization calibration step

---
 common/common.cpp | 57 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 01022a26..35da8f2c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1013,6 +1013,7 @@ static bool assign_layers_to_device(
     // M3: devices running on Linux or Android and with insufficient memory
     // M4: devices with sufficient memory or very slow disk I/O (slower than min_disk_io_speed)
     std::vector<uint32_t> M1, M2, M3, M4, M1_prev, M2_prev, M3_prev, M4_prev;
+    std::vector<bool> M4_force(n_world, false);
    std::vector<int64_t> c_cpu(n_world, 0), c_gpu(n_world, 0);

     // helper function to check if a device is in a specific set
@@ -1043,18 +1044,16 @@ static bool assign_layers_to_device(
         bool condition3 = (l_m - l_m_gpu) * b_prime + (bi / n_vocab + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
         bool is_slow_disk = disk_speed[m] < min_disk_read_speed;

-        if (is_macos && !dev.gpu_support.metal && condition1 && !is_slow_disk) {
-            // case 1: macOS without Metal, and with insufficient memory
-            M1.push_back(m);
-        } else if (is_macos && dev.gpu_support.metal && condition2 && !is_slow_disk) {
-            // case 2: macOS with Metal, and with insufficient memory
-            M2.push_back(m);
-        } else if ((is_linux || is_android) && condition3 && !is_slow_disk) {
-            // case 3: Linux with insufficient memory
-            M3.push_back(m);
+        if (M4_force[m] || is_slow_disk) {
+            M4.push_back(m); // case 4: devices with very slow disk or force to be in M4
+        } else if (is_macos && !dev.gpu_support.metal && condition1) {
+            M1.push_back(m); // case 1: macOS without Metal, and with insufficient memory
+        } else if (is_macos && dev.gpu_support.metal && condition2) {
+            M2.push_back(m); // case 2: macOS with Metal, and with insufficient memory
+        } else if ((is_linux || is_android) && condition3) {
+            M3.push_back(m); // case 3: Linux with insufficient memory
         } else {
-            // case 4: otherwise, assigned to M4
-            M4.push_back(m);
+            M4.push_back(m); // case 4: devices with sufficient memory
         }
     }
@@ -1255,7 +1254,8 @@ static bool assign_layers_to_device(

     // constraint bound 4: CUDA/shared memory constraint for CUDA/Metal devices
     for (uint32_t m = 0; m < n_world; ++m) {
-        model.lp_.row_upper_[constraint_idx] = W * vec_z_gpu[m];
+        double upper_bound = W * vec_z_gpu[m];
+        model.lp_.row_upper_[constraint_idx] = (upper_bound > 0) ? std::max(upper_bound, 1.0) : upper_bound;
         constraint_idx++;
     }
@@ -1361,6 +1361,39 @@ static bool assign_layers_to_device(
                 k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str());
         }
+        // check the solution
+        bool is_set_suboptimal = false;
+        for (uint32_t m = 0; m < n_world; ++m) {
+            uint32_t w_m = best_solution[m], n_m = best_solution[m + n_world];
+            // if w[m] > n[m] and there is still free VRAM, the GPU is not fully utilized,
+            // indicating that the memory constraints are too strict, and the set assignment is suboptimal.
+            if (w_m > n_m && n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) {
+                is_set_suboptimal = true;
+            }
+        }
+
+        if (is_set_suboptimal) {
+            int worst_device = -1;
+            float worst_speed = std::numeric_limits<float>::max();
+
+            // find the device with slowest disk speed but was not in M4 yet
+            for (uint32_t m = 0; m < n_world; ++m) {
+                if (!in_set(m, M4) && disk_speed[m] < worst_speed) {
+                    worst_speed = disk_speed[m];
+                    worst_device = m;
+                }
+            }
+
+            if (worst_device != -1) {
+                M4_force[worst_device] = true;
+                LOG_INF("Forcing device %d (disk speed %.2f GB/s) into M4\n", worst_device, worst_speed);
+            } else {
+                LOG_INF("Infeasible solution detected but no device can be forced into M4\n");
+            }
+
+            continue;
+        }
+
         // update w[m] and n[m]
         GGML_ASSERT(best_solution.size() == n_world * 2 && "Invalid solution\n");
         std::copy(best_solution.begin(), best_solution.begin() + n_world, w.begin());
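Patch 15 wraps the HiGHS solve in a calibration loop: when the returned assignment keeps layers off the GPU even though some GPU memory is still free, the device with the slowest disk that has not yet been forced into M4 is forced there and the solve is repeated. A minimal sketch of that selection step is below; the boolean vector stands in for the patch's in_set(m, M4) membership test, and the names are illustrative.

    #include <cstdint>
    #include <limits>
    #include <vector>

    // Return the index of the slowest-disk device not yet forced into M4,
    // or -1 if every device has already been forced.
    static int pick_device_to_force(const std::vector<float> & disk_speed,
                                    const std::vector<bool>  & m4_force) {
        int   worst_device = -1;
        float worst_speed  = std::numeric_limits<float>::max();
        for (size_t m = 0; m < disk_speed.size(); ++m) {
            if (!m4_force[m] && disk_speed[m] < worst_speed) {
                worst_speed  = disk_speed[m];
                worst_device = (int) m;
            }
        }
        return worst_device;
    }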
From e219fada4e52f4071469cf6ae62d6766512cc94e Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 19 Feb 2025 16:24:12 +0400
Subject: [PATCH 16/18] disable timer

---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 1977f79b..2aeb0c69 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -106,7 +106,7 @@ struct Timer {
     const char * name;
     int64_t start_time;
-    bool enable_timer = true;
+    bool enable_timer = false;

     Timer(const char * name) : name(name), start_time(ggml_time_us()) {}
     ~Timer() {
         if (enable_timer) {

From 07a397360b420b6c08c7cf26ef1da6aab0c291c4 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 19 Feb 2025 16:30:18 +0400
Subject: [PATCH 17/18] fix gpu underutilization

---
 common/common.cpp | 23 ++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 35da8f2c..deaffc71 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1362,17 +1362,22 @@ static bool assign_layers_to_device(
         }
         // check the solution
-        bool is_set_suboptimal = false;
+        bool has_free_gpu_memory = false, has_overload = false;
         for (uint32_t m = 0; m < n_world; ++m) {
             uint32_t w_m = best_solution[m], n_m = best_solution[m + n_world];
-            // if w[m] > n[m] and there is still free VRAM, the GPU is not fully utilized,
-            // indicating that the memory constraints are too strict, and the set assignment is suboptimal.
-            if (w_m > n_m && n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) {
-                is_set_suboptimal = true;
-            }
+
+            // if there is still free GPU memory
+            if (n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) {
+                has_free_gpu_memory = true;
+            }
+
+            // if there is device overloaded
+            if (w_m > n_m) {
+                has_overload = true;
+            }
         }

-        if (is_set_suboptimal) {
+        if (has_free_gpu_memory && has_overload) {
             int worst_device = -1;
             float worst_speed = std::numeric_limits<float>::max();

             // find the device with slowest disk speed but was not in M4 yet
@@ -1422,8 +1427,8 @@ static bool assign_layers_to_device(
         LOG_INF(" - N Layer Window : %d\n", w[m]);
         LOG_INF(" - N GPU Layers : %d\n", n[m]);
     }
-    LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
-    LOG_INF("------------------------------------------");
+    // LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
+    // LOG_INF("------------------------------------------");

     // copy value from w and n to n_layer_window and n_gpu_layers, respectively
     std::copy(w.begin(), w.end(), n_layer_window);

From f5e874f75f722dbd498c9a7b198d6217f46e6388 Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Sun, 23 Feb 2025 01:38:13 +0400
Subject: [PATCH 18/18] remove conda path

---
 Makefile | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Makefile b/Makefile
index d2f459f6..a3f1bf04 100644
--- a/Makefile
+++ b/Makefile
@@ -274,15 +274,10 @@
 ifeq ($(USE_HIGHS),1)
     HIGHS_CPPFLAGS = -isystem /usr/local/include/highs
     HIGHS_LDFLAGS  = -L/usr/local/lib -lhighs
-
     ifeq ($(UNAME_S),Darwin)
         HIGHS_CPPFLAGS += -isystem /opt/homebrew/include/highs
         HIGHS_LDFLAGS  += -L/opt/homebrew/lib -lhighs
-    else ifneq ($(CONDA_PREFIX),)
-        HIGHS_CPPFLAGS += -isystem $(CONDA_PREFIX)/include -isystem $(CONDA_PREFIX)/include/highs
-        HIGHS_LDFLAGS  += -L$(CONDA_PREFIX)/lib -Wl,-rpath,$(CONDA_PREFIX)/lib
     endif
-
     MK_CPPFLAGS += $(HIGHS_CPPFLAGS) -DUSE_HIGHS
     MK_LDFLAGS  += $(HIGHS_LDFLAGS)
 endif
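Looking back at patch 17, the re-solve trigger is relaxed from "a device is overloaded and its own GPU still has room" to "any device still has free GPU memory while any device is overloaded", so spare VRAM on one node can absorb layers that would otherwise spill to another node's CPU or disk. A compact sketch of the relaxed check, operating on plain vectors rather than the solver's solution object (illustrative only; W and z_gpu follow the meaning they have in the patches):

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // w[m] = layers assigned to device m, n[m] = layers of those placed on its GPU,
    // z_gpu[m] = fraction of the layer window W that fits in device m's GPU memory.
    static bool needs_recalibration(const std::vector<uint32_t> & w,
                                    const std::vector<uint32_t> & n,
                                    const std::vector<float>    & z_gpu,
                                    uint32_t W) {
        bool has_free_gpu_memory = false;
        bool has_overload        = false;
        for (size_t m = 0; m < w.size(); ++m) {
            if (n[m] < (uint32_t) std::round(W * z_gpu[m])) has_free_gpu_memory = true; // VRAM left over
            if (w[m] > n[m])                                has_overload        = true; // layers fall back to CPU/disk
        }
        return has_free_gpu_memory && has_overload;
    }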