From c84f9d29fe5852b73c89244a148f1f1dae64b940 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 17:04:41 +0400
Subject: [PATCH] use arg prefetch and remove arg unload

---
 common/arg.cpp    |  6 +++---
 common/common.cpp |  2 +-
 common/common.h   |  2 +-
 include/llama.h   |  2 +-
 src/llama.cpp     | 22 +++++++++-------------
 5 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 12c7788c..0820dbe3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -724,10 +724,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         }
     ).set_env("LLAMA_ARG_NEXT_NODE_IP"));
     add_opt(llama_arg(
-        {"--unload", "--unload-weight"},
-        format("whether to unload layer weights after use (default: %s)", params.unload ? "true" : "false"),
+        {"--prefetch"},
+        format("whether to prefetch layer weights (default: %s)", params.prefetch ? "true" : "false"),
         [](gpt_params & params) {
-            params.unload = true;
+            params.prefetch = true;
         }
     ).set_env("LLAMA_ARG_UNLOAD"));
     add_opt(llama_arg(
diff --git a/common/common.cpp b/common/common.cpp
index 765b64c1..447de272 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1714,7 +1714,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 
     cparams.n_world           = params.n_world;
     cparams.rank              = params.rank;
-    cparams.unload            = params.unload;
+    cparams.prefetch          = params.prefetch;
     cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers      = params.n_gpu_layers;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
diff --git a/common/common.h b/common/common.h
index 9ac200c1..25424612 100644
--- a/common/common.h
+++ b/common/common.h
@@ -147,7 +147,7 @@ struct gpt_params {
     uint32_t n_layer_window[32] = {0};       // layer window size on each node
     std::string master_ip = "localhost";     // ip address of the master node
     std::string next_node_ip = "localhost";  // ip address of my next node
-    bool unload = false;                     // unload layer weights after use or not
+    bool prefetch = false;                   // prefetch layer weights
     bool keep_out_in_metal = true;           // whether to keep output weights in metal memory, true by default
     int32_t gpu_mem = 999.0;                 // gpu memory to use, in GiB
     int32_t n_predict = -1;                  // new tokens to predict
diff --git a/include/llama.h b/include/llama.h
index b7c170ab..259cb2ea 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -322,7 +322,7 @@ extern "C" {
         uint32_t rank;               // my rank
         uint32_t n_layer_window[32]; // number of layers to process in each compute
         uint32_t n_gpu_layers;       // number of layers to process on GPU
-        bool     unload;             // whether to unload layer weights after use
+        bool     prefetch;           // whether to prefetch layer weights
         bool     keep_out_in_metal;  // whether to keep output weights in metal memory
         char *   master_ip;          // ip address of the master node
         char *   next_node_ip;       // ip address of the next node
diff --git a/src/llama.cpp b/src/llama.cpp
index 55df72c5..1977f79b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2571,7 +2571,7 @@ struct llama_cparams {
     uint32_t n_world;
     uint32_t rank;
     uint32_t n_layer_window[32];
-    bool     unload;
+    bool     prefetch;
     uint32_t n_ctx;    // context size used during inference
     uint32_t n_batch;
     uint32_t n_ubatch;
@@ -17770,7 +17770,7 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
     return float(n_loaded) / float(n_total) * 100.0f;
 }
 
-static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool force = false) {
+static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
     long page_size = sysconf(_SC_PAGESIZE);
 
     struct Segment {
@@ -17826,8 +17826,8 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
         size_t prefetch_dense = 4;
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice);  // hint to load into memory
-        // force to prefetch data
-        if (force && advice == POSIX_MADV_WILLNEED && false) {
+        // force to prefetch data, disabled by default
+        if (advice == POSIX_MADV_WILLNEED && false) {
             volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
             for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
                 for (size_t i = 0; i < prefetch_dense; i++) {
@@ -18104,17 +18104,13 @@ static int llama_decode_internal(
         }
 
         // overlap memory scheduling with other nodes' communication and computing
-        {
+        if (cparams.prefetch && n_world > 1) {
             timer(manage_graph_tensors);
             int next_gf_id = (i + 1) % gf.size();
-            manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, n_world > 1);
+            manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED);
 
             if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
-                manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, n_world > 1);
-            }
-
-            if (cparams.unload && n_world > 1) {
-                manage_graph_tensors(sub_gf, POSIX_MADV_DONTNEED);
+                manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED);
             }
         }
     }
@@ -19837,7 +19833,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rank              =*/ 0,
         /*.n_layer_window    =*/ {32},
         /*.n_gpu_layers      =*/ 0,
-        /*.unload            =*/ false,
+        /*.prefetch          =*/ false,
         /*.keep_out_in_metal =*/ true,
         /*.master_ip         =*/ nullptr,
         /*.next_node_ip      =*/ nullptr,
@@ -20265,7 +20261,7 @@ void * llama_context_setup_backend(
    auto & cparams = ctx->cparams;
 
    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
-    cparams.unload           = params.unload;
+    cparams.prefetch         = params.prefetch;
    cparams.n_seq_max        = std::max(1u, params.n_seq_max);
    cparams.n_threads        = params.n_threads;
    cparams.n_threads_batch  = params.n_threads_batch;
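
Note: the sketch below is illustrative and not part of the patch. It shows the page-aligned posix_madvise(POSIX_MADV_WILLNEED) hint that manage_graph_tensors issues per tensor segment; prefetch_region and its parameters are hypothetical names, not symbols from this codebase.

    #include <unistd.h>
    #include <sys/mman.h>
    #include <cstdint>
    #include <algorithm>

    // Ask the kernel to page in an mmap-backed weight region ahead of use.
    static void prefetch_region(void * data, size_t size) {
        long page_size = sysconf(_SC_PAGESIZE);
        // posix_madvise operates on whole pages: round the start down to a
        // page boundary and cover at least one page.
        uintptr_t start = reinterpret_cast<uintptr_t>(data) & ~static_cast<uintptr_t>(page_size - 1);
        size_t    len   = std::max<size_t>(reinterpret_cast<uintptr_t>(data) + size - start, page_size);
        posix_madvise(reinterpret_cast<void *>(start), len, POSIX_MADV_WILLNEED);
    }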