From bcfdace59bd577fc955935165208280d8253e45e Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Tue, 11 Mar 2025 20:44:36 +0400
Subject: [PATCH] add args -k and --force

---
 common/arg.cpp    | 14 ++++++++++++++
 common/common.cpp |  4 +++-
 common/common.h   |  2 ++
 include/llama.h   |  2 ++
 src/llama.cpp     | 10 ++++++----
 5 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 0820dbe3..d53a09f8 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -737,6 +737,20 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.gpu_mem = value; // in GiB
         }
     ).set_env("LLAMA_ARG_CUDA_MEM"));
+    add_opt(llama_arg(
+        {"-k", "--n-cycles"}, "N",
+        format("number of cycles to output one token (default: %d)", params.n_cycles),
+        [](gpt_params & params, int value) {
+            params.n_cycles = value;
+        }
+    ).set_env("LLAMA_ARG_K"));
+    add_opt(llama_arg(
+        {"--force"},
+        format("force to start prefetching after computation (default: %s)", params.force ? "true" : "false"),
+        [](gpt_params & params) {
+            params.force = true;
+        }
+    ).set_env("LLAMA_ARG_FORCE"));
     // #ifdef GGML_USE_METAL
     //     // warn: if the output layer weights are not kept in metal shared memory, its mmap-ed weight data
     //     // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency.
diff --git a/common/common.cpp b/common/common.cpp
index ae7dd883..2b80284e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1012,7 +1012,7 @@ static bool assign_layers_to_device(
     };
 
     // get valid factors
-    std::vector<int> valid_k = find_factors(n_layer);
+    std::vector<int> valid_k = cparams.n_cycles > 0 ? std::vector<int>{(int)cparams.n_cycles} : find_factors(n_layer);
 
     // assign devices to sets M1, M2, M3, and M4
     // M1: devices running on macOS without Metal, and with insufficient memory
@@ -1801,8 +1801,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_world           = params.n_world;
     cparams.rank              = params.rank;
     cparams.prefetch          = params.prefetch;
+    cparams.force             = params.force;
     cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers      = params.n_gpu_layers;
+    cparams.n_cycles          = params.n_cycles;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
 
     if (cparams.master_ip != nullptr) {
diff --git a/common/common.h b/common/common.h
index 25424612..0a679213 100644
--- a/common/common.h
+++ b/common/common.h
@@ -149,7 +149,9 @@ struct gpt_params {
     std::string next_node_ip  = "localhost"; // ip address of my next node
     bool prefetch             = false; // prefetch layer weights
     bool keep_out_in_metal    = true;  // whether to keep output weights in metal memory, true by default
+    bool force                = false; // force to start prefetching after computation
     int32_t gpu_mem           = 999.0; // gpu memory to use, in GiB
+    int32_t n_cycles          = 0;     // number of cycles to output one token
    int32_t n_predict         = -1;    // new tokens to predict
     int32_t n_ctx             = 0;     // context size
     int32_t n_batch           = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/include/llama.h b/include/llama.h
index 356b0fbf..5c14d2a3 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -324,7 +324,9 @@ extern "C" {
         uint32_t rank;               // my rank
         uint32_t n_layer_window[32]; // number of layers to process in each compute
         uint32_t n_gpu_layers;       // number of layers to process on GPU
+        uint32_t n_cycles;           // number of cycles to output one token
         bool prefetch;               // whether to prefetch layer weights
+        bool force;                  // force to start prefetching after computation
         bool keep_out_in_metal;      // whether to keep output weights in metal memory
         char * master_ip;            // ip address of the master node
         char * next_node_ip;         // ip address of the next node
diff --git a/src/llama.cpp b/src/llama.cpp
index 810dd3f7..265435fd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17892,7 +17892,7 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
     return float(n_loaded) / float(n_total) * 100.0f;
 }
 
-static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
+static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool force) {
     long page_size = sysconf(_SC_PAGESIZE);
 
     struct Segment {
@@ -17949,7 +17949,7 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
         // force to prefetch data, disabled by default
-        if (advice == POSIX_MADV_WILLNEED && false) {
+        if (advice == POSIX_MADV_WILLNEED && force) {
             volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
             for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
                 for (size_t i = 0; i < prefetch_dense; i++) {
@@ -18230,9 +18230,9 @@ static int llama_decode_internal(
             timer(manage_graph_tensors);
 
             int next_gf_id = (i + 1) % gf.size();
-            manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED);
+            manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, cparams.force);
             if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
-                manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED);
+                manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, cparams.force);
             }
         }
     }
@@ -19955,7 +19955,9 @@ struct llama_context_params llama_context_default_params() {
        /*.rank              =*/ 0,
         /*.n_layer_window    =*/ {32},
         /*.n_gpu_layers      =*/ 0,
+        /*.n_cycles          =*/ 0,
         /*.prefetch          =*/ false,
+        /*.force             =*/ false,
         /*.keep_out_in_metal =*/ true,
         /*.master_ip         =*/ nullptr,
         /*.next_node_ip      =*/ nullptr,
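
Note (reviewer sketch, not part of the patch): the --force flag only controls whether the POSIX_MADV_WILLNEED hint is followed by an explicit page-touch loop, since posix_madvise alone merely advises the kernel and does not guarantee the pages are resident before compute starts. The standalone C++ sketch below illustrates that technique under stated assumptions; the helper name prefetch_range, the command-line handling, and the one-byte-per-page stride are illustrative and not taken from this patch.

// Sketch: mmap a file read-only, hint WILLNEED, and optionally force-fault
// every page by touching one byte per page (what --force enables in spirit).
#include <cstdio>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static void prefetch_range(void * addr, size_t len, bool force) {
    const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
    posix_madvise(addr, len, POSIX_MADV_WILLNEED); // asynchronous hint only
    if (!force) {
        return;
    }
    // touch one byte per page so the kernel actually faults the range in
    volatile char * p = reinterpret_cast<volatile char *>(addr);
    for (size_t off = 0; off < len; off += page_size) {
        (void) p[off];
    }
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <file> [--force]\n", argv[0]);
        return 1;
    }
    const bool force = argc > 2; // any second argument enables forced prefetch
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) { std::perror("open"); return 1; }
    struct stat st;
    if (fstat(fd, &st) != 0 || st.st_size == 0) { close(fd); return 1; }
    void * data = mmap(nullptr, (size_t) st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (data == MAP_FAILED) { std::perror("mmap"); close(fd); return 1; }
    prefetch_range(data, (size_t) st.st_size, force);
    munmap(data, (size_t) st.st_size);
    close(fd);
    return 0;
}

The trade-off mirrors the patch: the hint alone keeps the call cheap but may leave pages cold, while forcing the faults front-loads disk I/O so the subsequent compute pass does not stall on page faults.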