Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-04 16:39:23 +00:00.
Commit bcfdace59b (parent 9cbdf01645): add args -k and --force

This commit adds two runtime options: -k/--n-cycles N, which pins the number of cycles used to output one token, and --force, which makes prefetching actually fault pages in after computation instead of leaving the madvise hint advisory.
5 changed files with 27 additions and 5 deletions.
@@ -737,6 +737,20 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.gpu_mem = value; // in GiB
         }
     ).set_env("LLAMA_ARG_CUDA_MEM"));
+    add_opt(llama_arg(
+        {"-k", "--n-cycles"}, "N",
+        format("number of cycles to output one token (default: %d)", params.n_cycles),
+        [](gpt_params & params, int value) {
+            params.n_cycles = value;
+        }
+    ).set_env("LLAMA_ARG_K"));
+    add_opt(llama_arg(
+        {"--force"},
+        format("force to start prefetching after computation (default: %s)", params.force ? "true" : "false"),
+        [](gpt_params & params) {
+            params.force = true;
+        }
+    ).set_env("LLAMA_ARG_FORCE"));
 // #ifdef GGML_USE_METAL
 //     // warn: if the output layer weights are not kept in metal shared memory, its mmap-ed weight data
 //     // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency.
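Note: `add_opt(llama_arg(...))` is llama.cpp's declarative option registry. Each entry bundles the flag spellings, a help string built with `format(...)`, a handler lambda that writes into `gpt_params`, and an environment-variable fallback via `.set_env(...)`. The sketch below imitates that pattern in a self-contained form; `Arg`, `Params`, `add_opt`, and `parse` are simplified stand-ins, not the real llama.cpp types.

```cpp
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <string>
#include <vector>

// Simplified stand-ins for gpt_params / llama_arg in the hunk above.
struct Params {
    int  n_cycles = 0;     // number of cycles to output one token (-k)
    bool force    = false; // force prefetching after computation (--force)
};

struct Arg {
    std::vector<std::string> names;  // e.g. {"-k", "--n-cycles"}
    std::string env;                 // e.g. "LLAMA_ARG_K"
    bool takes_value = false;
    std::function<void(Params &, const char *)> handler;

    Arg & set_env(std::string e) { env = std::move(e); return *this; }
};

static std::vector<Arg> opts;
static void add_opt(Arg a) { opts.push_back(std::move(a)); }

static void parse(int argc, char ** argv, Params & params) {
    // Environment variables act as fallbacks, like .set_env() upstream.
    for (auto & o : opts) {
        if (const char * v = std::getenv(o.env.c_str())) o.handler(params, v);
    }
    for (int i = 1; i < argc; i++) {
        for (auto & o : opts) {
            bool matched = false;
            for (auto & n : o.names) matched = matched || (n == argv[i]);
            if (!matched) continue;
            if (o.takes_value && i + 1 < argc) o.handler(params, argv[++i]);
            else o.handler(params, nullptr);
            break;
        }
    }
}

int main(int argc, char ** argv) {
    Params params;
    add_opt(Arg{{"-k", "--n-cycles"}, "", true,
        [](Params & p, const char * v) { if (v) p.n_cycles = std::atoi(v); }}
        .set_env("LLAMA_ARG_K"));
    add_opt(Arg{{"--force"}, "", false,
        [](Params & p, const char *) { p.force = true; }}
        .set_env("LLAMA_ARG_FORCE"));
    parse(argc, argv, params);
    std::printf("n_cycles = %d, force = %s\n",
                params.n_cycles, params.force ? "true" : "false");
    return 0;
}
```

Run as `./demo -k 4 --force`, or export `LLAMA_ARG_K=4` to get the same effect through the environment, mirroring the `.set_env()` calls in the hunk.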
@@ -1012,7 +1012,7 @@ static bool assign_layers_to_device(
     };
 
     // get valid factors
-    std::vector<int> valid_k = find_factors(n_layer);
+    std::vector<int> valid_k = cparams.n_cycles > 0 ? {(int)cparams.n_cycles} : find_factors(n_layer);
 
     // assign devices to sets M1, M2, M3, and M4
     // M1: devices running on macOS without Metal, and with insufficient memory
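Two remarks on the changed line. First, the intent: when `-k` is given, the layer-assignment search is pinned to that single cycle count instead of trying every divisor of `n_layer`. Second, as printed the line is not standard C++: a braced-init-list cannot be an operand of the conditional operator, so a conforming spelling has to name the vector type on the true arm. The sketch below shows that spelling together with a plausible `find_factors`, whose body is not part of this diff and is assumed here to return all positive divisors of `n_layer`.

```cpp
#include <cstdint>
#include <vector>

// Assumed behavior of find_factors (its body is not in this hunk):
// all positive divisors of n_layer, so every candidate k splits the
// layers into equal windows.
static std::vector<int> find_factors(int n_layer) {
    std::vector<int> factors;
    for (int k = 1; k <= n_layer; k++) {
        if (n_layer % k == 0) factors.push_back(k);
    }
    return factors;
}

// A conforming spelling of the changed line: the vector type is named
// explicitly on the true arm of the conditional operator.
static std::vector<int> valid_cycle_counts(uint32_t n_cycles, int n_layer) {
    return n_cycles > 0 ? std::vector<int>{(int)n_cycles}
                        : find_factors(n_layer);
}
```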
@@ -1801,8 +1801,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_world = params.n_world;
     cparams.rank = params.rank;
     cparams.prefetch = params.prefetch;
+    cparams.force = params.force;
     cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers = params.n_gpu_layers;
+    cparams.n_cycles = params.n_cycles;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
 
     if (cparams.master_ip != nullptr) {
@@ -149,7 +149,9 @@ struct gpt_params {
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool prefetch = false;                  // prefetch layer weights
     bool keep_out_in_metal = true;          // whether to keep output weights in metal memory, true by default
+    bool force = false;                     // force to start prefetching after computation
     int32_t gpu_mem = 999.0;                // gpu memory to use, in GiB
+    int32_t n_cycles = 0;                   // number of cycles to output one token
     int32_t n_predict = -1;                 // new tokens to predict
     int32_t n_ctx = 0;                      // context size
     int32_t n_batch = 2048;                 // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -324,7 +324,9 @@ extern "C" {
        uint32_t rank;              // my rank
        uint32_t n_layer_window[32];// number of layers to process in each compute
        uint32_t n_gpu_layers;      // number of layers to process on GPU
+       uint32_t n_cycles;          // number of cycles to output one token
        bool prefetch;              // whether to prefetch layer weights
+       bool force;                 // force to start prefetching after computation
        bool keep_out_in_metal;     // whether to keep output weights in metal memory
        char * master_ip;           // ip address of the master node
        char * next_node_ip;        // ip address of the next node
@@ -17892,7 +17892,7 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
     return float(n_loaded) / float(n_total) * 100.0f;
 }
 
-static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
+static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool force) {
     long page_size = sysconf(_SC_PAGESIZE);
 
     struct Segment {
@@ -17949,7 +17949,7 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
         // force to prefetch data, disabled by default
-        if (advice == POSIX_MADV_WILLNEED && false) {
+        if (advice == POSIX_MADV_WILLNEED && force) {
            volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
            for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
                for (size_t i = 0; i < prefetch_dense; i++) {
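`posix_madvise(..., POSIX_MADV_WILLNEED)` is only advisory: the kernel may read ahead lazily or ignore the hint entirely. The `force` path added here makes prefetching unconditional by reading one byte per page through a `volatile` pointer, which faults every page in immediately. Below is a self-contained sketch of the same idiom applied to an mmap-ed file; the file name and the `prefetch_dense` value are assumptions for illustration, the real constant lives elsewhere in prima.cpp.

```cpp
#include <algorithm>
#include <cstddef>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// Touch-read an mmap-ed range so its pages are resident before compute.
// madvise(WILLNEED) alone is advisory; the volatile reads force the kernel
// to fault every page in now. prefetch_dense (pages touched per outer
// step) is an assumed value.
static void prefetch_range(void * addr, size_t len, bool force) {
    long page_size = sysconf(_SC_PAGESIZE);
    len = std::max(len, (size_t)page_size);
    posix_madvise(addr, len, POSIX_MADV_WILLNEED); // hint: load into memory

    if (!force) return;
    const size_t prefetch_dense = 32; // assumption, see note above
    volatile char * ptr = (volatile char *)addr;
    for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
        for (size_t i = 0; i < prefetch_dense; i++) {
            size_t byte = off + i * page_size;
            if (byte >= len) break;
            (void)ptr[byte]; // one read per page faults it in
        }
    }
}

int main() {
    int fd = open("model.bin", O_RDONLY); // hypothetical weight file
    if (fd < 0) return 1;
    struct stat st;
    fstat(fd, &st);
    void * base = mmap(nullptr, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (base != MAP_FAILED) {
        prefetch_range(base, (size_t)st.st_size, /*force=*/true);
        munmap(base, (size_t)st.st_size);
    }
    close(fd);
    return 0;
}
```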
@@ -18230,9 +18230,9 @@ static int llama_decode_internal(
             timer(manage_graph_tensors);
 
             int next_gf_id = (i + 1) % gf.size();
-            manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED);
+            manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, cparams.force);
             if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
-                manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED);
+                manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, cparams.force);
             }
         }
     }
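The decode loop uses the flag to overlap disk I/O with compute: while graph `i` executes, the tensors of graph `(i + 1) % gf.size()` are advised (or force-faulted) into memory, and rank 0 additionally re-warms `gf[0]` near the end of a cycle so the first window is resident when the next token starts. A minimal sketch of that ring-prefetch schedule, with `compute` and `prefetch` as stand-ins for the real calls:

```cpp
#include <cstdio>
#include <vector>

// Stand-ins for graph execution and manage_graph_tensors in the hunk.
struct Graph { int id; };
static void compute(const Graph & g)              { std::printf("compute gf[%d]\n", g.id); }
static void prefetch(const Graph & g, bool force) { std::printf("prefetch gf[%d]%s\n", g.id, force ? " (forced)" : ""); }

// Ring prefetch: while gf[i] runs, advise gf[(i + 1) % n] into memory so
// disk I/O overlaps compute instead of stalling the next window. The
// master rank also re-warms gf[0] for the next token.
static void decode_cycle(std::vector<Graph> & gf, bool force, bool is_master) {
    for (size_t i = 0; i < gf.size(); i++) {
        size_t next = (i + 1) % gf.size();
        prefetch(gf[next], force); // madvise is non-blocking, so this returns immediately
        if (is_master && next == gf.size() - 1) {
            prefetch(gf[0], force); // first window for the next token
        }
        compute(gf[i]);
    }
}

int main() {
    std::vector<Graph> gf = {{0}, {1}, {2}};
    decode_cycle(gf, /*force=*/true, /*is_master=*/true);
    return 0;
}
```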
@@ -19955,7 +19955,9 @@ struct llama_context_params llama_context_default_params() {
        /*.rank              =*/ 0,
        /*.n_layer_window    =*/ {32},
        /*.n_gpu_layers      =*/ 0,
+       /*.n_cycles          =*/ 0,
        /*.prefetch          =*/ false,
+       /*.force             =*/ false,
        /*.keep_out_in_metal =*/ true,
        /*.master_ip         =*/ nullptr,
        /*.next_node_ip      =*/ nullptr,
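`llama_context_default_params()` fills the C struct positionally; the `/*.field =*/` comments are documentation only. The two new initializers therefore must sit in exactly the slots `n_cycles` and `force` occupy in the struct declaration from the `@@ -324,7` hunk, otherwise every later value silently lands in the wrong field. A tiny sketch of the ordering discipline this relies on:

```cpp
#include <cstdint>

// Mirrors the pattern above: positional aggregate initialization annotated
// with /*.name =*/ comments. Only the order binds values to fields.
struct ctx_params {
    uint32_t n_gpu_layers;
    uint32_t n_cycles;  // new field: must be initialized in this slot
    bool     prefetch;
    bool     force;     // new field: must be initialized in this slot
};

static ctx_params default_params() {
    ctx_params result = {
        /*.n_gpu_layers =*/ 0,
        /*.n_cycles     =*/ 0,     // dropping this line would shift every
        /*.prefetch     =*/ false, // later value into the wrong field
        /*.force        =*/ false,
    };
    return result;
}

int main() { return (int)default_params().n_cycles; }
```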