speedup: add arg --keep-out-in-cuda to run the output layer on CUDA

Zonghang Li authored on 2025-06-28 05:59:19 +00:00, committed by Li, Zonghang
parent e8d3e5a631
commit 1ea2d61a97
6 changed files with 66 additions and 16 deletions

@@ -226,6 +226,8 @@ Take QwQ-32B as an example, run the following commands on the devices to launch
Once started, prima.cpp will profile each device and decide how much workload to assign, e.g., how many model layers each device should handle, and how many of them should run on GPU (if available).
> By default, the output layer runs on the CPU. However, if you have enough total VRAM, add `--keep-out-in-cuda` to the master's launch command to run it on the GPU, as in the sketch below.
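
A minimal sketch of such a master command (the binary name, model path, and prompt flags here are placeholders in the usual llama.cpp style; only `--keep-out-in-cuda` is the option added by this commit):

```shell
# hypothetical master launch: keep whatever flags you already use and
# append --keep-out-in-cuda to place the output layer on the GPU
./llama-cli -m ./qwq-32b-q4_k_m.gguf -p "what is edge AI?" -n 256 --keep-out-in-cuda
```

The change below also registers `LLAMA_ARG_KEEP_INP_OUT_IN_CUDA` as the corresponding environment variable for this option.
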
### (Optional) Run with Prebuilt Docker Image
Assume we have a host machine with at least 32 CPU cores, 32 GiB RAM, and 32 GiB VRAM. We simulate 4 homogeneous nodes using Docker containers, each allocated 8 CPU cores, 8 GiB RAM, and 8 GiB VRAM. Follow the steps below to get started:

@@ -775,6 +775,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.master_priority = std::stof(value);
        }
    ).set_env("LLAMA_ARG_MASTER_PRIORITY"));

    // #ifdef GGML_USE_METAL
    // // warn: if the output layer weights are not kept in metal shared memory, its mmap-ed weight data
    // // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency.
@@ -787,6 +788,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
    // }
    // ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
    // #endif

#ifdef GGML_USE_CUDA
    add_opt(llama_arg(
        {"--keep-out-in-cuda"},
        format("whether to compute the output layer on CUDA (default: %s)", params.keep_out_in_cuda ? "true" : "false"),
        [](gpt_params & params) {
            params.keep_out_in_cuda = true;
        }
    ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_CUDA"));
#endif

    add_opt(llama_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),

@@ -2017,16 +2017,19 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
    mparams.n_world       = params.n_world;
    mparams.rank          = params.rank;
    mparams.rpc_servers   = params.rpc_servers.c_str();
    mparams.main_gpu      = params.main_gpu;
    mparams.split_mode    = params.split_mode;
    mparams.tensor_split  = params.tensor_split;
    mparams.use_mmap      = params.use_mmap;
    mparams.use_mlock     = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
    mparams.n_world           = params.n_world;
    mparams.rank              = params.rank;
    mparams.rpc_servers       = params.rpc_servers.c_str();
    mparams.main_gpu          = params.main_gpu;
    mparams.split_mode        = params.split_mode;
    mparams.tensor_split      = params.tensor_split;
    mparams.use_mmap          = params.use_mmap;
    mparams.use_mlock         = params.use_mlock;
    mparams.check_tensors     = params.check_tensors;
    mparams.keep_out_in_metal = params.keep_out_in_metal;
    mparams.keep_out_in_cuda  = params.keep_out_in_cuda;
    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
@@ -2068,6 +2071,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.force             = params.force;
    cparams.master_priority   = params.master_priority;
    cparams.keep_out_in_metal = params.keep_out_in_metal;
    cparams.keep_out_in_cuda  = params.keep_out_in_cuda;
    cparams.n_gpu_layers      = params.n_gpu_layers;
    cparams.n_cycles          = params.n_cycles;
    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);

@@ -151,6 +151,7 @@ struct gpt_params {
    uint32_t signal_port = 10000;      // signal port for distributed inference
    bool prefetch = false;             // prefetch layer weights
    bool keep_out_in_metal = true;     // whether to keep output weights in metal memory, true by default
    bool keep_out_in_cuda = false;     // whether to run the output layer on CUDA, false by default
    bool force = false;                // force to start prefetching after computation
    float master_priority = 1.01;      // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
    int32_t gpu_mem = 999.0;           // gpu memory to use, in GiB

@@ -325,6 +325,7 @@ extern "C" {
        bool use_mlock;         // force system to keep model in RAM
        bool check_tensors;     // validate model tensor data
        bool keep_out_in_metal; // whether to keep output weights in metal memory
        bool keep_out_in_cuda;  // whether to run the output layer on CUDA
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -339,6 +340,7 @@ extern "C" {
        bool force;              // force to start prefetching after computation
        float master_priority;   // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
        bool keep_out_in_metal;  // whether to keep output weights in metal memory
        bool keep_out_in_cuda;   // whether to run the output layer on CUDA
        char * master_ip;        // ip address of the master node
        char * next_node_ip;     // ip address of the next node
        uint32_t data_port;      // data port for distributed inference

@@ -7562,6 +7562,7 @@ static bool llm_load_tensors_impl(
        int main_gpu,
        bool use_mlock,
        bool keep_out_in_metal,
        bool keep_out_in_cuda,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    auto & hparams = model.hparams;
@@ -7606,9 +7607,15 @@
    // assign the input and output layers on CPU by default
    if (my_rank == 0) {
        model.buft_input = llama_default_buffer_type_cpu(model, true);
        model.buft_output = llama_default_buffer_type_cpu(model, true);
        LLAMA_LOG_DEBUG("Layer input assigned to cpu\n");
        LLAMA_LOG_DEBUG("Layer output assigned to cpu\n");
        if (keep_out_in_cuda) {
            model.buft_output = llama_default_buffer_type_offload(model, main_gpu);
            LLAMA_LOG_DEBUG("Layer output assigned to gpu\n");
        } else {
            model.buft_output = llama_default_buffer_type_cpu(model, true);
            LLAMA_LOG_DEBUG("Layer output assigned to cpu\n");
        }
    }

    // count used buffer types
@@ -9535,7 +9542,8 @@ int llm_load_tensors(
    try {
        if (!llm_load_tensors_impl(
            *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
            params.main_gpu, params.use_mlock, params.keep_out_in_metal, params.progress_callback, params.progress_callback_user_data
            params.main_gpu, params.use_mlock, params.keep_out_in_metal, params.keep_out_in_cuda, params.progress_callback,
            params.progress_callback_user_data
        )) {
            return -2;
        }
@@ -20247,6 +20255,7 @@ struct llama_model_params llama_model_default_params() {
        /*.use_mlock         =*/ false,
        /*.check_tensors     =*/ false,
        /*.keep_out_in_metal =*/ true,
        /*.keep_out_in_cuda  =*/ false,
    };

#ifdef GGML_USE_METAL
@@ -20268,6 +20277,7 @@ struct llama_context_params llama_context_default_params() {
        /*.force             =*/ false,
        /*.master_priority   =*/ 1.01,
        /*.keep_out_in_metal =*/ true,
        /*.keep_out_in_cuda  =*/ false,
        /*.master_ip         =*/ nullptr,
        /*.next_node_ip      =*/ nullptr,
        /*.data_port         =*/ 9000,
@@ -21361,14 +21371,31 @@
    for (size_t i = 0; i < gf.size(); ++i) {
#if defined(GGML_USE_CUDA)
        if ((cparams.rank == 0 && (i == 0 || i == gf.size() - 1))
            || model->n_gpu_layers == 0) {
        // output layer
        if (!params.keep_out_in_cuda && cparams.rank == 0 && i == gf.size() - 1) {
            continue;
        }
        // input layer
        if (cparams.rank == 0 && i == 0) {
            continue;
        }
        // ignore all backend layers if n_gpu_layers is 0
        if (model->n_gpu_layers == 0) {
            continue;
        }
        // don't reserve for repeated backend layers
        if ((cparams.rank == 0 && i > 1 && i < gf.size() - 1)
            || (cparams.rank > 0 && i > 0)) {
            continue;
        }
#endif
        ok = ok & ggml_backend_sched_reserve(ctx->sched[i], gf[i]);
    }

    if (!ok) {
        LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
        llama_free(ctx);
@@ -21933,7 +21960,7 @@ void llama_model_compute_buf_size(
    // weights
    const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
    const int64_t nb_attn_q_w    = n_bytes.nb_attn_q_w;
    // const int64_t nb_output_w = n_bytes.nb_output_w;
    const int64_t nb_output_w    = n_bytes.nb_output_w;

    // format bytes
    const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);
@@ -21972,7 +21999,9 @@
            });
        }
        // we run the output layer on CPU by default
        // *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
        if (cparams.keep_out_in_cuda) {
            *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
        }
        gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
    } else {
        if (has_gpu_layers) {