speedup: add arg --keep-out-in-cuda to run the output layer on CUDA

Zonghang Li authored on 2025-06-28 05:59:19 +00:00, committed by Li, Zonghang
parent e8d3e5a631
commit 1ea2d61a97
6 changed files with 66 additions and 16 deletions

@@ -226,6 +226,8 @@ Take QwQ-32B as an example, run the following commands on the devices to launch
Once started, prima.cpp will profile each device and decide how much workload to assign, e.g., how many model layers each device should handle, and how many of them should run on GPU (if available).
> By default, the output layer runs on the CPU. However, if you have enough total VRAM, add `--keep-out-in-cuda` to the master's launch command to run it on the GPU, as in the sketch below.
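
A minimal sketch of such a master command (the binary name, model path, and prompt flags here are placeholders in the usual llama.cpp style; only `--keep-out-in-cuda` is the option added by this commit):

```shell
# hypothetical master launch: keep whatever flags you already use and
# append --keep-out-in-cuda to place the output layer on the GPU
./llama-cli -m ./qwq-32b-q4_k_m.gguf -p "what is edge AI?" -n 256 --keep-out-in-cuda
```

The change below also registers `LLAMA_ARG_KEEP_INP_OUT_IN_CUDA` as the corresponding environment variable for this option.
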
### (Optional) Run with Prebuilt Docker Image
Assume we have a host machine with at least 32 CPU cores, 32 GiB RAM, and 32 GiB VRAM. We simulate 4 homogeneous nodes using Docker containers, each allocated 8 CPU cores, 8 GiB RAM, and 8 GiB VRAM. Follow the steps below to get started:

@@ -775,6 +775,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.master_priority = std::stof(value);
        }
    ).set_env("LLAMA_ARG_MASTER_PRIORITY"));

    // #ifdef GGML_USE_METAL
    // // warn: if the output layer weights are not kept in metal shared memory, its mmap-ed weight data
    // // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency.
@@ -787,6 +788,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
    // }
    // ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
    // #endif

#ifdef GGML_USE_CUDA
    add_opt(llama_arg(
        {"--keep-out-in-cuda"},
        format("whether to compute the output layer on CUDA (default: %s)", params.keep_out_in_cuda ? "true" : "false"),
        [](gpt_params & params) {
            params.keep_out_in_cuda = true;
        }
    ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_CUDA"));
#endif

    add_opt(llama_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),

@@ -2017,16 +2017,19 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
    mparams.n_world       = params.n_world;
    mparams.rank          = params.rank;
    mparams.rpc_servers   = params.rpc_servers.c_str();
    mparams.main_gpu      = params.main_gpu;
    mparams.split_mode    = params.split_mode;
    mparams.tensor_split  = params.tensor_split;
    mparams.use_mmap      = params.use_mmap;
    mparams.use_mlock     = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
    mparams.n_world           = params.n_world;
    mparams.rank              = params.rank;
    mparams.rpc_servers       = params.rpc_servers.c_str();
    mparams.main_gpu          = params.main_gpu;
    mparams.split_mode        = params.split_mode;
    mparams.tensor_split      = params.tensor_split;
    mparams.use_mmap          = params.use_mmap;
    mparams.use_mlock         = params.use_mlock;
    mparams.check_tensors     = params.check_tensors;
    mparams.keep_out_in_metal = params.keep_out_in_metal;
    mparams.keep_out_in_cuda  = params.keep_out_in_cuda;
    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
@@ -2068,6 +2071,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.force             = params.force;
    cparams.master_priority   = params.master_priority;
    cparams.keep_out_in_metal = params.keep_out_in_metal;
    cparams.keep_out_in_cuda  = params.keep_out_in_cuda;
    cparams.n_gpu_layers      = params.n_gpu_layers;
    cparams.n_cycles          = params.n_cycles;
    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);

@@ -151,6 +151,7 @@ struct gpt_params {
    uint32_t signal_port = 10000;      // signal port for distributed inference
    bool prefetch = false;             // prefetch layer weights
    bool keep_out_in_metal = true;     // whether to keep output weights in metal memory, true by default
    bool keep_out_in_cuda = false;     // whether to run the output layer on CUDA, false by default
    bool force = false;                // force to start prefetching after computation
    float master_priority = 1.01;      // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
    int32_t gpu_mem = 999.0;           // gpu memory to use, in GiB

@@ -325,6 +325,7 @@ extern "C" {
        bool use_mlock;         // force system to keep model in RAM
        bool check_tensors;     // validate model tensor data
        bool keep_out_in_metal; // whether to keep output weights in metal memory
        bool keep_out_in_cuda;  // whether to run the output layer on CUDA
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -339,6 +340,7 @@ extern "C" {
        bool force;              // force to start prefetching after computation
        float master_priority;   // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
        bool keep_out_in_metal;  // whether to keep output weights in metal memory
        bool keep_out_in_cuda;   // whether to run the output layer on CUDA
        char * master_ip;        // ip address of the master node
        char * next_node_ip;     // ip address of the next node
        uint32_t data_port;      // data port for distributed inference

@@ -7562,6 +7562,7 @@ static bool llm_load_tensors_impl(
        int main_gpu,
        bool use_mlock,
        bool keep_out_in_metal,
        bool keep_out_in_cuda,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    auto & hparams = model.hparams;
@@ -7606,9 +7607,15 @@
    // assign the input and output layers on CPU by default
    if (my_rank == 0) {
        model.buft_input = llama_default_buffer_type_cpu(model, true);
        model.buft_output = llama_default_buffer_type_cpu(model, true);
        LLAMA_LOG_DEBUG("Layer input assigned to cpu\n");
        LLAMA_LOG_DEBUG("Layer output assigned to cpu\n");
        if (keep_out_in_cuda) {
            model.buft_output = llama_default_buffer_type_offload(model, main_gpu);
            LLAMA_LOG_DEBUG("Layer output assigned to gpu\n");
        } else {
            model.buft_output = llama_default_buffer_type_cpu(model, true);
            LLAMA_LOG_DEBUG("Layer output assigned to cpu\n");
        }
    }

    // count used buffer types
@@ -9535,7 +9542,8 @@ int llm_load_tensors(
    try {
        if (!llm_load_tensors_impl(
            *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
            params.main_gpu, params.use_mlock, params.keep_out_in_metal, params.progress_callback, params.progress_callback_user_data
            params.main_gpu, params.use_mlock, params.keep_out_in_metal, params.keep_out_in_cuda, params.progress_callback,
            params.progress_callback_user_data
        )) {
            return -2;
        }
@@ -20247,6 +20255,7 @@ struct llama_model_params llama_model_default_params() {
        /*.use_mlock         =*/ false,
        /*.check_tensors     =*/ false,
        /*.keep_out_in_metal =*/ true,
        /*.keep_out_in_cuda  =*/ false,
    };

#ifdef GGML_USE_METAL
@@ -20268,6 +20277,7 @@ struct llama_context_params llama_context_default_params() {
        /*.force             =*/ false,
        /*.master_priority   =*/ 1.01,
        /*.keep_out_in_metal =*/ true,
        /*.keep_out_in_cuda  =*/ false,
        /*.master_ip         =*/ nullptr,
        /*.next_node_ip      =*/ nullptr,
        /*.data_port         =*/ 9000,
@@ -21361,14 +21371,31 @@
    for (size_t i = 0; i < gf.size(); ++i) {
#if defined(GGML_USE_CUDA)
        if ((cparams.rank == 0 && (i == 0 || i == gf.size() - 1))
            || model->n_gpu_layers == 0) {
        // output layer
        if (!params.keep_out_in_cuda && cparams.rank == 0 && i == gf.size() - 1) {
            continue;
        }
        // input layer
        if (cparams.rank == 0 && i == 0) {
            continue;
        }
        // ignore all backend layers if n_gpu_layers is 0
        if (model->n_gpu_layers == 0) {
            continue;
        }
        // don't reserve for repeated backend layers
        if ((cparams.rank == 0 && i > 1 && i < gf.size() - 1)
            || (cparams.rank > 0 && i > 0)) {
            continue;
        }
#endif
        ok = ok & ggml_backend_sched_reserve(ctx->sched[i], gf[i]);
    }

    if (!ok) {
        LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
        llama_free(ctx);
@@ -21933,7 +21960,7 @@ void llama_model_compute_buf_size(
    // weights
    const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
    const int64_t nb_attn_q_w    = n_bytes.nb_attn_q_w;
    // const int64_t nb_output_w = n_bytes.nb_output_w;
    const int64_t nb_output_w    = n_bytes.nb_output_w;

    // format bytes
    const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);
@@ -21972,7 +21999,9 @@
            });
        }
        // we run the output layer on CPU by default
        // *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
        if (cparams.keep_out_in_cuda) {
            *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
        }
        gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
    } else {
        if (has_gpu_layers) {