Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-03 23:49:07 +00:00)
speedup: add arg --keep-out-in-cuda to run the output layer on CUDA
Parent: e8d3e5a631
Commit: 1ea2d61a97
6 changed files with 66 additions and 16 deletions
@@ -226,6 +226,8 @@ Take QwQ-32B as an example, run the following commands on the devices to launch

Once started, prima.cpp will profile each device and decide how much workload to assign, e.g., how many model layers each device should handle, and how many of them should run on GPU (if available).

> By default, the output layer runs on the CPU. However, if you have enough total VRAM, add `--keep-out-in-cuda` to the master to run it on the GPU.

### (Optional) Run with Prebuilt Docker Image

Assume we have a host machine with at least 32 CPU cores, 32 GiB RAM, and 32 GiB VRAM. We simulate 4 homogeneous nodes using Docker containers, with each node allocated 8 CPU cores, 8 GiB RAM, and 8 GiB VRAM. Follow the below steps to get started:
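The README text above says prima.cpp profiles each device and then assigns each one a window of model layers, with part of that window optionally offloaded to the device's GPU. As a purely illustrative sketch (this is not prima.cpp's actual scheduler; every name, number, and ratio below is hypothetical), a proportional split over profiled devices might look like:

```cpp
// Toy sketch only: NOT prima.cpp's scheduler. It merely illustrates the idea of a
// per-device "layer window" sized by a profiled speed score, with part of each
// window offloaded to that device's GPU. All names and ratios are made up.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct device_profile {
    float speed_score; // e.g. derived from profiled compute/bandwidth
    float vram_gib;    // available VRAM; 0 means CPU-only
};

int main() {
    const uint32_t n_layers = 64; // e.g. a 32B-class model
    const std::vector<device_profile> devs = {
        {4.0f, 8.0f}, {2.0f, 0.0f}, {2.0f, 4.0f}, {1.0f, 0.0f}
    };

    float total_score = 0.0f;
    for (const auto & d : devs) total_score += d.speed_score;

    uint32_t assigned = 0;
    for (size_t i = 0; i < devs.size(); ++i) {
        // the last device takes the remainder so the windows cover all layers
        const uint32_t window = (i + 1 == devs.size())
            ? n_layers - assigned
            : (uint32_t)(n_layers * devs[i].speed_score / total_score);
        assigned += window;

        // assume (arbitrarily) that 1 GiB of VRAM holds about 4 layers of this model
        const uint32_t gpu_layers = std::min(window, (uint32_t)(devs[i].vram_gib * 4.0f));
        std::printf("device %zu: %u layers, %u of them on GPU\n", i, window, gpu_layers);
    }
    return 0;
}
```

The `--keep-out-in-cuda` note above only concerns the master's output (logits) layer on top of such an assignment; its approximate VRAM cost is estimated after the last hunk below.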
@@ -775,6 +775,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.master_priority = std::stof(value);
        }
    ).set_env("LLAMA_ARG_MASTER_PRIORITY"));

// #ifdef GGML_USE_METAL
// // warn: if the output layer weights are not kept in metal shared memory, its mmap-ed weight data
// // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency.
@@ -787,6 +788,17 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
// }
// ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
// #endif

#ifdef GGML_USE_CUDA
    add_opt(llama_arg(
        {"--keep-out-in-cuda"},
        format("whether to compute the output layer on CUDA (default: %s)", params.keep_out_in_cuda ? "true" : "false"),
        [](gpt_params & params) {
            params.keep_out_in_cuda = true;
        }
    ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_CUDA"));
#endif

    add_opt(llama_arg(
        {"-n", "--predict", "--n-predict"}, "N",
        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
@@ -2017,16 +2017,19 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
    mparams.n_world = params.n_world;
    mparams.rank = params.rank;
    mparams.rpc_servers = params.rpc_servers.c_str();
    mparams.main_gpu = params.main_gpu;
    mparams.split_mode = params.split_mode;
    mparams.tensor_split = params.tensor_split;
    mparams.use_mmap = params.use_mmap;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;

    mparams.n_world = params.n_world;
    mparams.rank = params.rank;
    mparams.rpc_servers = params.rpc_servers.c_str();
    mparams.main_gpu = params.main_gpu;
    mparams.split_mode = params.split_mode;
    mparams.tensor_split = params.tensor_split;
    mparams.use_mmap = params.use_mmap;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;
    mparams.keep_out_in_metal = params.keep_out_in_metal;
    mparams.keep_out_in_cuda = params.keep_out_in_cuda;

    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
@@ -2068,6 +2071,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.force = params.force;
    cparams.master_priority = params.master_priority;
    cparams.keep_out_in_metal = params.keep_out_in_metal;
    cparams.keep_out_in_cuda = params.keep_out_in_cuda;
    cparams.n_gpu_layers = params.n_gpu_layers;
    cparams.n_cycles = params.n_cycles;
    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
@@ -151,6 +151,7 @@ struct gpt_params {
    uint32_t signal_port = 10000; // signal port for distributed inference
    bool prefetch = false; // prefetch layer weights
    bool keep_out_in_metal = true; // whether to keep output weights in metal memory, true by default
    bool keep_out_in_cuda = false; // whether to run the output layer on CUDA, false by default
    bool force = false; // force to start prefetching after computation
    float master_priority = 1.01; // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
    int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
@@ -325,6 +325,7 @@ extern "C" {
        bool use_mlock; // force system to keep model in RAM
        bool check_tensors; // validate model tensor data
        bool keep_out_in_metal; // whether to keep output weights in metal memory
        bool keep_out_in_cuda; // whether to run the output layer on CUDA
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -339,6 +340,7 @@ extern "C" {
        bool force; // force to start prefetching after computation
        float master_priority; // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
        bool keep_out_in_metal; // whether to keep output weights in metal memory
        bool keep_out_in_cuda; // whether to run the output layer on CUDA
        char * master_ip; // ip address of the master node
        char * next_node_ip; // ip address of the next node
        uint32_t data_port; // data port for distributed inference
@@ -7562,6 +7562,7 @@ static bool llm_load_tensors_impl(
        int main_gpu,
        bool use_mlock,
        bool keep_out_in_metal,
        bool keep_out_in_cuda,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    auto & hparams = model.hparams;
@@ -7606,9 +7607,15 @@ static bool llm_load_tensors_impl(
    // assign the input and output layers on CPU by default
    if (my_rank == 0) {
        model.buft_input = llama_default_buffer_type_cpu(model, true);
        model.buft_output = llama_default_buffer_type_cpu(model, true);
        LLAMA_LOG_DEBUG("Layer input assigned to cpu\n");
        LLAMA_LOG_DEBUG("Layer output assigned to cpu\n");

        if (keep_out_in_cuda) {
            model.buft_output = llama_default_buffer_type_offload(model, main_gpu);
            LLAMA_LOG_DEBUG("Layer output assigned to gpu\n");
        } else {
            model.buft_output = llama_default_buffer_type_cpu(model, true);
            LLAMA_LOG_DEBUG("Layer output assigned to cpu\n");
        }
    }

    // count used buffer types
@@ -9535,7 +9542,8 @@ int llm_load_tensors(
    try {
        if (!llm_load_tensors_impl(
            *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
            params.main_gpu, params.use_mlock, params.keep_out_in_metal, params.progress_callback, params.progress_callback_user_data
            params.main_gpu, params.use_mlock, params.keep_out_in_metal, params.keep_out_in_cuda, params.progress_callback,
            params.progress_callback_user_data
        )) {
            return -2;
        }
@@ -20247,6 +20255,7 @@ struct llama_model_params llama_model_default_params() {
        /*.use_mlock =*/ false,
        /*.check_tensors =*/ false,
        /*.keep_out_in_metal =*/ true,
        /*.keep_out_in_cuda =*/ false,
    };

#ifdef GGML_USE_METAL
@@ -20268,6 +20277,7 @@ struct llama_context_params llama_context_default_params() {
        /*.force =*/ false,
        /*.master_priority =*/ 1.01,
        /*.keep_out_in_metal =*/ true,
        /*.keep_out_in_cuda =*/ false,
        /*.master_ip =*/ nullptr,
        /*.next_node_ip =*/ nullptr,
        /*.data_port =*/ 9000,
@@ -21361,14 +21371,31 @@ void * llama_context_setup_backend(
    for (size_t i = 0; i < gf.size(); ++i) {

#if defined(GGML_USE_CUDA)
        if ((cparams.rank == 0 && (i == 0 || i == gf.size() - 1))
            || model->n_gpu_layers == 0) {
        // output layer
        if (!params.keep_out_in_cuda && cparams.rank == 0 && i == gf.size() - 1) {
            continue;
        }

        // input layer
        if (cparams.rank == 0 && i == 0) {
            continue;
        }

        // ignore all backend layers if n_gpu_layers is 0
        if (model->n_gpu_layers == 0) {
            continue;
        }

        // don't reserve for repeated backend layers
        if ((cparams.rank == 0 && i > 1 && i < gf.size() - 1)
            || (cparams.rank > 0 && i > 0)) {
            continue;
        }
#endif

        ok = ok & ggml_backend_sched_reserve(ctx->sched[i], gf[i]);
    }

    if (!ok) {
        LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
        llama_free(ctx);
@@ -21933,7 +21960,7 @@ void llama_model_compute_buf_size(
    // weights
    const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
    const int64_t nb_attn_q_w = n_bytes.nb_attn_q_w;
    // const int64_t nb_output_w = n_bytes.nb_output_w;
    const int64_t nb_output_w = n_bytes.nb_output_w;

    // format bytes
    const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);
@@ -21972,7 +21999,9 @@ void llama_model_compute_buf_size(
            });
        }
        // we run the output layer on CPU by default
        // *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
        if (cparams.keep_out_in_cuda) {
            *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
        }
        gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
    } else {
        if (has_gpu_layers) {
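To put the README's "if you have enough total VRAM" caveat in numbers: when `keep_out_in_cuda` is set, the change above charges `(n_out_embd + n_result) * type_size_f32 + nb_output_w` extra bytes to the GPU compute-buffer estimate. A rough, self-contained estimate with assumed model dimensions is sketched below (illustrative only; the real values depend on the model and its quantization):

```cpp
// Back-of-the-envelope estimate of the extra VRAM --keep-out-in-cuda requests,
// mirroring the formula added in the hunk above:
//     *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
// The concrete numbers are assumptions for illustration, not values read from prima.cpp.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_out_embd    = 5120;    // assumed hidden size of a 32B-class model
    const int64_t n_result      = 152064;  // assumed vocabulary size (one logit per token)
    const int64_t type_size_f32 = 4;       // bytes per f32 activation

    // assumed ~4.5-bit quantized output.weight: n_out_embd * n_result * ~0.57 bytes
    const int64_t nb_output_w = n_out_embd * n_result * 4 / 7;

    const int64_t extra_bytes = (n_out_embd + n_result) * type_size_f32 + nb_output_w;
    std::printf("extra GPU buffer: %lld bytes (~%.2f GiB)\n",
                (long long) extra_bytes, extra_bytes / (1024.0 * 1024.0 * 1024.0));
    return 0;
}
```

Under these assumptions the output layer adds on the order of a few hundred MiB of VRAM on the master, so the flag is cheap when there is headroom but worth leaving off on tightly packed GPUs.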