add arg --cuda-mem

Zonghang Li 2025-01-16 09:15:34 +04:00
parent dab6b2e1c2
commit 46e99218b4
5 changed files with 19 additions and 3 deletions

View file

@@ -730,6 +730,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.unload = true;
}
).set_env("LLAMA_ARG_UNLOAD"));
add_opt(llama_arg(
{"-cm", "--cuda-mem"}, "N",
format("maximum cuda memory to use (default: %d)", params.cuda_mem),
[](gpt_params & params, int value) {
params.cuda_mem = value; // in GiB
}
).set_env("LLAMA_ARG_CUDA_MEM"));
add_opt(llama_arg(
{"-n", "--predict", "--n-predict"}, "N",
format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),

View file

@@ -948,7 +948,6 @@ static void assign_device(
}
#if defined(USE_HIGHS)
// stores the actual read bandwidth (GB/s) for each device
std::vector<float> disk_speed(n_world, 0.0f);
for (uint32_t m = 0; m < n_world; ++m) {
@@ -1339,6 +1338,13 @@ static void assign_device(
}
LOG_INF("Total latency: %.3f\n", final_objective);
#else
(void)bi;
(void)bo;
(void)kappa;
(void)cparams;
(void)min_disk_read_speed;
#endif
// copy value from w and n to n_layer_window and n_gpu_layers, respectively
@@ -1400,7 +1406,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
device_info dev_info;
dev_info.rank = params.rank;
llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
llama_profile_device(&dev_info, model, ml, params.cuda_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
// create llama context
struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
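
The #else branch added above only casts the solver inputs to void so that builds without the HiGHS solver (USE_HIGHS undefined) do not emit unused-variable warnings; the call into llama_profile_device then also passes the new cuda_mem budget. A toy illustration of the warning-suppression idiom (placeholder function and names, not code from this repo):

static void solve_assignment(int bi, int bo, float kappa) {
#if defined(USE_HIGHS)
    // ... solver code that actually uses bi, bo, kappa ...
#else
    (void)bi;     // silence unused-variable warnings on the stub path
    (void)bo;
    (void)kappa;
#endif
}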

View file

@@ -148,6 +148,7 @@ struct gpt_params {
std::string master_ip = "localhost"; // ip address of the master node
std::string next_node_ip = "localhost"; // ip address of my next node
bool unload = false; // unload layer weights after use or not
int32_t cuda_mem = 999; // maximum CUDA memory to use, in GiB
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
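
The 999 GiB default effectively leaves the cap disabled. Since the field is expressed in GiB, converting it to a byte budget is a single shift (illustrative only; no such conversion appears in this diff):

#include <cstdint>

// 1 GiB = 2^30 bytes, so a GiB budget becomes bytes with a left shift.
static uint64_t cuda_mem_bytes(int32_t cuda_mem_gib) {
    return (uint64_t) cuda_mem_gib << 30;
}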

View file

@@ -416,6 +416,7 @@ extern "C" {
struct device_info * dev_info,
struct llama_model * model,
struct llama_model_loader * ml,
int cuda_mem,
int n_predict,
int n_ctx,
int n_threads,

View file

@@ -3574,6 +3574,7 @@ void llama_profile_device(
device_info * dev_info,
struct llama_model * model,
llama_model_loader * ml,
int cuda_mem,
int n_predict,
int n_ctx,
int n_threads,
@@ -3619,7 +3620,7 @@ void llama_profile_device(
dev_info->gpu_props.name = gpu_props.name;
dev_info->gpu_props.description = gpu_props.description;
dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
dev_info->gpu_props.memory_free = std::min((double)cuda_mem, round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100);
dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();
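
The final change caps the free VRAM that the profiler reports: the byte count is converted to GiB, rounded to two decimals, and then clamped to the user-supplied budget with std::min. A standalone sketch of that arithmetic (helper name assumed, not from the repo):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Report free VRAM in GiB, rounded to two decimals, but never more than
// the --cuda-mem budget.
static double capped_free_vram_gib(uint64_t memory_free_bytes, int cuda_mem_gib) {
    const double free_gib = std::round(memory_free_bytes / (double)(1 << 30) * 100) / 100;
    return std::min((double)cuda_mem_gib, free_gib);
}

// Example: 24 GiB free with --cuda-mem 8 reports 8.00 GiB usable.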