Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-06 13:49:02 +00:00)
add arg --cuda-mem
Commit 46e99218b4 (parent dab6b2e1c2)
5 changed files with 19 additions and 3 deletions

@@ -730,6 +730,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.unload = true;
         }
     ).set_env("LLAMA_ARG_UNLOAD"));
+    add_opt(llama_arg(
+        {"-cm", "--cuda-mem"}, "N",
+        format("maximum cuda memory to use (default: %d)", params.cuda_mem),
+        [](gpt_params & params, int value) {
+            params.cuda_mem = value; // in GiB
+        }
+    ).set_env("LLAMA_ARG_CUDA_MEM"));
     add_opt(llama_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),

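The new option follows the same llama_arg pattern as its neighbours: it accepts an integer N, interpreted in GiB, from -cm/--cuda-mem or from the LLAMA_ARG_CUDA_MEM environment variable and stores it in params.cuda_mem. Below is a minimal, standalone sketch of that resolution order, not the real llama_arg machinery; resolve_cuda_mem, the manual argv scan, and the assumed flag-over-environment precedence are illustrative only.

// Standalone sketch: how a -cm/--cuda-mem flag with an LLAMA_ARG_CUDA_MEM
// environment fallback could be resolved. Illustrative only.
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical helper: flag value first, then the env var, then the default
// (999 GiB, large enough to behave as "no cap").
static int resolve_cuda_mem(int argc, char ** argv, int def_gib = 999) {
    for (int i = 1; i + 1 < argc; ++i) {
        if (std::strcmp(argv[i], "-cm") == 0 || std::strcmp(argv[i], "--cuda-mem") == 0) {
            return std::atoi(argv[i + 1]); // value is interpreted in GiB
        }
    }
    if (const char * env = std::getenv("LLAMA_ARG_CUDA_MEM")) {
        return std::atoi(env);
    }
    return def_gib;
}

int main(int argc, char ** argv) {
    std::printf("cuda_mem = %d GiB\n", resolve_cuda_mem(argc, argv));
    return 0;
}

Under this sketch, passing --cuda-mem 8 and exporting LLAMA_ARG_CUDA_MEM=8 both yield an 8 GiB budget.
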
@@ -948,7 +948,6 @@ static void assign_device(
     }
 
 #if defined(USE_HIGHS)
-
     // stores the actual read bandwidth (GB/s) for each device
     std::vector<float> disk_speed(n_world, 0.0f);
     for (uint32_t m = 0; m < n_world; ++m) {

@@ -1339,6 +1338,13 @@ static void assign_device(
     }
     LOG_INF("Total latency: %.3f\n", final_objective);
 
+#else
+    (void)bi;
+    (void)bo;
+    (void)kappa;
+    (void)cparams;
+    (void)min_disk_read_speed;
+
 #endif
 
     // copy value from w and n to n_layer_window and n_gpu_layers, respectively

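The added #else branch keeps the non-HiGHS build warning-clean: each parameter that only the solver path uses is cast to void, so the compiler treats the omission as intentional rather than flagging unused parameters. A self-contained illustration of the idiom follows; USE_SOLVER and plan are stand-ins invented for the example, not prima.cpp symbols.

// Minimal illustration of the "(void)x;" idiom used in the #else branch above.
#include <cstdio>

#define USE_SOLVER 0 // stand-in for USE_HIGHS; illustrative only

static int plan(int budget, float bandwidth) {
#if USE_SOLVER
    return static_cast<int>(budget * bandwidth); // the "real" computation
#else
    (void)budget;    // explicitly discard, silencing unused-parameter warnings
    (void)bandwidth;
    return 0;        // fallback when the solver is not built in
#endif
}

int main() {
    std::printf("plan = %d\n", plan(8, 1.5f));
    return 0;
}
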
@@ -1400,7 +1406,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    llama_profile_device(&dev_info, model, ml, params.cuda_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);

@@ -148,6 +148,7 @@ struct gpt_params {
     std::string master_ip = "localhost"; // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
+    int32_t cuda_mem = 999.0; // cuda memory to use, in GiB
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)

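One detail in the new field: the default is written as the floating-point literal 999.0, but the member is an int32_t, so the initializer is implicitly truncated to 999 GiB, a budget large enough to act as "no cap" unless the user lowers it. A tiny check of that conversion, outside the real struct:

#include <cstdint>
#include <cstdio>

int main() {
    int32_t cuda_mem = 999.0;          // same initializer shape as in gpt_params; truncates to 999
    std::printf("%d GiB\n", cuda_mem); // prints: 999 GiB
    return 0;
}
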
@@ -416,6 +416,7 @@ extern "C" {
         struct device_info * dev_info,
         struct llama_model * model,
         struct llama_model_loader * ml,
+        int cuda_mem,
         int n_predict,
         int n_ctx,
         int n_threads,

@@ -3574,6 +3574,7 @@ void llama_profile_device(
         device_info * dev_info,
         struct llama_model * model,
         llama_model_loader * ml,
+        int cuda_mem,
         int n_predict,
         int n_ctx,
         int n_threads,

@@ -3619,7 +3620,7 @@ void llama_profile_device(
 
     dev_info->gpu_props.name = gpu_props.name;
     dev_info->gpu_props.description = gpu_props.description;
-    dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
+    dev_info->gpu_props.memory_free = std::min((double)cuda_mem, round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100);
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
     dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();

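This last hunk is where the budget takes effect: free VRAM reported in bytes is rounded to GiB with two decimals and then clamped to the user-supplied value, so device profiling (and any layer assignment built on it) never reports more CUDA memory than --cuda-mem allows. A minimal sketch of that arithmetic; capped_free_gib is a hypothetical helper mirroring the expression above, not a prima.cpp function.

// Sketch of the capping arithmetic: bytes -> GiB (two decimals), then clamp
// to the --cuda-mem budget.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static double capped_free_gib(uint64_t memory_free_bytes, int cuda_mem_gib) {
    double free_gib = std::round(memory_free_bytes / (double)(1 << 30) * 100) / 100;
    return std::min((double)cuda_mem_gib, free_gib);
}

int main() {
    // e.g. 22.5 GiB reported free, but the user passed --cuda-mem 8
    std::printf("%.2f GiB usable\n", capped_free_gib(45ull << 29, 8)); // prints: 8.00 GiB usable
    return 0;
}

With the default budget of 999 GiB the min() never binds, so behaviour is unchanged unless the flag is given.
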