Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-05 23:49:04 +00:00)
add arg --cuda-mem
Commit 46e99218b4 (parent dab6b2e1c2)
5 changed files with 19 additions and 3 deletions
@@ -730,6 +730,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.unload = true;
         }
     ).set_env("LLAMA_ARG_UNLOAD"));
+    add_opt(llama_arg(
+        {"-cm", "--cuda-mem"}, "N",
+        format("maximum cuda memory to use (default: %d)", params.cuda_mem),
+        [](gpt_params & params, int value) {
+            params.cuda_mem = value; // in GiB
+        }
+    ).set_env("LLAMA_ARG_CUDA_MEM"));
     add_opt(llama_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
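For readers unfamiliar with the pattern, the new option behaves like any other integer flag with an environment-variable fallback: "-cm 8" / "--cuda-mem 8" on the command line, or LLAMA_ARG_CUDA_MEM=8 in the environment. Below is a minimal standalone sketch of that pattern; demo_params and parse_cuda_mem_gib are hypothetical stand-ins, not prima.cpp's llama_arg machinery.

    // Minimal sketch, not prima.cpp's llama_arg machinery: an integer flag
    // ("-cm N" / "--cuda-mem N") with an LLAMA_ARG_CUDA_MEM env-var fallback.
    #include <cstdlib>
    #include <cstring>

    struct demo_params {
        int cuda_mem = 999; // GiB; a large default behaves like "no cap"
    };

    static void parse_cuda_mem_gib(int argc, char ** argv, demo_params & params) {
        for (int i = 1; i + 1 < argc; ++i) {
            if (std::strcmp(argv[i], "-cm") == 0 || std::strcmp(argv[i], "--cuda-mem") == 0) {
                params.cuda_mem = std::atoi(argv[i + 1]); // value is in GiB, like the lambda above
                return;
            }
        }
        if (const char * env = std::getenv("LLAMA_ARG_CUDA_MEM")) {
            params.cuda_mem = std::atoi(env);
        }
    }

As the later hunks show, the parsed value ends up capping the amount of free GPU memory the device profiler reports.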
@@ -948,7 +948,6 @@ static void assign_device(
     }

 #if defined(USE_HIGHS)

     // stores the actual read bandwidth (GB/s) for each device
     std::vector<float> disk_speed(n_world, 0.0f);
     for (uint32_t m = 0; m < n_world; ++m) {
@@ -1339,6 +1338,13 @@ static void assign_device(
     }
     LOG_INF("Total latency: %.3f\n", final_objective);

+#else
+    (void)bi;
+    (void)bo;
+    (void)kappa;
+    (void)cparams;
+    (void)min_disk_read_speed;

 #endif

     // copy value from w and n to n_layer_window and n_gpu_layers, respectively
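The #else branch above is the usual idiom for builds without the HiGHS solver: parameters that only the solver path consumes are cast to void so they do not trigger unused-parameter warnings. A small illustrative sketch, with a made-up function in place of assign_device:

    // Illustrative only: bi, bo and kappa stand in for inputs that only the
    // solver path uses; the (void) casts keep non-USE_HIGHS builds warning-free.
    static double solve_demo(double bi, double bo, double kappa) {
    #if defined(USE_HIGHS)
        return bi + bo + kappa; // stand-in for the real optimization
    #else
        (void)bi;
        (void)bo;
        (void)kappa;
        return 0.0;             // trivial fallback when the solver is compiled out
    #endif
    }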
@@ -1400,7 +1406,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    llama_profile_device(&dev_info, model, ml, params.cuda_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);

     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
@@ -148,6 +148,7 @@ struct gpt_params {
     std::string master_ip = "localhost"; // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
+    int32_t cuda_mem = 999.0; // cuda memory to use, in GiB
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
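One detail worth noting in the hunk above: 999.0 is a double literal assigned to an int32_t, so it is implicitly truncated to 999; the default therefore acts as an effectively unlimited cap of 999 GiB. A trivial check of that conversion:

    #include <cassert>
    #include <cstdint>

    int main() {
        int32_t cuda_mem = 999.0; // double literal, implicitly converted to 999
        assert(cuda_mem == 999);  // i.e. the default is effectively "no cap"
        return 0;
    }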
@@ -416,6 +416,7 @@ extern "C" {
         struct device_info * dev_info,
         struct llama_model * model,
         struct llama_model_loader * ml,
+        int cuda_mem,
         int n_predict,
         int n_ctx,
         int n_threads,
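The declaration above and the definition below simply thread the new int cuda_mem parameter through; because of the extern "C" linkage, both signatures must stay identical and use C-compatible types. A hypothetical minimal pair showing the same pattern (profile_device_demo is not a real prima.cpp symbol):

    // Hypothetical pair: the extern "C" declaration a public header exposes and
    // its definition must keep identical, C-compatible signatures.
    extern "C" void profile_device_demo(int cuda_mem, int n_predict);

    extern "C" void profile_device_demo(int cuda_mem, int n_predict) {
        (void)cuda_mem;  // a real implementation would cap reported VRAM here
        (void)n_predict;
    }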
@@ -3574,6 +3574,7 @@ void llama_profile_device(
         device_info * dev_info,
         struct llama_model * model,
         llama_model_loader * ml,
+        int cuda_mem,
         int n_predict,
         int n_ctx,
         int n_threads,
@@ -3619,7 +3620,7 @@ void llama_profile_device(

     dev_info->gpu_props.name = gpu_props.name;
     dev_info->gpu_props.description = gpu_props.description;
-    dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
+    dev_info->gpu_props.memory_free = std::min((double)cuda_mem, round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100);
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
     dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();
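The last hunk is where --cuda-mem takes effect: free VRAM is converted from bytes to GiB, rounded to two decimals, and then clamped to the user-supplied limit, so the rest of the planner never sees more GPU memory than the user allows. A self-contained sketch of that arithmetic with made-up numbers:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int main() {
        uint64_t memory_free_bytes = 25769803776ull; // pretend the driver reports 24 GiB free
        int      cuda_mem          = 8;              // --cuda-mem 8

        // bytes -> GiB rounded to two decimals, as in llama_profile_device
        double free_gib = std::round(memory_free_bytes / (double)(1 << 30) * 100) / 100;

        // clamp to the user-supplied cap
        double capped = std::min((double)cuda_mem, free_gib);

        std::printf("reported free: %.2f GiB, used by the planner: %.2f GiB\n", free_gib, capped);
        return 0;
    }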