diff --git a/common/arg.cpp b/common/arg.cpp
index 794ebfbf..24cc7045 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -730,6 +730,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.unload = true;
         }
     ).set_env("LLAMA_ARG_UNLOAD"));
+    add_opt(llama_arg(
+        {"-cm", "--cuda-mem"}, "N",
+        format("maximum cuda memory to use in GiB (default: %d)", params.cuda_mem),
+        [](gpt_params & params, int value) {
+            params.cuda_mem = value; // in GiB
+        }
+    ).set_env("LLAMA_ARG_CUDA_MEM"));
     add_opt(llama_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
diff --git a/common/common.cpp b/common/common.cpp
index a835709f..cfa06a2e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -948,7 +948,6 @@ static void assign_device(
     }
 
 #if defined(USE_HIGHS)
-    // stores the actual read bandwidth (GB/s) for each device
     std::vector<float> disk_speed(n_world, 0.0f);
 
     for (uint32_t m = 0; m < n_world; ++m) {
@@ -1339,6 +1338,13 @@
     }
 
     LOG_INF("Total latency: %.3f\n", final_objective);
+#else
+    (void)bi;
+    (void)bo;
+    (void)kappa;
+    (void)cparams;
+    (void)min_disk_read_speed;
+
 #endif
 
     // copy value from w and n to n_layer_window and n_gpu_layers, respectively
@@ -1400,7 +1406,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    llama_profile_device(&dev_info, model, ml, params.cuda_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
diff --git a/common/common.h b/common/common.h
index fd9af3a8..d8139a3c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -148,6 +148,7 @@ struct gpt_params {
     std::string master_ip = "localhost"; // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
+    int32_t cuda_mem = 999; // maximum cuda memory to use, in GiB
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/include/llama.h b/include/llama.h
index fc2e52b1..a99c70b2 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -416,6 +416,7 @@ extern "C" {
             struct device_info * dev_info,
             struct llama_model * model,
             struct llama_model_loader * ml,
+            int cuda_mem,
             int n_predict,
             int n_ctx,
             int n_threads,
diff --git a/src/llama.cpp b/src/llama.cpp
index 3126651f..1ca03938 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3574,6 +3574,7 @@ void llama_profile_device(
         device_info * dev_info,
         struct llama_model * model,
         llama_model_loader * ml,
+        int cuda_mem,
         int n_predict,
         int n_ctx,
         int n_threads,
@@ -3619,7 +3620,7 @@ void llama_profile_device(
     dev_info->gpu_props.name = gpu_props.name;
     dev_info->gpu_props.description = gpu_props.description;
-    dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
+    dev_info->gpu_props.memory_free = std::min((double)cuda_mem, round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100);
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
     dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();