add arg --cuda-mem

Zonghang Li 2025-01-16 09:15:34 +04:00
parent dab6b2e1c2
commit 46e99218b4
5 changed files with 19 additions and 3 deletions

View file

@@ -730,6 +730,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
params.unload = true;
}
).set_env("LLAMA_ARG_UNLOAD"));
add_opt(llama_arg(
{"-cm", "--cuda-mem"}, "N",
format("maximum cuda memory to use (default: %d)", params.cuda_mem),
[](gpt_params & params, int value) {
params.cuda_mem = value; // in GiB
}
).set_env("LLAMA_ARG_CUDA_MEM"));
add_opt(llama_arg(
{"-n", "--predict", "--n-predict"}, "N",
format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),

View file

@@ -948,7 +948,6 @@ static void assign_device(
}
#if defined(USE_HIGHS)
// stores the actual read bandwidth (GB/s) for each device
std::vector<float> disk_speed(n_world, 0.0f);
for (uint32_t m = 0; m < n_world; ++m) {
@@ -1339,6 +1338,13 @@ static void assign_device(
}
LOG_INF("Total latency: %.3f\n", final_objective);
#else
(void)bi;
(void)bo;
(void)kappa;
(void)cparams;
(void)min_disk_read_speed;
#endif
// copy value from w and n to n_layer_window and n_gpu_layers, respectively
@@ -1400,7 +1406,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
device_info dev_info;
dev_info.rank = params.rank;
llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
llama_profile_device(&dev_info, model, ml, params.cuda_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
// create llama context
struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
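
The #else branch added above only casts the solver inputs to void so that builds without the HiGHS solver (USE_HIGHS undefined) do not emit unused-variable warnings; the call into llama_profile_device then also passes the new cuda_mem budget. A toy illustration of the warning-suppression idiom (placeholder function and names, not code from this repo):

static void solve_assignment(int bi, int bo, float kappa) {
#if defined(USE_HIGHS)
    // ... solver code that actually uses bi, bo, kappa ...
#else
    (void)bi;     // silence unused-variable warnings on the stub path
    (void)bo;
    (void)kappa;
#endif
}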

View file

@@ -148,6 +148,7 @@ struct gpt_params {
std::string master_ip = "localhost"; // ip address of the master node
std::string next_node_ip = "localhost"; // ip address of my next node
bool unload = false; // unload layer weights after use or not
int32_t cuda_mem = 999; // maximum CUDA memory to use, in GiB
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
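
The 999 GiB default effectively leaves the cap disabled. Since the field is expressed in GiB, converting it to a byte budget is a single shift (illustrative only; no such conversion appears in this diff):

#include <cstdint>

// 1 GiB = 2^30 bytes, so a GiB budget becomes bytes with a left shift.
static uint64_t cuda_mem_bytes(int32_t cuda_mem_gib) {
    return (uint64_t) cuda_mem_gib << 30;
}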

View file

@@ -416,6 +416,7 @@ extern "C" {
struct device_info * dev_info,
struct llama_model * model,
struct llama_model_loader * ml,
int cuda_mem,
int n_predict,
int n_ctx,
int n_threads,

View file

@@ -3574,6 +3574,7 @@ void llama_profile_device(
device_info * dev_info,
struct llama_model * model,
llama_model_loader * ml,
int cuda_mem,
int n_predict,
int n_ctx,
int n_threads,
@@ -3619,7 +3620,7 @@ void llama_profile_device(
dev_info->gpu_props.name = gpu_props.name;
dev_info->gpu_props.description = gpu_props.description;
dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
dev_info->gpu_props.memory_free = std::min((double)cuda_mem, round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100);
dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();
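
The final change caps the free VRAM that the profiler reports: the byte count is converted to GiB, rounded to two decimals, and then clamped to the user-supplied budget with std::min. A standalone sketch of that arithmetic (helper name assumed, not from the repo):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Report free VRAM in GiB, rounded to two decimals, but never more than
// the --cuda-mem budget.
static double capped_free_vram_gib(uint64_t memory_free_bytes, int cuda_mem_gib) {
    const double free_gib = std::round(memory_free_bytes / (double)(1 << 30) * 100) / 100;
    return std::min((double)cuda_mem_gib, free_gib);
}

// Example: 24 GiB free with --cuda-mem 8 reports 8.00 GiB usable.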