diff --git a/common/arg.cpp b/common/arg.cpp
index 794ebfbf..24cc7045 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -730,6 +730,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.unload = true;
         }
     ).set_env("LLAMA_ARG_UNLOAD"));
+    add_opt(llama_arg(
+        {"-cm", "--cuda-mem"}, "N",
+        format("maximum cuda memory to use in GiB (default: %d)", params.cuda_mem),
+        [](gpt_params & params, int value) {
+            params.cuda_mem = value; // in GiB
+        }
+    ).set_env("LLAMA_ARG_CUDA_MEM"));
     add_opt(llama_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
diff --git a/common/common.cpp b/common/common.cpp
index a835709f..cfa06a2e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -948,7 +948,6 @@ static void assign_device(
     }
 
 #if defined(USE_HIGHS)
-    // stores the actual read bandwidth (GB/s) for each device
     std::vector<float> disk_speed(n_world, 0.0f);
 
     for (uint32_t m = 0; m < n_world; ++m) {
@@ -1339,6 +1338,13 @@
     }
 
     LOG_INF("Total latency: %.3f\n", final_objective);
+#else
+    (void)bi;
+    (void)bo;
+    (void)kappa;
+    (void)cparams;
+    (void)min_disk_read_speed;
+
 #endif
 
     // copy value from w and n to n_layer_window and n_gpu_layers, respectively
@@ -1400,7 +1406,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    llama_profile_device(&dev_info, model, ml, params.cuda_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
diff --git a/common/common.h b/common/common.h
index fd9af3a8..d8139a3c 100644
--- a/common/common.h
+++ b/common/common.h
@@ -148,6 +148,7 @@ struct gpt_params {
     std::string master_ip = "localhost"; // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
+    int32_t cuda_mem = 999; // maximum cuda memory to use, in GiB
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/include/llama.h b/include/llama.h
index fc2e52b1..a99c70b2 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -416,6 +416,7 @@ extern "C" {
             struct device_info * dev_info,
             struct llama_model * model,
             struct llama_model_loader * ml,
+            int cuda_mem,
             int n_predict,
             int n_ctx,
             int n_threads,
diff --git a/src/llama.cpp b/src/llama.cpp
index 3126651f..1ca03938 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3574,6 +3574,7 @@ void llama_profile_device(
         device_info * dev_info,
         struct llama_model * model,
         llama_model_loader * ml,
+        int cuda_mem,
         int n_predict,
         int n_ctx,
         int n_threads,
@@ -3619,7 +3620,7 @@ void llama_profile_device(
     dev_info->gpu_props.name = gpu_props.name;
     dev_info->gpu_props.description = gpu_props.description;
-    dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
+    dev_info->gpu_props.memory_free = std::min((double)cuda_mem, round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100);
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
     dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();