From 1c0087e919ef570c0ddd2fac515977fd40087a7b Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Thu, 23 Jan 2025 23:17:06 +0400
Subject: [PATCH] rename arg --keep-inp-out-in-metal to --keep-out-in-metal

---
 common/arg.cpp    |  6 +++---
 common/common.cpp |  6 +++---
 common/common.h   |  2 +-
 include/llama.h   |  4 ++--
 src/llama.cpp     | 10 +++++-----
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index cd4bad7c..e189b5ce 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -739,10 +739,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     ).set_env("LLAMA_ARG_CUDA_MEM"));
 #ifdef GGML_USE_METAL
     add_opt(llama_arg(
-        {"--keep-inp-out-in-metal"},
-        format("whether to keep input and output weight in metal (default: %s)", params.keep_inp_out_in_metal ? "true" : "false"),
+        {"--keep-out-in-metal"},
+        format("whether to keep output weights in metal memory (default: %s)", params.keep_out_in_metal ? "true" : "false"),
         [](gpt_params & params) {
-            params.keep_inp_out_in_metal = true;
+            params.keep_out_in_metal = true;
         }
     ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
 #endif
diff --git a/common/common.cpp b/common/common.cpp
index a21bbd71..226958d7 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1162,7 +1162,7 @@ static void assign_device(
     if (dev.gpu_support.cuda || dev.gpu_support.metal) {
         float reserved_mem = 0.1f; // reserved shared memory to avoid potential OOM, set to 100 MiB by default
         vec_z_gpu[m] = (double)((dev.gpu_props.memory_free - reserved_mem) * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
-        if (dev.gpu_support.metal && m == 0 && cparams.keep_inp_out_in_metal) {
+        if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
             vec_z_gpu[m] -= (double)(bi + bo) / (double)(n_layer * b_prime);
         }
         dev_gpu[m] = 1;
@@ -1624,7 +1624,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
-    mparams.keep_inp_out_in_metal = params.keep_inp_out_in_metal;
+    mparams.keep_out_in_metal = params.keep_out_in_metal;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
@@ -1671,7 +1671,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params &
     cparams.n_world = params.n_world;
     cparams.rank = params.rank;
     cparams.unload = params.unload;
-    cparams.keep_inp_out_in_metal = params.keep_inp_out_in_metal;
+    cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers = params.n_gpu_layers;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
diff --git a/common/common.h b/common/common.h
index ae40443f..fece83f3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -148,7 +148,7 @@ struct gpt_params {
     std::string master_ip = "localhost"; // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
-    bool keep_inp_out_in_metal = false; // whether to keep input/output weight in metal, not by default
+    bool keep_out_in_metal = false; // whether to keep output weights in metal memory, not by default
     int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
diff --git a/include/llama.h b/include/llama.h
index 0777fe76..b7c170ab 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -312,7 +312,7 @@ extern "C" {
         bool use_mmap; // use mmap if possible
         bool use_mlock; // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
-        bool keep_inp_out_in_metal; // whether to keep input/output weight in metal
+        bool keep_out_in_metal; // whether to keep output weights in metal memory
     };

     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -323,7 +323,7 @@ extern "C" {
         uint32_t n_layer_window[32];// number of layers to process in each compute
         uint32_t n_gpu_layers; // number of layers to process on GPU
         bool unload; // whether to unload layer weights after use
-        bool keep_inp_out_in_metal; // whether to keep input/output weight in metal
+        bool keep_out_in_metal; // whether to keep output weights in metal memory
         char * master_ip; // ip address of the master node
         char * next_node_ip; // ip address of the next node
         uint32_t n_ctx; // text context, 0 = from model
diff --git a/src/llama.cpp b/src/llama.cpp
index 746e847b..aec23444 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7397,7 +7397,7 @@ static bool llm_load_tensors_impl(
         enum llama_split_mode split_mode,
         int main_gpu,
         bool use_mlock,
-        bool keep_inp_out_in_metal,
+        bool keep_out_in_metal,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;
@@ -9283,7 +9283,7 @@ static bool llm_load_tensors_impl(
         void * addr = nullptr;
         auto & ranges = ctx_buffer_ranges[idx];
-        ml.get_mapping_ranges(ranges, &addr, idx, ctx, keep_inp_out_in_metal ? cpu_ctx : nullptr);
+        ml.get_mapping_ranges(ranges, &addr, idx, ctx, keep_out_in_metal ? cpu_ctx : nullptr);

         for (const auto & range : ranges) {
             size_t first = range.first;
@@ -9407,7 +9407,7 @@ int llm_load_tensors(
     try {
         if (!llm_load_tensors_impl(
             *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
-            params.main_gpu, params.use_mlock, params.keep_inp_out_in_metal, params.progress_callback, params.progress_callback_user_data
+            params.main_gpu, params.use_mlock, params.keep_out_in_metal, params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
         }
@@ -19784,7 +19784,7 @@ struct llama_model_params llama_model_default_params() {
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.check_tensors =*/ false,
-        /*.keep_inp_out_in_metal =*/ false,
+        /*.keep_out_in_metal =*/ false,
     };

 #ifdef GGML_USE_METAL
@@ -19802,7 +19802,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_layer_window =*/ {32},
         /*.n_gpu_layers =*/ 0,
         /*.unload =*/ false,
-        /*.keep_inp_out_in_metal =*/ false,
+        /*.keep_out_in_metal =*/ false,
         /*.master_ip =*/ nullptr,
         /*.next_node_ip =*/ nullptr,
         /*.n_ctx =*/ 512,
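
For callers that use the C API directly rather than the --keep-out-in-metal CLI flag, the following is a minimal sketch (not part of the patch) of enabling the renamed field; it assumes only the llama.h fields and default-params helpers visible in the hunks above, and the actual model loading is elided:

// Sketch only: exercises the renamed keep_out_in_metal fields from this patch.
#include "llama.h"

int main(void) {
    // Library defaults leave keep_out_in_metal = false, as initialized in
    // llama_model_default_params() and llama_context_default_params() above.
    struct llama_model_params   mparams = llama_model_default_params();
    struct llama_context_params cparams = llama_context_default_params();

    // Opt in to keeping the output weights resident in Metal memory; this is
    // the same thing --keep-out-in-metal sets via gpt_params on the CLI path.
    mparams.keep_out_in_metal = true;
    cparams.keep_out_in_metal = true;

    // ... load a model with mparams and create a context with cparams ...
    return 0;
}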