rename arg --keep-inp-out-in-metal to --keep-out-in-metal

This commit is contained in:
Lizonghang 2025-01-23 23:17:06 +04:00
parent 5fcf020cfb
commit 1c0087e919
5 changed files with 14 additions and 14 deletions

View file

@@ -739,10 +739,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
).set_env("LLAMA_ARG_CUDA_MEM")); ).set_env("LLAMA_ARG_CUDA_MEM"));
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
add_opt(llama_arg( add_opt(llama_arg(
{"--keep-inp-out-in-metal"}, {"--keep-out-in-metal"},
format("whether to keep input and output weight in metal (default: %s)", params.keep_inp_out_in_metal ? "true" : "false"), format("whether to keep output weights in metal memory (default: %s)", params.keep_out_in_metal ? "true" : "false"),
[](gpt_params & params) { [](gpt_params & params) {
params.keep_inp_out_in_metal = true; params.keep_out_in_metal = true;
} }
).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL")); ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
#endif #endif

View file

@@ -1162,7 +1162,7 @@ static void assign_device(
if (dev.gpu_support.cuda || dev.gpu_support.metal) { if (dev.gpu_support.cuda || dev.gpu_support.metal) {
float reserved_mem = 0.1f; // reserved shared memory to avoid potential OOM, set to 100 MiB by default float reserved_mem = 0.1f; // reserved shared memory to avoid potential OOM, set to 100 MiB by default
vec_z_gpu[m] = (double)((dev.gpu_props.memory_free - reserved_mem) * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime); vec_z_gpu[m] = (double)((dev.gpu_props.memory_free - reserved_mem) * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
if (dev.gpu_support.metal && m == 0 && cparams.keep_inp_out_in_metal) { if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
vec_z_gpu[m] -= (double)(bi + bo) / (double)(n_layer * b_prime); vec_z_gpu[m] -= (double)(bi + bo) / (double)(n_layer * b_prime);
} }
dev_gpu[m] = 1; dev_gpu[m] = 1;
@@ -1624,7 +1624,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap; mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock; mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors; mparams.check_tensors = params.check_tensors;
mparams.keep_inp_out_in_metal = params.keep_inp_out_in_metal; mparams.keep_out_in_metal = params.keep_out_in_metal;
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window); std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);
if (params.kv_overrides.empty()) { if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL; mparams.kv_overrides = NULL;
@@ -1671,7 +1671,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_world = params.n_world; cparams.n_world = params.n_world;
cparams.rank = params.rank; cparams.rank = params.rank;
cparams.unload = params.unload; cparams.unload = params.unload;
cparams.keep_inp_out_in_metal = params.keep_inp_out_in_metal; cparams.keep_out_in_metal = params.keep_out_in_metal;
cparams.n_gpu_layers = params.n_gpu_layers; cparams.n_gpu_layers = params.n_gpu_layers;
std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window); std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);

View file

@@ -148,7 +148,7 @@ struct gpt_params {
std::string master_ip = "localhost"; // ip address of the master node std::string master_ip = "localhost"; // ip address of the master node
std::string next_node_ip = "localhost"; // ip address of my next node std::string next_node_ip = "localhost"; // ip address of my next node
bool unload = false; // unload layer weights after use or not bool unload = false; // unload layer weights after use or not
bool keep_inp_out_in_metal = false; // whether to keep input/output weight in metal, not by default bool keep_out_in_metal = false; // whether to keep output weights in metal memory, not by default
int32_t gpu_mem = 999.0; // gpu memory to use, in GiB int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size int32_t n_ctx = 0; // context size

View file

@@ -312,7 +312,7 @@ extern "C" {
bool use_mmap; // use mmap if possible bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data bool check_tensors; // validate model tensor data
bool keep_inp_out_in_metal; // whether to keep input/output weight in metal bool keep_out_in_metal; // whether to keep output weights in metal memory
}; };
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -323,7 +323,7 @@ extern "C" {
uint32_t n_layer_window[32];// number of layers to process in each compute uint32_t n_layer_window[32];// number of layers to process in each compute
uint32_t n_gpu_layers; // number of layers to process on GPU uint32_t n_gpu_layers; // number of layers to process on GPU
bool unload; // whether to unload layer weights after use bool unload; // whether to unload layer weights after use
bool keep_inp_out_in_metal; // whether to keep input/output weight in metal bool keep_out_in_metal; // whether to keep output weights in metal memory
char * master_ip; // ip address of the master node char * master_ip; // ip address of the master node
char * next_node_ip; // ip address of the next node char * next_node_ip; // ip address of the next node
uint32_t n_ctx; // text context, 0 = from model uint32_t n_ctx; // text context, 0 = from model

View file

@@ -7397,7 +7397,7 @@ static bool llm_load_tensors_impl(
enum llama_split_mode split_mode, enum llama_split_mode split_mode,
int main_gpu, int main_gpu,
bool use_mlock, bool use_mlock,
bool keep_inp_out_in_metal, bool keep_out_in_metal,
llama_progress_callback progress_callback, llama_progress_callback progress_callback,
void * progress_callback_user_data) { void * progress_callback_user_data) {
auto & hparams = model.hparams; auto & hparams = model.hparams;
@@ -9283,7 +9283,7 @@ static bool llm_load_tensors_impl(
void * addr = nullptr; void * addr = nullptr;
auto & ranges = ctx_buffer_ranges[idx]; auto & ranges = ctx_buffer_ranges[idx];
ml.get_mapping_ranges(ranges, &addr, idx, ctx, keep_inp_out_in_metal ? cpu_ctx : nullptr); ml.get_mapping_ranges(ranges, &addr, idx, ctx, keep_out_in_metal ? cpu_ctx : nullptr);
for (const auto & range : ranges) { for (const auto & range : ranges) {
size_t first = range.first; size_t first = range.first;
@@ -9407,7 +9407,7 @@ int llm_load_tensors(
try { try {
if (!llm_load_tensors_impl( if (!llm_load_tensors_impl(
*ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode, *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
params.main_gpu, params.use_mlock, params.keep_inp_out_in_metal, params.progress_callback, params.progress_callback_user_data params.main_gpu, params.use_mlock, params.keep_out_in_metal, params.progress_callback, params.progress_callback_user_data
)) { )) {
return -2; return -2;
} }
@@ -19784,7 +19784,7 @@ struct llama_model_params llama_model_default_params() {
/*.use_mmap =*/ true, /*.use_mmap =*/ true,
/*.use_mlock =*/ false, /*.use_mlock =*/ false,
/*.check_tensors =*/ false, /*.check_tensors =*/ false,
/*.keep_inp_out_in_metal =*/ false, /*.keep_out_in_metal =*/ false,
}; };
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL
@@ -19802,7 +19802,7 @@ struct llama_context_params llama_context_default_params() {
/*.n_layer_window =*/ {32}, /*.n_layer_window =*/ {32},
/*.n_gpu_layers =*/ 0, /*.n_gpu_layers =*/ 0,
/*.unload =*/ false, /*.unload =*/ false,
/*.keep_inp_out_in_metal =*/ false, /*.keep_out_in_metal =*/ false,
/*.master_ip =*/ nullptr, /*.master_ip =*/ nullptr,
/*.next_node_ip =*/ nullptr, /*.next_node_ip =*/ nullptr,
/*.n_ctx =*/ 512, /*.n_ctx =*/ 512,