From 1c0087e919ef570c0ddd2fac515977fd40087a7b Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Thu, 23 Jan 2025 23:17:06 +0400
Subject: [PATCH] rename arg --keep-inp-out-in-metal to --keep-out-in-metal

---
 common/arg.cpp    |  6 +++---
 common/common.cpp |  6 +++---
 common/common.h   |  2 +-
 include/llama.h   |  4 ++--
 src/llama.cpp     | 10 +++++-----
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index cd4bad7c..e189b5ce 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -739,10 +739,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     ).set_env("LLAMA_ARG_CUDA_MEM"));
 #ifdef GGML_USE_METAL
     add_opt(llama_arg(
-        {"--keep-inp-out-in-metal"},
-        format("whether to keep input and output weight in metal (default: %s)", params.keep_inp_out_in_metal ? "true" : "false"),
+        {"--keep-out-in-metal"},
+        format("whether to keep output weights in metal memory (default: %s)", params.keep_out_in_metal ? "true" : "false"),
         [](gpt_params & params) {
-            params.keep_inp_out_in_metal = true;
+            params.keep_out_in_metal = true;
         }
     ).set_env("LLAMA_ARG_KEEP_INP_OUT_IN_METAL"));
 #endif
diff --git a/common/common.cpp b/common/common.cpp
index a21bbd71..226958d7 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1162,7 +1162,7 @@ static void assign_device(
     if (dev.gpu_support.cuda || dev.gpu_support.metal) {
         float reserved_mem = 0.1f; // reserved shared memory to avoid potential OOM, set to 100 MiB by default
         vec_z_gpu[m] = (double)((dev.gpu_props.memory_free - reserved_mem) * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
-        if (dev.gpu_support.metal && m == 0 && cparams.keep_inp_out_in_metal) {
+        if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
             vec_z_gpu[m] -= (double)(bi + bo) / (double)(n_layer * b_prime);
         }
         dev_gpu[m] = 1;
@@ -1624,7 +1624,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
-    mparams.keep_inp_out_in_metal = params.keep_inp_out_in_metal;
+    mparams.keep_out_in_metal = params.keep_out_in_metal;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), mparams.n_layer_window);
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
@@ -1671,7 +1671,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params &
     cparams.n_world = params.n_world;
     cparams.rank = params.rank;
     cparams.unload = params.unload;
-    cparams.keep_inp_out_in_metal = params.keep_inp_out_in_metal;
+    cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers = params.n_gpu_layers;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
diff --git a/common/common.h b/common/common.h
index ae40443f..fece83f3 100644
--- a/common/common.h
+++ b/common/common.h
@@ -148,7 +148,7 @@ struct gpt_params {
     std::string master_ip = "localhost"; // ip address of the master node
     std::string next_node_ip = "localhost"; // ip address of my next node
     bool unload = false; // unload layer weights after use or not
-    bool keep_inp_out_in_metal = false; // whether to keep input/output weight in metal, not by default
+    bool keep_out_in_metal = false; // whether to keep output weights in metal memory, not by default
     int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
diff --git a/include/llama.h b/include/llama.h
index 0777fe76..b7c170ab 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -312,7 +312,7 @@ extern "C" {
         bool use_mmap; // use mmap if possible
         bool use_mlock; // force system to keep model in RAM
         bool check_tensors; // validate model tensor data
-        bool keep_inp_out_in_metal; // whether to keep input/output weight in metal
+        bool keep_out_in_metal; // whether to keep output weights in metal memory
     };

     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
@@ -323,7 +323,7 @@ extern "C" {
         uint32_t n_layer_window[32];// number of layers to process in each compute
         uint32_t n_gpu_layers; // number of layers to process on GPU
         bool unload; // whether to unload layer weights after use
-        bool keep_inp_out_in_metal; // whether to keep input/output weight in metal
+        bool keep_out_in_metal; // whether to keep output weights in metal memory
         char * master_ip; // ip address of the master node
         char * next_node_ip; // ip address of the next node
         uint32_t n_ctx; // text context, 0 = from model
diff --git a/src/llama.cpp b/src/llama.cpp
index 746e847b..aec23444 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7397,7 +7397,7 @@ static bool llm_load_tensors_impl(
         enum llama_split_mode split_mode,
         int main_gpu,
         bool use_mlock,
-        bool keep_inp_out_in_metal,
+        bool keep_out_in_metal,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
     auto & hparams = model.hparams;
@@ -9283,7 +9283,7 @@ static bool llm_load_tensors_impl(
         void * addr = nullptr;
         auto & ranges = ctx_buffer_ranges[idx];
-        ml.get_mapping_ranges(ranges, &addr, idx, ctx, keep_inp_out_in_metal ? cpu_ctx : nullptr);
+        ml.get_mapping_ranges(ranges, &addr, idx, ctx, keep_out_in_metal ? cpu_ctx : nullptr);

         for (const auto & range : ranges) {
             size_t first = range.first;
@@ -9407,7 +9407,7 @@ int llm_load_tensors(
     try {
         if (!llm_load_tensors_impl(
             *ml, *model, params.n_world, params.rank, params.n_layer_window, params.n_gpu_layers, params.split_mode,
-            params.main_gpu, params.use_mlock, params.keep_inp_out_in_metal, params.progress_callback, params.progress_callback_user_data
+            params.main_gpu, params.use_mlock, params.keep_out_in_metal, params.progress_callback, params.progress_callback_user_data
         )) {
             return -2;
         }
@@ -19784,7 +19784,7 @@ struct llama_model_params llama_model_default_params() {
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
         /*.check_tensors =*/ false,
-        /*.keep_inp_out_in_metal =*/ false,
+        /*.keep_out_in_metal =*/ false,
     };

 #ifdef GGML_USE_METAL
@@ -19802,7 +19802,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_layer_window =*/ {32},
         /*.n_gpu_layers =*/ 0,
         /*.unload =*/ false,
-        /*.keep_inp_out_in_metal =*/ false,
+        /*.keep_out_in_metal =*/ false,
         /*.master_ip =*/ nullptr,
         /*.next_node_ip =*/ nullptr,
         /*.n_ctx =*/ 512,
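
For callers that use the C API directly rather than the --keep-out-in-metal CLI flag, the following is a minimal sketch (not part of the patch) of enabling the renamed field; it assumes only the llama.h fields and default-params helpers visible in the hunks above, and the actual model loading is elided:

// Sketch only: exercises the renamed keep_out_in_metal fields from this patch.
#include "llama.h"

int main(void) {
    // Library defaults leave keep_out_in_metal = false, as initialized in
    // llama_model_default_params() and llama_context_default_params() above.
    struct llama_model_params   mparams = llama_model_default_params();
    struct llama_context_params cparams = llama_context_default_params();

    // Opt in to keeping the output weights resident in Metal memory; this is
    // the same thing --keep-out-in-metal sets via gpt_params on the CLI path.
    mparams.keep_out_in_metal = true;
    cparams.keep_out_in_metal = true;

    // ... load a model with mparams and create a context with cparams ...
    return 0;
}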