From c84f9d29fe5852b73c89244a148f1f1dae64b940 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 17:04:41 +0400
Subject: [PATCH] use arg prefetch and remove arg unload

---
 common/arg.cpp    |  6 +++---
 common/common.cpp |  2 +-
 common/common.h   |  2 +-
 include/llama.h   |  2 +-
 src/llama.cpp     | 22 +++++++++-------------
 5 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 12c7788c..0820dbe3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -724,10 +724,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         }
     ).set_env("LLAMA_ARG_NEXT_NODE_IP"));
     add_opt(llama_arg(
-        {"--unload", "--unload-weight"},
-        format("whether to unload layer weights after use (default: %s)", params.unload ? "true" : "false"),
+        {"--prefetch"},
+        format("whether to prefetch layer weights (default: %s)", params.prefetch ? "true" : "false"),
         [](gpt_params & params) {
-            params.unload = true;
+            params.prefetch = true;
         }
     ).set_env("LLAMA_ARG_UNLOAD"));
     add_opt(llama_arg(
diff --git a/common/common.cpp b/common/common.cpp
index 765b64c1..447de272 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1714,7 +1714,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 
     cparams.n_world           = params.n_world;
     cparams.rank              = params.rank;
-    cparams.unload            = params.unload;
+    cparams.prefetch          = params.prefetch;
     cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers      = params.n_gpu_layers;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
diff --git a/common/common.h b/common/common.h
index 9ac200c1..25424612 100644
--- a/common/common.h
+++ b/common/common.h
@@ -147,7 +147,7 @@ struct gpt_params {
     uint32_t n_layer_window[32] = {0};       // layer window size on each node
     std::string master_ip = "localhost";     // ip address of the master node
     std::string next_node_ip = "localhost";  // ip address of my next node
-    bool unload = false;                     // unload layer weights after use or not
+    bool prefetch = false;                   // prefetch layer weights
     bool keep_out_in_metal = true;           // whether to keep output weights in metal memory, true by default
     int32_t gpu_mem = 999.0;                 // gpu memory to use, in GiB
     int32_t n_predict = -1;                  // new tokens to predict
diff --git a/include/llama.h b/include/llama.h
index b7c170ab..259cb2ea 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -322,7 +322,7 @@ extern "C" {
         uint32_t rank;               // my rank
         uint32_t n_layer_window[32]; // number of layers to process in each compute
         uint32_t n_gpu_layers;       // number of layers to process on GPU
-        bool     unload;             // whether to unload layer weights after use
+        bool     prefetch;           // whether to prefetch layer weights
         bool     keep_out_in_metal;  // whether to keep output weights in metal memory
         char *   master_ip;          // ip address of the master node
         char *   next_node_ip;       // ip address of the next node
diff --git a/src/llama.cpp b/src/llama.cpp
index 55df72c5..1977f79b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2571,7 +2571,7 @@ struct llama_cparams {
     uint32_t n_world;
     uint32_t rank;
     uint32_t n_layer_window[32];
-    bool     unload;
+    bool     prefetch;
     uint32_t n_ctx;    // context size used during inference
     uint32_t n_batch;
     uint32_t n_ubatch;
@@ -17770,7 +17770,7 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
     return float(n_loaded) / float(n_total) * 100.0f;
 }
 
-static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool force = false) {
+static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
     long page_size = sysconf(_SC_PAGESIZE);
 
     struct Segment {
@@ -17826,8 +17826,8 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
         size_t prefetch_dense = 4;
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice);  // hint to load into memory
-        // force to prefetch data
-        if (force && advice == POSIX_MADV_WILLNEED && false) {
+        // force to prefetch data, disabled by default
+        if (advice == POSIX_MADV_WILLNEED && false) {
             volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
             for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
                 for (size_t i = 0; i < prefetch_dense; i++) {
@@ -18104,17 +18104,13 @@ static int llama_decode_internal(
         }
 
         // overlap memory scheduling with other nodes' communication and computing
-        {
+        if (cparams.prefetch && n_world > 1) {
             timer(manage_graph_tensors);
             int next_gf_id = (i + 1) % gf.size();
-            manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, n_world > 1);
+            manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED);
 
             if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
-                manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, n_world > 1);
-            }
-
-            if (cparams.unload && n_world > 1) {
-                manage_graph_tensors(sub_gf, POSIX_MADV_DONTNEED);
+                manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED);
             }
         }
     }
@@ -19837,7 +19833,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rank              =*/ 0,
         /*.n_layer_window    =*/ {32},
         /*.n_gpu_layers      =*/ 0,
-        /*.unload            =*/ false,
+        /*.prefetch          =*/ false,
         /*.keep_out_in_metal =*/ true,
         /*.master_ip         =*/ nullptr,
         /*.next_node_ip      =*/ nullptr,
@@ -20265,7 +20261,7 @@ void * llama_context_setup_backend(
    auto & cparams = ctx->cparams;
 
    std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
-    cparams.unload           = params.unload;
+    cparams.prefetch         = params.prefetch;
    cparams.n_seq_max        = std::max(1u, params.n_seq_max);
    cparams.n_threads        = params.n_threads;
    cparams.n_threads_batch  = params.n_threads_batch;
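
Note: the sketch below is illustrative and not part of the patch. It shows the page-aligned posix_madvise(POSIX_MADV_WILLNEED) hint that manage_graph_tensors issues per tensor segment; prefetch_region and its parameters are hypothetical names, not symbols from this codebase.

    #include <unistd.h>
    #include <sys/mman.h>
    #include <cstdint>
    #include <algorithm>

    // Ask the kernel to page in an mmap-backed weight region ahead of use.
    static void prefetch_region(void * data, size_t size) {
        long page_size = sysconf(_SC_PAGESIZE);
        // posix_madvise operates on whole pages: round the start down to a
        // page boundary and cover at least one page.
        uintptr_t start = reinterpret_cast<uintptr_t>(data) & ~static_cast<uintptr_t>(page_size - 1);
        size_t    len   = std::max<size_t>(reinterpret_cast<uintptr_t>(data) + size - start, page_size);
        posix_madvise(reinterpret_cast<void *>(start), len, POSIX_MADV_WILLNEED);
    }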