From 261c88f05852c45f86dede40333aa47abc1b4587 Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Tue, 11 Feb 2025 09:49:17 +0400
Subject: [PATCH 01/18] skip tensors on CUDA in manage_graph_tensors

---
 src/llama.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 30226968..0a9eabb5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17776,10 +17776,19 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
     for (int i = 0; i < ggml_graph_n_leafs(cgraph); i++) {
         struct ggml_tensor * cur = ggml_graph_leaf(cgraph, i);
+
         if (strstr(cur->name, "weight") == nullptr || cur->data == nullptr) { continue; }
+        const char * backend_name = ggml_backend_buffer_name(cur->buffer);
+        if (backend_name) {
+            std::string lower_name(backend_name);
+            std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+                           [](unsigned char c) { return std::tolower(c); });
+            if (lower_name.find("cuda") != std::string::npos) continue;
+        }
+
         size_t size  = ggml_nbytes(cur);
         size_t first = reinterpret_cast<size_t>(cur->data);
         size_t last  = first + size;

From 24974a488c9ef98f0551d6fbacb197a8f7e33b76 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 11 Feb 2025 11:06:33 +0400
Subject: [PATCH 02/18] assume 10% of active pages can be compressed on macOS UMA

---
 common/profiler.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 96a1c701..550092b8 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -521,9 +521,9 @@ static uint64_t device_host_physical_memory(bool available) {
         // active pages compression has higher priority than releasing the clean mmap-ed pages
         // some of the active pages can be compressed to save memory for our mmap-ed model weights
         if (is_uma_arch()) {
-            // assume 30% of active pages can be compressed on macOS UMA (an empirical value)
+            // assume 10% of active pages can be compressed on macOS UMA (an empirical value)
             // because GPU is more likely to use the inactive memory
-            memory += vm_stats.active_count * 0.3 * page_size;
+            memory += vm_stats.active_count * 0.1 * page_size;
         } else {
             // assume 50% of active pages can be compressed on macOS NUMA (an empirical value)
             memory += vm_stats.active_count * 0.5 * page_size;

From 3dd3138207378b98d5388940975735e311f704d3 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 11 Feb 2025 17:00:17 +0400
Subject: [PATCH 03/18] ignore tensors already in page cache when prefetching

---
 src/llama.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 0a9eabb5..5ceefb10 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17723,14 +17723,12 @@ static bool is_tensor_loaded(struct ggml_tensor * tensor) {
     // align addr
     llama_mmap::align_range(&first, &last, page_size);
     size_t len = std::max(last - first, static_cast<size_t>(page_size));
-
-    // calculate the number of pages to check
-    size_t page_count = (len + page_size - 1) / page_size;
+    size_t page_count = len / page_size;

 #ifdef __APPLE__
     char * mincore_res = new char[page_count];
 #else
-    unsigned char *mincore_res = new unsigned char[page_count]; // use 'unsigned char' for Linux
+    unsigned char * mincore_res = new unsigned char[page_count]; // use 'unsigned char' for Linux
 #endif

     // call mincore to check if pages are resident in memory
@@ -17759,6 +17757,13 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
         if (strstr(cur->name, "weight") == nullptr || cur->data == nullptr) { continue; }
+        const char * backend_name = ggml_backend_buffer_name(cur->buffer);
+        if (backend_name) {
+            std::string lower_name(backend_name);
+            std::transform(lower_name.begin(), lower_name.end(), lower_name.begin(),
+                           [](unsigned char c) { return std::tolower(c); });
+            if (lower_name.find("cuda") != std::string::npos) continue;
+        }
         if (is_tensor_loaded(cur)) n_loaded++;
         n_total++;
     }
@@ -17789,6 +17794,8 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
             if (lower_name.find("cuda") != std::string::npos) continue;
         }

+        if (is_tensor_loaded(cur)) continue;
+
         size_t size  = ggml_nbytes(cur);
         size_t first = reinterpret_cast<size_t>(cur->data);
         size_t last  = first + size;

From 65ad14140a3875fc9d8191cf6eb695687cc32fb0 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 11 Feb 2025 17:10:11 +0400
Subject: [PATCH 04/18] do not check loaded tensors due to increased latency

---
 src/llama.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 5ceefb10..70a7195b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17794,8 +17794,6 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
             if (lower_name.find("cuda") != std::string::npos) continue;
         }

-        if (is_tensor_loaded(cur)) continue;
-
         size_t size  = ggml_nbytes(cur);
         size_t first = reinterpret_cast<size_t>(cur->data);
         size_t last  = first + size;

From 6a50d494d298fe8d6568e8e9f6fb1da1c1f68b73 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Tue, 11 Feb 2025 17:25:06 +0400
Subject: [PATCH 05/18] increase prefetch density

---
 src/llama.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 70a7195b..9f613743 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17823,13 +17823,16 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
     }

     for (const auto & segment : merged_segments) {
+        size_t prefetch_dense = 4;
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
         // force to prefetch data
         if (force && advice == POSIX_MADV_WILLNEED) {
             volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
-            for (size_t off = 0; off < len; off += page_size) {
-                (void)ptr[off];
+            for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
+                for (size_t i = 0; i < prefetch_dense; i++) {
+                    if (off + i * page_size < len) (void)ptr[off + i * page_size];
+                }
             }
         }
     }

From b163918b46ed67db86a8346a6bd8a4c2f31b1e32 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 00:17:33 +0400
Subject: [PATCH 06/18] disable prefetch in standalone mode

---
 src/llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 9f613743..c41ebb8c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18108,9 +18108,9 @@ static int llama_decode_internal(
                 timer(manage_graph_tensors);
                 int next_gf_id = (i + 1) % gf.size();
-                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, true);
+                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, n_world > 1);
                 if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
-                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, true);
+                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, n_world > 1);
                 }

                 if (cparams.unload && n_world > 1) {
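The first six patches all revolve around page-cache management for mmap-ed weights: mincore() reports which pages of a tensor are resident, posix_madvise(POSIX_MADV_WILLNEED) asks the kernel to read ahead, and touching every few pages forces the read to actually happen. Below is a self-contained sketch of that pattern; the function names, the 4-page touch stride, and the "at least half resident" threshold are illustrative choices, not code from the tree.

    // Illustrative sketch (not from the tree): check residency of a mapped range
    // with mincore(), then ask the kernel to prefetch it and touch every Nth page.
    #include <sys/mman.h>
    #include <unistd.h>
    #include <cstddef>
    #include <vector>

    static bool range_mostly_resident(void * addr, size_t len) {
        const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
        size_t first = (size_t) addr & ~(page_size - 1);                  // align down to a page
        size_t last  = ((size_t) addr + len + page_size - 1) & ~(page_size - 1);
        size_t n_pages = (last - first) / page_size;
    #ifdef __APPLE__
        std::vector<char> res(n_pages);                                   // macOS expects char *
    #else
        std::vector<unsigned char> res(n_pages);                          // Linux expects unsigned char *
    #endif
        if (mincore((void *) first, last - first, res.data()) != 0) return false;
        size_t resident = 0;
        for (auto r : res) resident += (r & 1);                           // bit 0 = page is in core
        return resident * 2 >= n_pages;                                   // "mostly" = at least half
    }

    static void prefetch_range(void * addr, size_t len, size_t stride_pages = 4) {
        const size_t page_size = (size_t) sysconf(_SC_PAGESIZE);
        size_t first = (size_t) addr & ~(page_size - 1);
        size_t last  = ((size_t) addr + len + page_size - 1) & ~(page_size - 1);
        posix_madvise((void *) first, last - first, POSIX_MADV_WILLNEED); // async read-ahead hint
        volatile char * p = (volatile char *) first;
        for (size_t off = 0; off < last - first; off += stride_pages * page_size) {
            (void) p[off];                                                // touch to force the page in
        }
    }

The platform split mirrors the diff above: macOS declares the mincore() vector as plain char while Linux uses unsigned char, which is why the patched code keeps the #ifdef.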
From ea0e655a8b1a3cea883a77e4f156ab74e9612b2e Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 16:55:21 +0400
Subject: [PATCH 07/18] disable force fetching

---
 src/llama.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index c41ebb8c..99bb6bc3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17827,14 +17827,14 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
         // force to prefetch data
-        if (force && advice == POSIX_MADV_WILLNEED) {
-            volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
-            for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
-                for (size_t i = 0; i < prefetch_dense; i++) {
-                    if (off + i * page_size < len) (void)ptr[off + i * page_size];
-                }
-            }
-        }
+        // if (force && advice == POSIX_MADV_WILLNEED && false) {
+        //     volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
+        //     for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
+        //         for (size_t i = 0; i < prefetch_dense; i++) {
+        //             if (off + i * page_size < len) (void)ptr[off + i * page_size];
+        //         }
+        //     }
+        // }
     }
 }

From 708b1d8c897e83d6030227520387c995c173b5d9 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 16:55:44 +0400
Subject: [PATCH 08/18] disable force fetching

---
 src/llama.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 99bb6bc3..55df72c5 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17827,14 +17827,14 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
         // force to prefetch data
-        // if (force && advice == POSIX_MADV_WILLNEED && false) {
-        //     volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
-        //     for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
-        //         for (size_t i = 0; i < prefetch_dense; i++) {
-        //             if (off + i * page_size < len) (void)ptr[off + i * page_size];
-        //         }
-        //     }
-        // }
+        if (force && advice == POSIX_MADV_WILLNEED && false) {
+            volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
+            for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
+                for (size_t i = 0; i < prefetch_dense; i++) {
+                    if (off + i * page_size < len) (void)ptr[off + i * page_size];
+                }
+            }
+        }
     }
 }

From c84f9d29fe5852b73c89244a148f1f1dae64b940 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 17:04:41 +0400
Subject: [PATCH 09/18] use arg prefetch and remove arg unload

---
 common/arg.cpp    |  6 +++---
 common/common.cpp |  2 +-
 common/common.h   |  2 +-
 include/llama.h   |  2 +-
 src/llama.cpp     | 22 +++++++++-------------
 5 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 12c7788c..0820dbe3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -724,10 +724,10 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         }
     ).set_env("LLAMA_ARG_NEXT_NODE_IP"));
     add_opt(llama_arg(
-        {"--unload", "--unload-weight"},
-        format("whether to unload layer weights after use (default: %s)", params.unload ? "true" : "false"),
+        {"--prefetch"},
+        format("whether to prefetch layer weights (default: %s)", params.prefetch ? "true" : "false"),
         [](gpt_params & params) {
-            params.unload = true;
+            params.prefetch = true;
         }
     ).set_env("LLAMA_ARG_UNLOAD"));
     add_opt(llama_arg(
diff --git a/common/common.cpp b/common/common.cpp
index 765b64c1..447de272 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1714,7 +1714,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_world           = params.n_world;
     cparams.rank              = params.rank;
-    cparams.unload            = params.unload;
+    cparams.prefetch          = params.prefetch;
     cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers      = params.n_gpu_layers;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
diff --git a/common/common.h b/common/common.h
index 9ac200c1..25424612 100644
--- a/common/common.h
+++ b/common/common.h
@@ -147,7 +147,7 @@ struct gpt_params {
     uint32_t n_layer_window[32] = {0};         // layer window size on each node
     std::string master_ip       = "localhost"; // ip address of the master node
     std::string next_node_ip    = "localhost"; // ip address of my next node
-    bool unload                 = false;       // unload layer weights after use or not
+    bool prefetch               = false;       // prefetch layer weights
     bool keep_out_in_metal      = true;        // whether to keep output weights in metal memory, true by default
     int32_t gpu_mem             = 999.0;       // gpu memory to use, in GiB
     int32_t n_predict           = -1;          // new tokens to predict
diff --git a/include/llama.h b/include/llama.h
index b7c170ab..259cb2ea 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -322,7 +322,7 @@ extern "C" {
         uint32_t rank;              // my rank
         uint32_t n_layer_window[32];// number of layers to process in each compute
         uint32_t n_gpu_layers;      // number of layers to process on GPU
-        bool unload;                // whether to unload layer weights after use
+        bool prefetch;              // whether to prefetch layer weights
         bool keep_out_in_metal;     // whether to keep output weights in metal memory
         char * master_ip;           // ip address of the master node
         char * next_node_ip;        // ip address of the next node
diff --git a/src/llama.cpp b/src/llama.cpp
index 55df72c5..1977f79b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2571,7 +2571,7 @@ struct llama_cparams {
     uint32_t n_world;
     uint32_t rank;
     uint32_t n_layer_window[32];
-    bool unload;
+    bool prefetch;
     uint32_t n_ctx;           // context size used during inference
     uint32_t n_batch;
     uint32_t n_ubatch;
@@ -17770,7 +17770,7 @@ static float is_graph_loaded(struct ggml_cgraph * cgraph) {
     return float(n_loaded) / float(n_total) * 100.0f;
 }

-static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool force = false) {
+static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice) {
     long page_size = sysconf(_SC_PAGESIZE);

     struct Segment {
@@ -17826,8 +17826,8 @@ static void manage_graph_tensors(struct ggml_cgraph * cgraph, int advice, bool f
         size_t prefetch_dense = 4;
         size_t len = std::max(segment.end - segment.start, static_cast<size_t>(page_size));
         posix_madvise(reinterpret_cast<void *>(segment.start), len, advice); // hint to load into memory
-        // force to prefetch data
-        if (force && advice == POSIX_MADV_WILLNEED && false) {
+        // force to prefetch data, disabled by default
+        if (advice == POSIX_MADV_WILLNEED && false) {
             volatile char * ptr = reinterpret_cast<volatile char *>(segment.start);
             for (size_t off = 0; off < len; off += prefetch_dense * page_size) {
                 for (size_t i = 0; i < prefetch_dense; i++) {
@@ -18104,17 +18104,13 @@ static int llama_decode_internal(
             // overlap memory scheduling with other nodes' communication and computing
-            {
+            if (cparams.prefetch && n_world > 1) {
                 timer(manage_graph_tensors);
                 int next_gf_id = (i + 1) % gf.size();
-                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, n_world > 1);
+                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED);
                 if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
-                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, n_world > 1);
-                }
-
-                if (cparams.unload && n_world > 1) {
-                    manage_graph_tensors(sub_gf, POSIX_MADV_DONTNEED);
+                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED);
                 }
             }
         }
@@ -19837,7 +19833,7 @@ struct llama_context_params llama_context_default_params() {
         /*.rank                       =*/ 0,
         /*.n_layer_window             =*/ {32},
         /*.n_gpu_layers               =*/ 0,
-        /*.unload                     =*/ false,
+        /*.prefetch                   =*/ false,
         /*.keep_out_in_metal          =*/ true,
         /*.master_ip                  =*/ nullptr,
         /*.next_node_ip               =*/ nullptr,
@@ -20265,7 +20261,7 @@ void * llama_context_setup_backend(
     auto & cparams = ctx->cparams;

     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
-    cparams.unload          = params.unload;
+    cparams.prefetch        = params.prefetch;
     cparams.n_seq_max       = std::max(1u, params.n_seq_max);
     cparams.n_threads       = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch;

From fdfaaecd5e4fbdcc04cc4c93bd3f2b061cd2758a Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 12 Feb 2025 17:12:30 +0400
Subject: [PATCH 10/18] disable device profiler in standalone mode

---
 common/common.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 447de272..ec6a0d98 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1473,8 +1473,10 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     // get device profile
     LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
-
+    if (n_world > 1) {
+        llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
+    }
+
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
     llama_context * lctx = llama_new_context_with_model(model, cparams);
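Patches 07-10 retire the old --unload flow and thread a single --prefetch switch from the CLI into llama_context_params, gating both prefetching and device profiling on distributed runs (n_world > 1). Note that the surrounding context in common/arg.cpp still registers the option under the LLAMA_ARG_UNLOAD environment name; a hedged sketch of a fully renamed registration is shown below (LLAMA_ARG_PREFETCH is an assumption, it does not exist in the tree):

    add_opt(llama_arg(
        {"--prefetch"},
        format("whether to prefetch layer weights (default: %s)", params.prefetch ? "true" : "false"),
        [](gpt_params & params) {
            params.prefetch = true;
        }
    ).set_env("LLAMA_ARG_PREFETCH")); // hypothetical env name; the patch itself keeps LLAMA_ARG_UNLOAD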
From 630556bc1660e44d92e10c352916c1a1df0134fc Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Sat, 15 Feb 2025 17:23:19 +0400
Subject: [PATCH 11/18] fix default allocation strategy to avoid OOM

---
 common/common.cpp | 58 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index ec6a0d98..40603517 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -866,6 +866,10 @@ static bool assign_layers_to_device(
         return true;
     }

+    std::vector<uint32_t> w(n_world, 0);
+    std::vector<uint32_t> n(n_world, 0);
+    std::vector<float> mem_budget(n_world, 0.0f);
+
     const device_info &master = dev_info_set[0];

     // model-specific constants
@@ -879,14 +883,12 @@ static bool assign_layers_to_device(
     const int64_t bo      = dev_info_set[0].model_bytes.nb_output;
     const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;

+#if defined(USE_HIGHS)
     // device-specific constants
     std::vector<float> alpha(n_world, 0.0f);
     std::vector<float> beta(n_world, 0.0f);
     std::vector<float> xi(n_world, 0.0f);
     float kappa = 0.0f;
-    std::vector<uint32_t> w(n_world, 0);
-    std::vector<uint32_t> n(n_world, 0);
-    std::vector<float> mem_budget(n_world, 0.0f);

     // -------- Compute alpha[m], beta[m], xi[m] --------
     for (uint32_t m = 0; m < n_world; ++m) {
@@ -977,7 +979,6 @@ static bool assign_layers_to_device(
                              : std::min_element(mem_budget.begin(), mem_budget.end());
     w[std::distance(mem_budget.begin(), device)] += diff;

-#if defined(USE_HIGHS)
     // stores the actual read bandwidth (GB/s) for each device
     std::vector<float> disk_speed(n_world, 0.0f);
     for (uint32_t m = 0; m < n_world; ++m) {
@@ -1032,7 +1033,8 @@ static bool assign_layers_to_device(
         bool is_windows = strcmp(dev.device_os, "Windows") == 0;
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");

-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, dev.gpu_support.metal, m == 0, w[m] * k, n[m] * k);
+        bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);

         int l_m     = w[m] * k; // total number of layers assigned to device m
         int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1395,19 +1397,45 @@ static bool assign_layers_to_device(
     std::copy(n.begin(), n.end(), n_gpu_layers);

 #else
-    (void)bi;
-    (void)bo;
-    (void)kappa;
-    (void)cparams;
-    (void)min_disk_read_speed;
-    (void)n_vocab;
-    (void)GIGABYTE;
-
-    std::copy(w.begin(), w.end(), n_layer_window);

+    // assign layers according to RAM/VRAM
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
+        if (dev.gpu_support.metal || dev.gpu_support.cuda) {
+            mem_budget[m] = dev.gpu_props.memory_free;
+        } else {
+            mem_budget[m] = dev.memory.available_physical;
+        }
+    }
+
+    // initialize w_m proportionally to memory budget and n_m to 0
+    float total_mem_budget = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
+    for (uint32_t m = 0; m < n_world; ++m) {
+        w[m] = std::round(mem_budget[m] / total_mem_budget * n_layer);
+        n[m] = 0;
+    }
+    // adjust w[m] to ensure L mod W = 0
+    int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
+    auto device = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
+                             : std::min_element(mem_budget.begin(), mem_budget.end());
+    w[std::distance(mem_budget.begin(), device)] += diff;
+    std::copy(w.begin(), w.end(), n_layer_window);
+
+    std::vector<float> vec_z_gpu(n_world, 0.0f);
+    std::vector<int64_t> c_cpu(n_world, 0), c_gpu(n_world, 0);
+
+    for (uint32_t m = 0; m < n_world; ++m) {
+        const device_info & dev = dev_info_set[m];
+        bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);
+
         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
-            n_gpu_layers[m] = w[m];
+            int64_t required_mem  = w[m] * b_prime;
+            int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
+            if (required_mem <= available_mem) {
+                n_gpu_layers[m] = w[m];
+            } else {
+                n_gpu_layers[m] = available_mem / b_prime;
+            }
         }
     }

From 64c4a479805a4c018c9ed606080ccfdb2bc6c79c Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Sat, 15 Feb 2025 17:33:03 +0400
Subject: [PATCH 12/18] fix bugs and warnings

---
 common/common.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 40603517..bcd3c49a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -870,20 +870,20 @@ static bool assign_layers_to_device(
     std::vector<uint32_t> n(n_world, 0);
     std::vector<float> mem_budget(n_world, 0.0f);

-    const device_info &master = dev_info_set[0];
-
     // model-specific constants
     const int n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
     const int n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
-    const int n_vocab      = llama_n_vocab(model);
     const int n_kv         = cparams.n_ctx;
     const int64_t b       = dev_info_set[0].model_bytes.nb_layer;
-    const int64_t bi      = dev_info_set[0].model_bytes.nb_input;
-    const int64_t bo      = dev_info_set[0].model_bytes.nb_output;
     const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;

 #if defined(USE_HIGHS)
+    const device_info &master = dev_info_set[0];
+    const int n_vocab = llama_n_vocab(model);
+    const int64_t bi  = dev_info_set[0].model_bytes.nb_input;
+    const int64_t bo  = dev_info_set[0].model_bytes.nb_output;
+
     // device-specific constants
     std::vector<float> alpha(n_world, 0.0f);
     std::vector<float> beta(n_world, 0.0f);
     std::vector<float> xi(n_world, 0.0f);
@@ -1397,6 +1397,8 @@ static bool assign_layers_to_device(
     std::copy(n.begin(), n.end(), n_gpu_layers);

 #else
+    (void)min_disk_read_speed;
+
     // assign layers according to RAM/VRAM
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
@@ -1426,7 +1428,7 @@ static bool assign_layers_to_device(
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m], n[m]);

         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
             int64_t required_mem  = w[m] * b_prime;
             int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
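Without HiGHS, patches 11 and 12 fall back to splitting the model's layers in proportion to each device's free RAM or VRAM and then nudging a single device so the counts sum back to the layer total. A standalone sketch of that rounding scheme (names and types are illustrative, not taken from the tree):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // Split n_layer layers proportionally to each device's memory budget, then fix
    // the rounding drift so the counts sum back to exactly n_layer.
    static std::vector<uint32_t> split_layers(const std::vector<float> & mem_budget, uint32_t n_layer) {
        const float total = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
        std::vector<uint32_t> w(mem_budget.size(), 0);
        for (size_t m = 0; m < mem_budget.size(); ++m) {
            w[m] = (uint32_t) std::round(mem_budget[m] / total * n_layer);
        }
        // give the remainder to the largest budget, or take it from the smallest
        const int diff = (int) n_layer - (int) std::accumulate(w.begin(), w.end(), 0u);
        auto it = (diff > 0) ? std::max_element(mem_budget.begin(), mem_budget.end())
                             : std::min_element(mem_budget.begin(), mem_budget.end());
        const size_t target = (size_t) std::distance(mem_budget.begin(), it);
        w[target] = (uint32_t) ((int) w[target] + diff);
        return w;
    }

For instance, a 32-layer model on budgets of 18 GiB and 6 GiB rounds to 24 and 8 layers, which already sums to 32, so no adjustment is applied.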
From e64f237e04a3fc5fbf14dab802325871769f7cdc Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Sat, 15 Feb 2025 17:43:03 +0400
Subject: [PATCH 13/18] fix bugs in available_mem calculation

---
 common/common.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index bcd3c49a..f13a2e12 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1432,7 +1432,11 @@ static bool assign_layers_to_device(
         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
             int64_t required_mem  = w[m] * b_prime;
-            int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
+            int64_t available_mem = dev.gpu_props.memory_free * GIGABYTE - c_gpu[m];
+            if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
+                available_mem -= bo;
+            }
+
             if (required_mem <= available_mem) {
                 n_gpu_layers[m] = w[m];
             } else {

From 8532d030f324a7acc6429a2c449386cc421255c7 Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Sat, 15 Feb 2025 18:10:11 +0400
Subject: [PATCH 14/18] fix bugs in bo

---
 common/common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index f13a2e12..01022a26 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -876,13 +876,13 @@ static bool assign_layers_to_device(
     const int n_kv         = cparams.n_ctx;
     const int64_t b       = dev_info_set[0].model_bytes.nb_layer;
+    const int64_t bo      = dev_info_set[0].model_bytes.nb_output;
     const int64_t b_prime = b + 2 * (n_embd_k_gqa + n_embd_v_gqa) * n_kv;

 #if defined(USE_HIGHS)
     const device_info &master = dev_info_set[0];
     const int n_vocab = llama_n_vocab(model);
     const int64_t bi  = dev_info_set[0].model_bytes.nb_input;
-    const int64_t bo  = dev_info_set[0].model_bytes.nb_output;

     // device-specific constants
     std::vector<float> alpha(n_world, 0.0f);
     std::vector<float> beta(n_world, 0.0f);
     std::vector<float> xi(n_world, 0.0f);

From 863393554a36182ce33ca39fcbb679f8b49c24ab Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Mon, 17 Feb 2025 18:54:18 +0400
Subject: [PATCH 15/18] add gpu underutilization calibration step

---
 common/common.cpp | 57 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 01022a26..35da8f2c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1013,6 +1013,7 @@ static bool assign_layers_to_device(
     // M3: devices running on Linux or Android and with insufficient memory
     // M4: devices with sufficient memory or very slow disk I/O (slower than min_disk_io_speed)
     std::vector<uint32_t> M1, M2, M3, M4, M1_prev, M2_prev, M3_prev, M4_prev;
+    std::vector<bool> M4_force(n_world, false);
    std::vector<int64_t> c_cpu(n_world, 0), c_gpu(n_world, 0);

     // helper function to check if a device is in a specific set
@@ -1043,18 +1044,16 @@ static bool assign_layers_to_device(
         bool condition3 = (l_m - l_m_gpu) * b_prime + (bi / n_vocab + bo) * int(m == 0) + c_cpu[m] > mem_budget[m] * GIGABYTE;
         bool is_slow_disk = disk_speed[m] < min_disk_read_speed;

-        if (is_macos && !dev.gpu_support.metal && condition1 && !is_slow_disk) {
-            // case 1: macOS without Metal, and with insufficient memory
-            M1.push_back(m);
-        } else if (is_macos && dev.gpu_support.metal && condition2 && !is_slow_disk) {
-            // case 2: macOS with Metal, and with insufficient memory
-            M2.push_back(m);
-        } else if ((is_linux || is_android) && condition3 && !is_slow_disk) {
-            // case 3: Linux with insufficient memory
-            M3.push_back(m);
+        if (M4_force[m] || is_slow_disk) {
+            M4.push_back(m); // case 4: devices with very slow disk or force to be in M4
+        } else if (is_macos && !dev.gpu_support.metal && condition1) {
+            M1.push_back(m); // case 1: macOS without Metal, and with insufficient memory
+        } else if (is_macos && dev.gpu_support.metal && condition2) {
+            M2.push_back(m); // case 2: macOS with Metal, and with insufficient memory
+        } else if ((is_linux || is_android) && condition3) {
+            M3.push_back(m); // case 3: Linux with insufficient memory
         } else {
-            // case 4: otherwise, assigned to M4
-            M4.push_back(m);
+            M4.push_back(m); // case 4: devices with sufficient memory
         }
     }
@@ -1255,7 +1254,8 @@ static bool assign_layers_to_device(

     // constraint bound 4: CUDA/shared memory constraint for CUDA/Metal devices
     for (uint32_t m = 0; m < n_world; ++m) {
-        model.lp_.row_upper_[constraint_idx] = W * vec_z_gpu[m];
+        double upper_bound = W * vec_z_gpu[m];
+        model.lp_.row_upper_[constraint_idx] = (upper_bound > 0) ? std::max(upper_bound, 1.0) : upper_bound;
         constraint_idx++;
     }
@@ -1361,6 +1361,39 @@ static bool assign_layers_to_device(
                 k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str());
         }
+        // check the solution
+        bool is_set_suboptimal = false;
+        for (uint32_t m = 0; m < n_world; ++m) {
+            uint32_t w_m = best_solution[m], n_m = best_solution[m + n_world];
+            // if w[m] > n[m] and there is still free VRAM, the GPU is not fully utilized,
+            // indicating that the memory constraints are too strict, and the set assignment is suboptimal.
+            if (w_m > n_m && n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) {
+                is_set_suboptimal = true;
+            }
+        }
+
+        if (is_set_suboptimal) {
+            int worst_device = -1;
+            float worst_speed = std::numeric_limits<float>::max();
+
+            // find the device with slowest disk speed but was not in M4 yet
+            for (uint32_t m = 0; m < n_world; ++m) {
+                if (!in_set(m, M4) && disk_speed[m] < worst_speed) {
+                    worst_speed = disk_speed[m];
+                    worst_device = m;
+                }
+            }
+
+            if (worst_device != -1) {
+                M4_force[worst_device] = true;
+                LOG_INF("Forcing device %d (disk speed %.2f GB/s) into M4\n", worst_device, worst_speed);
+            } else {
+                LOG_INF("Infeasible solution detected but no device can be forced into M4\n");
+            }
+
+            continue;
+        }
+
         // update w[m] and n[m]
         GGML_ASSERT(best_solution.size() == n_world * 2 && "Invalid solution\n");
         std::copy(best_solution.begin(), best_solution.begin() + n_world, w.begin());
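Patch 15 wraps the HiGHS solve in a calibration loop: when the returned assignment keeps layers off the GPU even though some GPU memory is still free, the device with the slowest disk that has not yet been forced into M4 is forced there and the solve is repeated. A minimal sketch of that selection step is below; the boolean vector stands in for the patch's in_set(m, M4) membership test, and the names are illustrative.

    #include <cstdint>
    #include <limits>
    #include <vector>

    // Return the index of the slowest-disk device not yet forced into M4,
    // or -1 if every device has already been forced.
    static int pick_device_to_force(const std::vector<float> & disk_speed,
                                    const std::vector<bool>  & m4_force) {
        int   worst_device = -1;
        float worst_speed  = std::numeric_limits<float>::max();
        for (size_t m = 0; m < disk_speed.size(); ++m) {
            if (!m4_force[m] && disk_speed[m] < worst_speed) {
                worst_speed  = disk_speed[m];
                worst_device = (int) m;
            }
        }
        return worst_device;
    }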
From e219fada4e52f4071469cf6ae62d6766512cc94e Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 19 Feb 2025 16:24:12 +0400
Subject: [PATCH 16/18] disable timer

---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 1977f79b..2aeb0c69 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -106,7 +106,7 @@ struct Timer {
     const char * name;
     int64_t start_time;
-    bool enable_timer = true;
+    bool enable_timer = false;

     Timer(const char * name) : name(name), start_time(ggml_time_us()) {}
     ~Timer() {
         if (enable_timer) {

From 07a397360b420b6c08c7cf26ef1da6aab0c291c4 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Wed, 19 Feb 2025 16:30:18 +0400
Subject: [PATCH 17/18] fix gpu underutilization

---
 common/common.cpp | 23 ++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 35da8f2c..deaffc71 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1362,17 +1362,22 @@ static bool assign_layers_to_device(
         }
         // check the solution
-        bool is_set_suboptimal = false;
+        bool has_free_gpu_memory = false, has_overload = false;
         for (uint32_t m = 0; m < n_world; ++m) {
             uint32_t w_m = best_solution[m], n_m = best_solution[m + n_world];
-            // if w[m] > n[m] and there is still free VRAM, the GPU is not fully utilized,
-            // indicating that the memory constraints are too strict, and the set assignment is suboptimal.
-            if (w_m > n_m && n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) {
-                is_set_suboptimal = true;
-            }
+
+            // if there is still free GPU memory
+            if (n_m < static_cast<uint32_t>(std::round(W * vec_z_gpu[m]))) {
+                has_free_gpu_memory = true;
+            }
+
+            // if there is device overloaded
+            if (w_m > n_m) {
+                has_overload = true;
+            }
         }

-        if (is_set_suboptimal) {
+        if (has_free_gpu_memory && has_overload) {
             int worst_device = -1;
             float worst_speed = std::numeric_limits<float>::max();

             // find the device with slowest disk speed but was not in M4 yet
@@ -1422,8 +1427,8 @@ static bool assign_layers_to_device(
         LOG_INF(" - N Layer Window : %d\n", w[m]);
         LOG_INF(" - N GPU Layers : %d\n", n[m]);
     }
-    LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
-    LOG_INF("------------------------------------------");
+    // LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
+    // LOG_INF("------------------------------------------");

     // copy value from w and n to n_layer_window and n_gpu_layers, respectively
     std::copy(w.begin(), w.end(), n_layer_window);

From f5e874f75f722dbd498c9a7b198d6217f46e6388 Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Sun, 23 Feb 2025 01:38:13 +0400
Subject: [PATCH 18/18] remove conda path

---
 Makefile | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Makefile b/Makefile
index d2f459f6..a3f1bf04 100644
--- a/Makefile
+++ b/Makefile
@@ -274,15 +274,10 @@
 ifeq ($(USE_HIGHS),1)
     HIGHS_CPPFLAGS = -isystem /usr/local/include/highs
     HIGHS_LDFLAGS  = -L/usr/local/lib -lhighs
-
     ifeq ($(UNAME_S),Darwin)
         HIGHS_CPPFLAGS += -isystem /opt/homebrew/include/highs
         HIGHS_LDFLAGS  += -L/opt/homebrew/lib -lhighs
-    else ifneq ($(CONDA_PREFIX),)
-        HIGHS_CPPFLAGS += -isystem $(CONDA_PREFIX)/include -isystem $(CONDA_PREFIX)/include/highs
-        HIGHS_LDFLAGS  += -L$(CONDA_PREFIX)/lib -Wl,-rpath,$(CONDA_PREFIX)/lib
     endif
-
     MK_CPPFLAGS += $(HIGHS_CPPFLAGS) -DUSE_HIGHS
     MK_LDFLAGS  += $(HIGHS_LDFLAGS)
 endif
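Looking back at patch 17, the re-solve trigger is relaxed from "a device is overloaded and its own GPU still has room" to "any device still has free GPU memory while any device is overloaded", so spare VRAM on one node can absorb layers that would otherwise spill to another node's CPU or disk. A compact sketch of the relaxed check, operating on plain vectors rather than the solver's solution object (illustrative only; W and z_gpu follow the meaning they have in the patches):

    #include <cmath>
    #include <cstdint>
    #include <vector>

    // w[m] = layers assigned to device m, n[m] = layers of those placed on its GPU,
    // z_gpu[m] = fraction of the layer window W that fits in device m's GPU memory.
    static bool needs_recalibration(const std::vector<uint32_t> & w,
                                    const std::vector<uint32_t> & n,
                                    const std::vector<float>    & z_gpu,
                                    uint32_t W) {
        bool has_free_gpu_memory = false;
        bool has_overload        = false;
        for (size_t m = 0; m < w.size(); ++m) {
            if (n[m] < (uint32_t) std::round(W * z_gpu[m])) has_free_gpu_memory = true; // VRAM left over
            if (w[m] > n[m])                                has_overload        = true; // layers fall back to CPU/disk
        }
        return has_free_gpu_memory && has_overload;
    }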