From d78fa427e78e19c115b97be43b3457b819d1c8a9 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Mon, 9 Dec 2024 10:07:42 +0400
Subject: [PATCH] add memory copy speed test

---
 common/profiler.cpp | 172 ++++++++++++++++++++++++++++++++++++++++----
 common/profiler.h   |   3 +
 include/llama.h     |   6 ++
 src/llama.cpp       |  12 +++-
 4 files changed, 179 insertions(+), 14 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index b009486f..4e09fedc 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -221,25 +221,29 @@ float device_cpu_flops(struct llama_model * model, enum ggml_type src0t, enum gg
 }
 
 float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
+    float flops = 0.0f;
+
 #ifdef GGML_USE_METAL
-    return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4);
+    flops = device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4);
 #endif
 
     (void)model;
     (void)src0t;
     (void)src1t;
-    return 0.0f;
+    return flops;
 }
 
 float device_cuda_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
+    float flops = 0.0f;
+
 #ifdef GGML_USE_CUDA
-    return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4);
+    flops = device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4);
 #endif
 
     (void)model;
     (void)src0t;
     (void)src1t;
-    return 0.0f;
+    return flops;
 }
 
 float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads) {
@@ -1000,19 +1004,140 @@ static float device_read_vram_bw(enum profiler_backend_type btype) {
 }
 
 float device_metal_read_vram_bw() {
+    float bw = 0.0f;
+
 #ifdef GGML_USE_METAL
-    return device_read_vram_bw(PROFILER_BACKEND_TYPE_METAL);
+    bw = device_read_vram_bw(PROFILER_BACKEND_TYPE_METAL);
 #endif
 
-    return 0.0f;
+    return bw;
 }
 
 float device_cuda_read_vram_bw() {
+    float bw = 0.0f;
+
 #ifdef GGML_USE_CUDA
-    return device_read_vram_bw(PROFILER_BACKEND_TYPE_CUDA);
+    bw = device_read_vram_bw(PROFILER_BACKEND_TYPE_CUDA);
 #endif
 
-    return 0.0f;
+    return bw;
+}
+
+// returns the ggml_cpy delay for one layer's per-token KV cache entry, in ms
+static float device_mem_copy(struct llama_model * model, enum profiler_backend_type btype, int n_threads) {
+    const int64_t n_embd_k_gqa = llama_model_n_embd_k_gqa(model);
+    const int64_t n_embd_v_gqa = llama_model_n_embd_v_gqa(model);
+
+    std::vector<float> src_mat_k(n_embd_k_gqa, 1.0f);
+    std::vector<float> src_mat_v(n_embd_v_gqa, 1.0f);
+    std::vector<float> dst_mat_k(n_embd_k_gqa, 0.0f);
+    std::vector<float> dst_mat_v(n_embd_v_gqa, 0.0f);
+
+    ggml_backend_t backend = NULL;
+    switch (btype) {
+        case PROFILER_BACKEND_TYPE_CPU:
+            backend = ggml_backend_cpu_init();
+            break;
+        case PROFILER_BACKEND_TYPE_METAL:
+#ifdef GGML_USE_METAL
+            backend = ggml_backend_metal_init();
+#endif
+            break;
+        case PROFILER_BACKEND_TYPE_CUDA:
+#ifdef GGML_USE_CUDA
+            backend = ggml_backend_cuda_init(0);
+#endif
+            break;
+    }
+
+    if (!backend) {
+        LOG_INF("%s: ggml backend init failed\n", __func__);
+        return 0.0f;
+    }
+
+    size_t ctx_size = 0;
+    ctx_size += 4 * ggml_tensor_overhead(); // tensors
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    struct ggml_tensor * src_tensor_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd_k_gqa);
+    struct ggml_tensor * src_tensor_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd_v_gqa);
+    struct ggml_tensor * dst_tensor_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd_k_gqa);
+    struct ggml_tensor * dst_tensor_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_embd_v_gqa);
+
+    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+
+    ggml_backend_tensor_set(src_tensor_k, src_mat_k.data(), 0, ggml_nbytes(src_tensor_k));
+    ggml_backend_tensor_set(src_tensor_v, src_mat_v.data(), 0, ggml_nbytes(src_tensor_v));
+    ggml_backend_tensor_set(dst_tensor_k, dst_mat_k.data(), 0, ggml_nbytes(dst_tensor_k));
+    ggml_backend_tensor_set(dst_tensor_v, dst_mat_v.data(), 0, ggml_nbytes(dst_tensor_v));
+
+    struct ggml_cgraph  * gf         = NULL;
+    struct ggml_context * ctx_cgraph = NULL;
+    {
+        struct ggml_init_params params0 = {
+            /*.mem_size   =*/ ggml_tensor_overhead() * 4 + ggml_graph_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+        };
+        ctx_cgraph = ggml_init(params0);
+
+        gf = ggml_new_graph(ctx_cgraph);
+        ggml_build_forward_expand(gf, ggml_cpy(ctx_cgraph, src_tensor_k, dst_tensor_k));
+        ggml_build_forward_expand(gf, ggml_cpy(ctx_cgraph, src_tensor_v, dst_tensor_v));
+    }
+
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    if (ggml_backend_is_cpu(backend)) {
+        ggml_backend_cpu_set_n_threads(backend, n_threads);
+    }
+
+    const int64_t t_start = ggml_time_us();
+    ggml_backend_graph_compute(backend, gf);
+    const int64_t t_end = ggml_time_us();
+
+    double elapsed_ms = ((double)t_end - (double)t_start) / 1e3; // ms
+
+    ggml_free(ctx_cgraph);
+    ggml_gallocr_free(allocr);
+    ggml_free(ctx);
+    ggml_backend_buffer_free(buffer);
+    ggml_backend_free(backend);
+
+    return (float)elapsed_ms;
+}
+
+float device_cpu_mem_copy(struct llama_model * model, int n_threads) {
+    return device_mem_copy(model, PROFILER_BACKEND_TYPE_CPU, n_threads);
+}
+
+float device_metal_mem_copy(struct llama_model * model) {
+    float delay = 0.0f;
+
+#ifdef GGML_USE_METAL
+    delay = device_mem_copy(model, PROFILER_BACKEND_TYPE_METAL, 4);
+#endif
+
+    (void)model;
+    return delay;
+}
+
+float device_cuda_mem_copy(struct llama_model * model) {
+    float delay = 0.0f;
+
+#ifdef GGML_USE_CUDA
+    delay = device_mem_copy(model, PROFILER_BACKEND_TYPE_CUDA, 4);
+#endif
+
+    (void)model;
+    return delay;
 }
 
 int device_has_metal(void) {
@@ -1229,14 +1354,18 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     double total_bytes_gib       = static_cast<double>(cpu_total_bytes + gpu_total_bytes) / 1024.0 / 1024.0 / 1024.0;
     double total_kv_size_gib     = cpu_kv_size_gib + gpu_kv_size_gib;
     double total_compute_buf_gib = cpu_compute_buf_gib + gpu_compute_buf_gib;
+    double total_mem_needed      = total_bytes_gib + total_kv_size_gib + total_compute_buf_gib;
     float  disk_read_bw          = dev_info.disk.read_rnd_bw;
-    if (total_bytes_gib + total_kv_size_gib + total_compute_buf_gib < dev_info.memory.total_physical) {
+
+    if (total_mem_needed < dev_info.memory.total_physical - 1) { // reserve 1 GiB (empirical) for system processes
         // each time one new row of lookup table will be loaded
         return static_cast<double>(input_bytes) / 1e9 / disk_read_bw * 1000; // convert to ms
     } else {
         // warn: OOM error may occur if -ngl is set large
-        float weight_reload_delay = total_bytes_gib * 1024.0 * 1024.0 * 1024.0 / 1e6 / disk_read_bw; // ms
-        return weight_reload_delay;
+        if (total_mem_needed > dev_info.memory.total_physical + 10) { // more than 10 GiB of over-commit (empirical) may hang the system
+            throw std::runtime_error("[WARN] Model is too large for Metal shared memory and may hang the system, stopped\n");
+        }
+        return total_bytes_gib * 1024.0 * 1024.0 * 1024.0 / 1e6 / disk_read_bw; // ms
     }
 }
 #endif
@@ -1248,6 +1377,7 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 
     float cpu_mem_avail = dev_info.memory.available_physical; // GiB
     float disk_read_bw  = dev_info.disk.read_rnd_bw * 1e9 / 1024.0 / 1024.0 / 1024.0; // convert GB/s to GiB/s
+
     if (cpu_total_bytes_gib + cpu_kv_size_gib + cpu_compute_buf_gib > cpu_mem_avail) {
 #if defined(__APPLE__) && defined(__MACH__)
@@ -1265,6 +1395,23 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     }
 }
 
+static float device_mem_copy_delay(struct llama_model * model, const struct llama_context_params cparams) {
+    int n_layers     = llama_model_n_layers(model);
+    int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
+
+    float layer_delay_cpu = device_cpu_mem_copy(model, cparams.n_threads);
+
+#ifdef GGML_USE_METAL
+    float layer_delay_metal = device_metal_mem_copy(model);
+    return layer_delay_metal * n_gpu_layers + layer_delay_cpu * (n_layers - n_gpu_layers);
+#elif defined(GGML_USE_CUDA)
+    float layer_delay_cuda = device_cuda_mem_copy(model);
+    return layer_delay_cuda * n_gpu_layers + layer_delay_cpu * (n_layers - n_gpu_layers);
+#else
+    return layer_delay_cpu * n_layers;
+#endif
+}
+
 void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
     LOG_INF("| Property              ");
@@ -1661,9 +1808,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     // todo: calculate for each device, not only master
     float latency  = 0.0f;
     int   n_layers = llama_model_n_layers  (model);
-    latency += device_compute_delay      (dev_info_set[0], n_layers,cparams);
+    latency += device_compute_delay      (dev_info_set[0], n_layers, cparams);
     latency += device_memory_access_delay(dev_info_set[0], model, cparams, n_layers);
     latency += device_disk_access_delay  (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later
+    latency += device_mem_copy_delay     (model, cparams); // memory copy delay in the KV cache
 
     LOG_INF("| Token latency (ms)    ");
     LOG_INF("| %-10.2f   ", latency);
diff --git a/common/profiler.h b/common/profiler.h
index 39d37ce8..b4f05795 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -241,6 +241,9 @@ uint64_t device_swap_memory       (bool available);
 void     device_disk_seq_bw       (float * read_seq_bw, float * write_seq_bw, int n_threads);
 void     device_disk_rnd_bw       (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
 float    device_memory_bw         (int n_thread);
+float    device_cpu_mem_copy      (struct llama_model * model, int n_threads);
+float    device_metal_mem_copy    (struct llama_model * model);
+float    device_cuda_mem_copy     (struct llama_model * model);
 float    device_metal_read_vram_bw();
 float    device_cuda_read_vram_bw ();
 void     device_get_props         (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
diff --git a/include/llama.h b/include/llama.h
index b68cc269..10a8998a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -528,6 +528,12 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Returns the embedding size of K in grouped query attention
+    LLAMA_API uint32_t llama_model_n_embd_k_gqa(struct llama_model * model);
+
+    // Returns the embedding size of V in grouped query attention
+    LLAMA_API uint32_t llama_model_n_embd_v_gqa(struct llama_model * model);
+
     // Return the size of compute buffer size, including input tensors and activations
     LLAMA_API void llama_model_compute_buf_size(
         uint64_t * cpu_buf,
diff --git a/src/llama.cpp b/src/llama.cpp
index c021541e..838ee3a7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7386,8 +7386,8 @@ static bool llm_load_tensors_impl(
     if (my_rank == 0) {
         model.buft_input  = llama_default_buffer_type_cpu(model, true);
         model.buft_output = llama_default_buffer_type_cpu(model, true);
-        LLAMA_LOG_INFO("Layer input assigned to cpu\n");
-        LLAMA_LOG_INFO("Layer output assigned to cpu\n");
+        // LLAMA_LOG_INFO("Layer input assigned to cpu\n");
+        // LLAMA_LOG_INFO("Layer output assigned to cpu\n");
     }
 
     // count used buffer types
@@ -20698,6 +20698,14 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }
 
+uint32_t llama_model_n_embd_k_gqa(struct llama_model * model) {
+    return model->hparams.n_embd_k_gqa();
+}
+
+uint32_t llama_model_n_embd_v_gqa(struct llama_model * model) {
+    return model->hparams.n_embd_v_gqa();
+}
+
 static void llama_model_reset_tensors(struct llama_model * model) {
     model->buft_input.buft        = nullptr;
     model->buft_input.buft_matrix = nullptr;
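
Note (reviewer sketch, not part of the patch): the two new llama.h accessors exist so the profiler can size its copy test to exactly one per-token KV-cache entry of a single layer. A minimal sketch of that relationship, assuming an F16 (non-quantized) KV cache; the helper name kv_bytes_per_token_per_layer is hypothetical:

    #include "ggml.h"
    #include "llama.h"

    // Sketch: bytes copied per token for one layer's K and V rows, assuming F16 KV cache.
    static size_t kv_bytes_per_token_per_layer(struct llama_model * model) {
        const uint32_t n_embd_k = llama_model_n_embd_k_gqa(model); // new accessor: K row length
        const uint32_t n_embd_v = llama_model_n_embd_v_gqa(model); // new accessor: V row length
        return (size_t) (n_embd_k + n_embd_v) * ggml_type_size(GGML_TYPE_F16); // 2 bytes per element
    }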
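
Note (reviewer sketch, not part of the patch): a minimal harness for exercising the new probes directly; report_mem_copy is a hypothetical helper, the include paths assume a source file under common/, and model is expected to be loaded elsewhere (e.g. via llama_load_model_from_file):

    #include <cstdio>
    #include "llama.h"
    #include "profiler.h" // device_cpu_mem_copy, device_metal_mem_copy, device_cuda_mem_copy

    // Sketch: print the measured per-layer KV-cache copy delay for each available backend.
    static void report_mem_copy(struct llama_model * model, int n_threads) {
        printf("cpu   kv copy per layer: %.3f ms\n", device_cpu_mem_copy(model, n_threads));
    #ifdef GGML_USE_METAL
        printf("metal kv copy per layer: %.3f ms\n", device_metal_mem_copy(model));
    #endif
    #ifdef GGML_USE_CUDA
        printf("cuda  kv copy per layer: %.3f ms\n", device_cuda_mem_copy(model));
    #endif
    }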