From b30f749e5e2456f8e67136c3b19d7be157369ece Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Tue, 3 Jun 2025 14:06:31 +0400
Subject: [PATCH 1/2] fix n_embd cannot be divided by quantized block size

---
 common/common.cpp   |  3 +++
 common/profiler.cpp | 34 +++++++++++++++++++++++++++++-----
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 89707fb7..c90048dc 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1588,6 +1588,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     if (n_world == 1) {
         uint32_t n_layers = llama_model_n_layers(model);
+        // assign all layers to this device
         params.n_layer_window[0] = n_layers;
         cparams.n_layer_window[0] = n_layers;
 
@@ -1596,6 +1597,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
         params.n_gpu_layers = std::min((int32_t)n_layers, params.n_gpu_layers);
+        cparams.n_gpu_layers = params.n_gpu_layers;
+        mparams.n_gpu_layers = params.n_gpu_layers;
 #endif
     } else {
 
diff --git a/common/profiler.cpp b/common/profiler.cpp
index b842071c..b54bb0be 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -48,6 +48,16 @@
 
 #include
 
+static int gcd_int(int a, int b) {
+    while (b != 0) {
+        int t = b;
+        b = a % b;
+        a = t;
+    }
+    return a;
+}
+
+
 static size_t get_page_size() {
     size_t page_size = 0;
 
@@ -154,8 +164,25 @@ uint32_t device_cpu_cores() {
 
 static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
     int n_repeat = 1;
-    int n_embd = std::min(llama_n_embd(model), 4096);
-    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu
+    int n_embd = std::min(llama_n_embd(model), 4096);
+
+    // simulate small tensor calculation on cpu
+    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8;
+
+    // ensure that the block sizes of the tensors are compatible
+    int bs0 = ggml_blck_size(src0t);
+    int bs1 = ggml_blck_size(src1t);
+    int gcd = gcd_int(bs0, bs1);
+    int lcm = bs0 / gcd * bs1;
+
+    if (n_embd % bs0 != 0 || n_embd % bs1 != 0) {
+        if (n_embd < lcm) {
+            n_embd = 2 * lcm;
+        } else {
+            n_embd = 2 * (n_embd / lcm) * lcm;
+        }
+    }
+
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
     std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);
 
@@ -188,9 +215,6 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     };
     struct ggml_context * ctx = ggml_init(params);
 
-    if(n_embd < ggml_blck_size(src0t)){
-        n_embd = 2 * ggml_blck_size(src0t);
-    }
     struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd);
     struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd);
 

From 1b3b6a506f8538c0192fa659dcb524d394bee7c1 Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Tue, 3 Jun 2025 17:10:09 +0400
Subject: [PATCH 2/2] fix: add warm-up in profiling to prevent init delay

---
 common/profiler.cpp |  5 ++++-
 src/llama.cpp       | 10 +++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index b54bb0be..a2ac33b5 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -439,7 +439,7 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
     }
 
     // warm-up
-    // ggml_backend_graph_compute(backend, gf);
+    ggml_backend_graph_compute(backend, gf);
 
     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
@@ -1288,6 +1288,9 @@ static float device_mem_copy(struct llama_model * model, enum profiler_backend_t
         ggml_backend_cpu_set_n_threads(backend, n_threads);
     }
 
+    // warm-up
+    ggml_backend_graph_compute(backend, gf);
+
     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
     const int64_t t_end = ggml_time_us();
 
diff --git a/src/llama.cpp b/src/llama.cpp
index cd5a95b1..2cc8da15 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -18211,7 +18211,7 @@ static int llama_decode_internal(
            [&]{ llama_kv_cache_clear     (&lctx); },
            [&]{ llama_send_kv_cache_clear(&lctx); },
            is_last_dev)) {
-            LLAMA_LOG_INFO("%s: received signal kv_cache_clear\n", __func__);
+            LLAMA_LOG_DEBUG("%s: received signal kv_cache_clear\n", __func__);
            return -1;
        }
 
@@ -18219,7 +18219,7 @@ static int llama_decode_internal(
            [&]{ llama_kv_cache_seq_rm      (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); },
            [&]{ llama_send_kv_cache_seq_rm (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); },
            is_last_dev)) {
-            LLAMA_LOG_INFO("%s: received signal kv_cache_seq_rm\n", __func__);
+            LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_rm\n", __func__);
            return -1;
        }
 
@@ -18227,7 +18227,7 @@ static int llama_decode_internal(
            [&]{ llama_kv_cache_seq_add     (&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); },
            [&]{ llama_send_kv_cache_seq_add(&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); },
            is_last_dev)) {
-            LLAMA_LOG_INFO("%s: received signal kv_cache_seq_add\n", __func__);
+            LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_add\n", __func__);
            return -1;
        }
 
@@ -18235,7 +18235,7 @@ static int llama_decode_internal(
            [&]{ llama_kv_cache_seq_cp      (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); },
            [&]{ llama_send_kv_cache_seq_cp (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); },
            is_last_dev)) {
-            LLAMA_LOG_INFO("%s: received signal kv_cache_seq_cp\n", __func__);
+            LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_cp\n", __func__);
            return -1;
        }
 
@@ -18243,7 +18243,7 @@ static int llama_decode_internal(
            [&]{ llama_kv_cache_seq_div     (&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); },
            [&]{ llama_send_kv_cache_seq_div(&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); },
            is_last_dev)) {
-            LLAMA_LOG_INFO("%s: received signal kv_cache_seq_div\n", __func__);
+            LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_div\n", __func__);
            return -1;
        }
    }
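
Reviewer note (illustration only, not part of the patches): PATCH 1/2 rounds the profiling matrix dimension n_embd up to a common multiple of both quantization block sizes, so ggml_new_tensor_2d() never receives a dimension that cannot be split into whole blocks. A minimal standalone sketch of that rounding step follows; the block sizes 32 and 256 and the embedding size 5120 are assumed example values, not taken from the patch.

    // Sketch of the n_embd rounding logic added in common/profiler.cpp (illustrative values).
    #include <cstdio>

    // Same helper as in the patch: iterative Euclidean algorithm.
    static int gcd_int(int a, int b) {
        while (b != 0) {
            int t = b;
            b = a % b;
            a = t;
        }
        return a;
    }

    int main() {
        int n_embd = 5120 / 8;   // e.g. a 5120-dim model after the CPU /8 shrink -> 640
        int bs0 = 32, bs1 = 256; // assumed block sizes of the two quantized tensor types

        int gcd = gcd_int(bs0, bs1);
        int lcm = bs0 / gcd * bs1; // 256: smallest dimension both block sizes divide

        if (n_embd % bs0 != 0 || n_embd % bs1 != 0) {
            // round to a multiple of lcm, doubled as in the patch so the
            // benchmark matrix does not become too small
            n_embd = (n_embd < lcm) ? 2 * lcm : 2 * (n_embd / lcm) * lcm;
        }

        printf("adjusted n_embd = %d\n", n_embd); // 640 -> 1024
        return 0;
    }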