From 5a99c5894d9fefa0f67f86c473bd6169daab75bb Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Thu, 5 Dec 2024 09:52:16 +0400
Subject: [PATCH] update test model, enable warm-up and sched

---
 common/profiler.cpp | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 74cc4e57..43fe65c0 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -141,9 +141,12 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     struct ggml_cgraph * gf = NULL;
     struct ggml_context * ctx_cgraph = NULL;
     struct ggml_tensor * cur = NULL;
+    struct ggml_tensor * cur1 = NULL;
+    struct ggml_tensor * cur2 = NULL;
+    struct ggml_tensor * cur3 = NULL;
     {
         struct ggml_init_params params0 = {
-            /*.mem_size =*/ ggml_tensor_overhead() * (n_repeat + 2) + ggml_graph_overhead(),
+            /*.mem_size =*/ ggml_tensor_overhead() * (5 * n_repeat + 1) + ggml_graph_overhead(),
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
         };
@@ -151,8 +154,12 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
         gf = ggml_new_graph(ctx_cgraph);
 
         cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
-        for (int i = 0; i < n_repeat - 1; i++) {
-            cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
+        for (int i = 0; i < n_repeat; i++) {
+            cur1 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
+            cur2 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
+            cur = ggml_add(ctx_cgraph, cur1, cur2);
+            cur3 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
+            cur = ggml_add(ctx_cgraph, cur, cur3);
         }
         ggml_build_forward_expand(gf, cur);
     }
@@ -164,7 +171,6 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
         ggml_backend_cpu_set_n_threads(backend, n_threads);
     }
 
-#if 0
     // use scheduler
     std::vector<ggml_backend_buffer_type_t> backend_buft;
     std::vector<ggml_backend_t> backends = {backend};
@@ -180,7 +186,7 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
         }
     }
 
-    ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backend_buft.data(), backends.size(), 128, false);
+    ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backend_buft.data(), backends.size(), 256, false);
 
     bool ok = ggml_backend_sched_reserve(sched, gf);
     if (!ok) {
@@ -195,17 +201,17 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
 
     ggml_backend_sched_reset(sched);
     ggml_backend_sched_alloc_graph(sched, gf);
-#endif
 
     // warm-up
-    // ggml_backend_graph_compute(backend, gf);
+    ggml_backend_graph_compute(backend, gf);
 
     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
     const int64_t t_end = ggml_time_us();
 
     double elapsed_seconds = ((double)t_end - (double)t_start) / 1e6; // convert to seconds
-    double flops = (2.0 * n_repeat * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS
+    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd +
+                    n_repeat * 4 * 2.0 * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS
 
     ggml_free(ctx_cgraph);
     ggml_gallocr_free(allocr);
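
For reference only, not part of the patch: a minimal sketch of the FLOP accounting behind the updated `flops` expression, assuming square n_embd x n_embd operands and counting each ggml_mul_mat as 2 * n_embd^3 FLOPs. The loop above builds three ggml_mul_mat and two ggml_add nodes per iteration, which the new formula tallies as four 2 * n_embd^3 terms per repeat on top of the initial mul_mat. The helper name estimate_gflops is illustrative and does not exist in the commit.

#include <cstdint>

// Illustrative sketch: reproduces the arithmetic of the updated `flops`
// expression in device_flops(); not code introduced by this patch.
static double estimate_gflops(int64_t n_embd, int n_repeat, double elapsed_seconds) {
    // One n_embd x n_embd mul_mat counted as 2 * n_embd^3 FLOPs.
    const double matmul_flops = 2.0 * (double)n_embd * (double)n_embd * (double)n_embd;
    // Initial cur = mul_mat(tensor_a, tensor_b), plus four matmul-equivalents
    // per loop iteration, as counted by the new formula.
    const double total_flops = matmul_flops + (double)n_repeat * 4.0 * matmul_flops;
    return total_flops / elapsed_seconds / 1e9; // GFLOPS
}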