From 5a99c5894d9fefa0f67f86c473bd6169daab75bb Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Thu, 5 Dec 2024 09:52:16 +0400
Subject: [PATCH] update test model, enable warm-up and sched

---
 common/profiler.cpp | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 74cc4e57..43fe65c0 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -141,9 +141,12 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     struct ggml_cgraph * gf = NULL;
     struct ggml_context * ctx_cgraph = NULL;
     struct ggml_tensor * cur = NULL;
+    struct ggml_tensor * cur1 = NULL;
+    struct ggml_tensor * cur2 = NULL;
+    struct ggml_tensor * cur3 = NULL;
     {
         struct ggml_init_params params0 = {
-            /*.mem_size =*/ ggml_tensor_overhead() * (n_repeat + 2) + ggml_graph_overhead(),
+            /*.mem_size =*/ ggml_tensor_overhead() * (5 * n_repeat + 1) + ggml_graph_overhead(),
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
         };
@@ -151,8 +154,12 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
         gf = ggml_new_graph(ctx_cgraph);
 
         cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
-        for (int i = 0; i < n_repeat - 1; i++) {
-            cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
+        for (int i = 0; i < n_repeat; i++) {
+            cur1 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
+            cur2 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
+            cur = ggml_add(ctx_cgraph, cur1, cur2);
+            cur3 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
+            cur = ggml_add(ctx_cgraph, cur, cur3);
         }
         ggml_build_forward_expand(gf, cur);
     }
@@ -164,7 +171,6 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
         ggml_backend_cpu_set_n_threads(backend, n_threads);
     }
 
-#if 0
     // use scheduler
     std::vector<ggml_backend_buffer_type_t> backend_buft;
     std::vector<ggml_backend_t> backends = {backend};
@@ -180,7 +186,7 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
         }
     }
 
-    ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backend_buft.data(), backends.size(), 128, false);
+    ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backend_buft.data(), backends.size(), 256, false);
 
     bool ok = ggml_backend_sched_reserve(sched, gf);
     if (!ok) {
@@ -195,17 +201,17 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
 
     ggml_backend_sched_reset(sched);
     ggml_backend_sched_alloc_graph(sched, gf);
-#endif
 
     // warm-up
-    // ggml_backend_graph_compute(backend, gf);
+    ggml_backend_graph_compute(backend, gf);
 
     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
     const int64_t t_end = ggml_time_us();
 
     double elapsed_seconds = ((double)t_end - (double)t_start) / 1e6; // convert to seconds
-    double flops = (2.0 * n_repeat * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS
+    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd +
+                    n_repeat * 4 * 2.0 * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS
 
     ggml_free(ctx_cgraph);
     ggml_gallocr_free(allocr);
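
For reference only, not part of the patch: a minimal sketch of the FLOP accounting behind the updated `flops` expression, assuming square n_embd x n_embd operands and counting each ggml_mul_mat as 2 * n_embd^3 FLOPs. The loop above builds three ggml_mul_mat and two ggml_add nodes per iteration, which the new formula tallies as four 2 * n_embd^3 terms per repeat on top of the initial mul_mat. The helper name estimate_gflops is illustrative and does not exist in the commit.

#include <cstdint>

// Illustrative sketch: reproduces the arithmetic of the updated `flops`
// expression in device_flops(); not code introduced by this patch.
static double estimate_gflops(int64_t n_embd, int n_repeat, double elapsed_seconds) {
    // One n_embd x n_embd mul_mat counted as 2 * n_embd^3 FLOPs.
    const double matmul_flops = 2.0 * (double)n_embd * (double)n_embd * (double)n_embd;
    // Initial cur = mul_mat(tensor_a, tensor_b), plus four matmul-equivalents
    // per loop iteration, as counted by the new formula.
    const double total_flops = matmul_flops + (double)n_repeat * 4.0 * matmul_flops;
    return total_flops / elapsed_seconds / 1e9; // GFLOPS
}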