diff --git a/common/common.cpp b/common/common.cpp
index 4c228626..1a62d260 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -914,7 +914,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
         llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world);
+        device_print_props(dev_info_set, n_world, model);
     } else {
         llama_send_device_info(lctx, &dev_info);
     }
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 3bc44de2..05fb4ba0 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -23,6 +23,7 @@
 #include "ggml-cuda.h"
 #endif

+#include 
 #include 
 #include 
 #include 
@@ -82,17 +83,11 @@ uint32_t device_cpu_cores() {
 }

 static float device_flops(struct llama_model * model, enum ggml_type dtype, profiler_backend_type btype, int n_threads) {
-    const int n_embd      = llama_n_embd(model);
-    const int n_ff_hidden = llama_n_ff_hidden(model);
-    const int rows_A = n_embd, cols_A = n_ff_hidden;
-    const int rows_B = n_embd, cols_B = n_ff_hidden;
-    GGML_ASSERT(cols_A == cols_B);
-
-    std::vector<float> matrix_A(cols_A * rows_A, 1.0f);
-    std::vector<float> matrix_B(cols_B * rows_B, 1.0f / cols_B);
+    const int n_embd = llama_n_embd(model);
+    std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
+    std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);

     ggml_backend_t backend = NULL;
-
     switch (btype) {
         case PROFILER_BACKEND_TYPE_CPU:
             backend = ggml_backend_cpu_init();
@@ -124,15 +119,15 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof
     };
     struct ggml_context * ctx = ggml_init(params);

-    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, cols_A, rows_A);
-    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, cols_B, rows_B);
+    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd);
+    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd);

     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);

     ggml_backend_tensor_set(tensor_a, matrix_A.data(), 0, ggml_nbytes(tensor_a));
     ggml_backend_tensor_set(tensor_b, matrix_B.data(), 0, ggml_nbytes(tensor_b));

-    struct ggml_cgraph * gf = NULL;
+    struct ggml_cgraph * gf = NULL;
     struct ggml_context * ctx_cgraph = NULL;
     {
         struct ggml_init_params params0 = {
@@ -162,7 +157,7 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof
     const int64_t t_end = ggml_time_us();

     double elapsed_seconds = ((double)t_end - (double)t_start) / 1e6; // convert to seconds
-    double flops = (2.0 * (double)cols_A * (double)rows_A * (double)rows_B) / elapsed_seconds / 1e9; // convert to GFLOPS
+    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS

     ggml_free(ctx_cgraph);
     ggml_gallocr_free(allocr);
@@ -435,7 +430,7 @@ void device_get_props(struct llama_model * model, int device, struct ggml_backen
     ggml_backend_dev_get_props(dev, props);
 }

-void device_print_props(struct device_info * dev_info_set, int n) {
+void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
     LOG_INF("| Property                   ");
     for (int i = 0; i < n; ++i) {
@@ -612,6 +607,38 @@ void device_print_props(struct device_info * dev_info_set, int n) {
     }
     LOG_INF("\n");

+    LOG_INF("| Model flops (input)        ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_flops);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (each layer)   ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_flops);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output)       ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_flops);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input)       ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_params);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (each layer)  ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_params);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output)      ");
+    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_params);
+    LOG_INF("\n");
+
+    model_flops ffo = dev_info_set[0].model_flops;
+    int64_t total_flops   = ffo.input_flops + ffo.output_flops + (ffo.layer_flops * llama_model_n_layers(model));
+    double  cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16 * 1e9;
+
+    LOG_INF("| Token latency (ms)         ");
+    LOG_INF("| %-10.2f ", total_flops / cpu_flops_f16 * 1000);
+    LOG_INF("\n");
+
     LOG_INF("-------------------------------------------------------------------------------------------\n\n");
 }

@@ -711,6 +738,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {

     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k, sizeof(float));

+    // no need to synchronize model flops
     return total_size;
 }

@@ -799,4 +827,6 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     ptr += sizeof(float);

     memcpy(&dev_info->gpu_props.cuda_flops_q4k, ptr, sizeof(float));
+
+    // no need to synchronize model flops
 }
diff --git a/common/profiler.h b/common/profiler.h
index 8d796769..c9f046a1 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -54,20 +54,7 @@ struct gpu_props {
         : name(""), description(""), memory_free(0.0f), memory_total(0.0f), metal_flops(0.0f), cuda_flops_f32(0.0f), cuda_flops_f16(0.0f), cuda_flops_q8(0.0f), cuda_flops_q4k(0.0f) {}
 };

-struct device_info {
-    uint32_t rank;
-    const char * device_name;
-    float disk_read_bandwidth; // in GB/s
-    struct cpu_props cpu_props;
-    struct memory_info memory;
-    struct gpu_support gpu_support;
-    struct gpu_props gpu_props;
-
-    device_info()
-        : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props() {}
-};
-
-struct flops_info {
+struct model_flops {
     // model flops
     int64_t input_flops;
     int64_t output_flops;
@@ -78,10 +65,24 @@
     int64_t output_params;
     int64_t layer_params;

-    flops_info()
+    model_flops()
         : input_flops(0), output_flops(0), layer_flops(0), input_params(0), output_params(0), layer_params(0) {}
 };

+struct device_info {
+    uint32_t rank;
+    const char * device_name;
+    float disk_read_bandwidth; // in GB/s
+    struct cpu_props cpu_props;
+    struct memory_info memory;
+    struct gpu_support gpu_support;
+    struct gpu_props gpu_props;
+    struct model_flops model_flops;
+
+    device_info()
+        : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props(), model_flops() {}
+};
+
 enum profiler_backend_type {
     PROFILER_BACKEND_TYPE_CPU   = 0,
     PROFILER_BACKEND_TYPE_METAL = 1,
@@ -99,7 +100,7 @@
 uint64_t device_swap_memory   (bool available);
 uint64_t device_disk_read_bw  (const char * test_file, size_t buffer_size_mb);
 uint64_t device_memory_bw     (size_t buffer_size_mb);
 void     device_get_props     (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
-void     device_print_props   (struct device_info * dev_info_set, int n);
+void     device_print_props   (struct device_info * dev_info_set, int n, struct llama_model * model);
 int      device_has_metal     (void);
 int      device_has_cuda      (void);
diff --git a/include/llama.h b/include/llama.h
index 5506ad0a..7fe18d39 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -528,7 +528,7 @@ extern "C" {
     LLAMA_API void llama_model_n_flops(
                 struct llama_model * model,
          struct llama_model_loader * ml,
-                 struct flops_info * ffo,
+                struct model_flops * ffo,
                      const int64_t   n_input,
                      const int64_t   n_history);

diff --git a/src/llama.cpp b/src/llama.cpp
index 5532585b..303f451d 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -110,7 +110,7 @@ struct Timer {
     ~Timer() {
         if (enable_timer) {
             int64_t end_time = ggml_time_us();
-            LLAMA_LOG_INFO("Time to run %s: %lld ms\n", name, (end_time - start_time)/1000);
+            LLAMA_LOG_INFO("Time to run %s: %lu ms\n", name, (end_time - start_time)/1000);
         }
     }
 };
@@ -3587,14 +3587,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.cuda_flops_q4k = device_cuda_flops(model, GGML_TYPE_Q4_K);

     if (dev_info->rank == 0) {
-        struct flops_info ffo = flops_info{};
-        llama_model_n_flops(model, ml, &ffo, 1, 10);
-        LLAMA_LOG_INFO("input_flops: %llu\n", ffo.input_flops);
-        LLAMA_LOG_INFO("output_flops: %llu\n", ffo.output_flops);
-        LLAMA_LOG_INFO("layer_flops: %llu\n", ffo.layer_flops);
-        LLAMA_LOG_INFO("input_params: %llu\n", ffo.input_params);
-        LLAMA_LOG_INFO("output_params: %llu\n", ffo.output_params);
-        LLAMA_LOG_INFO("layer_params: %llu\n", ffo.layer_params);
+        struct model_flops * ffo = &dev_info->model_flops;
+        llama_model_n_flops(model, ml, ffo, 1, 10);
     }
 }

@@ -20668,7 +20662,7 @@ static void llama_model_reset_tensors(struct llama_model * model) {
     model->cls_out_b = nullptr;
 }

-void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct flops_info * ffo, const int64_t n_input, const int64_t n_history) {
+void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct model_flops * ffo, const int64_t n_input, const int64_t n_history) {
     const llama_hparams hparams = model->hparams;
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_vocab = hparams.n_vocab;
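
Note (not part of the patch): the token-latency row added to device_print_props() estimates latency as (input_flops + output_flops + n_layers * layer_flops) / (flops_f16 * 1e9) * 1000 ms, and the reworked device_flops() probe reports 2 * n_embd^3 / elapsed / 1e9 GFLOPS for its square matmul. Below is a minimal standalone sketch of both formulas; all numbers are made up, and model_flops_sketch / matmul_gflops are stand-ins for the real profiler types rather than code from this patch.

// sketch.cpp -- illustration only, not part of the patch above.
#include <cstdint>
#include <cstdio>

// Stand-in for struct model_flops in common/profiler.h (per-token FLOPs).
struct model_flops_sketch {
    int64_t input_flops;
    int64_t output_flops;
    int64_t layer_flops;
};

// GFLOPS figure reported by the reworked device_flops() probe: one
// n_embd x n_embd matmul costs 2 * n_embd^3 floating point operations.
static double matmul_gflops(int n_embd, double elapsed_seconds) {
    return 2.0 * n_embd * n_embd * n_embd / elapsed_seconds / 1e9;
}

int main() {
    // Hypothetical profiler results standing in for dev_info_set[0].
    model_flops_sketch ffo = { 50000000, 500000000, 800000000 };
    int    n_layers        = 32;     // would come from llama_model_n_layers(model)
    double cpu_gflops_f16  = 120.0;  // cpu_props.flops_f16 is stored in GFLOPS

    // Same arithmetic as the new table row: FLOPs / FLOPS -> seconds, * 1000 -> ms.
    double total_flops   = (double)ffo.input_flops + (double)ffo.output_flops
                         + (double)ffo.layer_flops * n_layers;
    double cpu_flops_f16 = cpu_gflops_f16 * 1e9;

    printf("estimated token latency: %.2f ms\n", total_flops / cpu_flops_f16 * 1000.0);
    printf("probe example: %.1f GFLOPS\n", matmul_gflops(4096, 0.9));
    return 0;
}

With these hypothetical inputs the sketch prints roughly 218 ms and 153 GFLOPS; the patch performs the same arithmetic on real profiler measurements.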