From f9b4c46b7424143de9362f14d037513d824fa095 Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Thu, 30 Jan 2025 11:12:26 +0400 Subject: [PATCH] ignore the first eval to make time test more accurate --- src/llama.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index c937edaf..b8da9822 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -111,7 +111,7 @@ struct Timer { ~Timer() { if (enable_timer) { int64_t end_time = ggml_time_us(); - LLAMA_LOG_INFO("Time to run %s: %" PRId64 " ms\n", name, (end_time - start_time) / 1000); + LLAMA_LOG_INFO("\nTime to run %s: %" PRId64 " ms: ", name, (end_time - start_time) / 1000); } } }; @@ -3344,7 +3344,7 @@ struct llama_context { mutable int64_t n_queued_tokens = 0; mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - mutable int32_t n_eval = 0; // number of eval calls + mutable int32_t n_eval = -1; // number of eval calls, set to -1 to ignore the first eval // host buffer for the model output (logits and embeddings) ggml_backend_buffer_t buf_output = nullptr; @@ -22715,7 +22715,7 @@ void llama_synchronize(struct llama_context * ctx) { // add the evaluation to the stats if (ctx->n_queued_tokens == 1) { - if (!ctx->cparams.no_perf) { + if (!ctx->cparams.no_perf && ctx->n_eval >= 0) { // ignore the first eval due to preheat ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; } ctx->n_eval++; @@ -23362,7 +23362,8 @@ void llama_perf_context_print(const struct llama_context * ctx) { void llama_perf_context_reset(struct llama_context * ctx) { ctx->t_start_us = ggml_time_us(); - ctx->t_eval_us = ctx->n_eval = 0; + ctx->t_eval_us = 0; + ctx->n_eval = -1; // set to -1 to ignore the first eval due to preheat ctx->t_p_eval_us = ctx->n_p_eval = 0; }