From 3c81c8deeabba01fa40869325ea80d07eef75fc6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 09:46:58 +0300 Subject: [PATCH] server : print graphs reused in slot timings (#23279) Add graphs reused counter to the per-slot timing output, printed via llama_perf_context(). Assisted-by: llama.cpp:local pi Co-authored-by: ggerganov --- tools/server/server-context.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6b16c6b49..88b207ad5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -467,20 +467,26 @@ struct server_slot { const double n_gen_second = 1e3 / t_token_generation * n_decoded; SLT_INF(*this, - "\n" - "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" - " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second); + + SLT_INF(*this, + " eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + t_token_generation, n_decoded, t_gen, n_gen_second); + + SLT_INF(*this, " total time = %10.2f ms / %5d tokens\n", - t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, - t_token_generation, n_decoded, t_gen, n_gen_second, t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); + SLT_INF(*this, + " graphs reused = %10d\n", + llama_perf_context(ctx_tgt).n_reused); + if (n_draft_total > 0) { const float draft_ratio = (float) n_draft_accepted / n_draft_total; - SLT_CNT(*this, - "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n", - draft_ratio, n_draft_accepted, n_draft_total - ); + SLT_INF(*this, + "draft acceptance = %0.5f (%5d accepted / %5d generated)\n", + draft_ratio, n_draft_accepted, n_draft_total); } common_speculative_print_stats(spec);