diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0b40f7bfa..24bfae789 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result { uint64_t n_tokens_predicted_total = 0; uint64_t t_tokens_generation_total = 0; + uint64_t n_past_max = 0; + uint64_t n_prompt_tokens_processed = 0; uint64_t t_prompt_processing = 0; @@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result { { "n_tokens_predicted_total", n_tokens_predicted_total }, { "t_prompt_processing_total", t_prompt_processing_total }, + { "n_past_max", n_past_max }, + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, { "t_prompt_processing", t_prompt_processing }, { "n_tokens_predicted", n_tokens_predicted }, @@ -1587,6 +1591,8 @@ struct server_metrics { uint64_t n_tokens_predicted_total = 0; uint64_t t_tokens_generation_total = 0; + uint64_t n_past_max = 0; + uint64_t n_prompt_tokens_processed = 0; uint64_t t_prompt_processing = 0; @@ -1605,6 +1611,10 @@ struct server_metrics { n_prompt_tokens_processed += slot.n_prompt_tokens_processed; t_prompt_processing += slot.t_prompt_processing; t_prompt_processing_total += slot.t_prompt_processing; + + if (slot.n_past > 0) { + n_past_max = std::max(n_past_max, (uint64_t) slot.n_past); + } } void on_prediction(const server_slot & slot) { @@ -1620,6 +1630,9 @@ struct server_metrics { if (slot.is_processing()) { n_busy_slots_total++; } + if (slot.n_past > 0) { + n_past_max = std::max(n_past_max, (uint64_t) slot.n_past); + } } } @@ -2875,6 +2888,8 @@ struct server_context { res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; res->t_tokens_generation_total = metrics.t_tokens_generation_total; + res->n_past_max = metrics.n_past_max; + res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; res->t_prompt_processing = metrics.t_prompt_processing; res->n_tokens_predicted = metrics.n_tokens_predicted; @@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) { {"name", "n_decode_total"}, {"help", "Total number of llama_decode() calls"}, {"value", res_metrics->n_decode_total} + }, { + {"name", "n_past_max"}, + {"help", "Largest observed n_past."}, + {"value", res_metrics->n_past_max} }, { {"name", "n_busy_slots_per_decode"}, {"help", "Average number of busy slots per llama_decode() call"},