logs : reduce (#23021)
Some checks failed
Python Type-Check / python type-check (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Has been cancelled
Python check requirements.txt / check-requirements (push) Has been cancelled
Update Operations Documentation / update-ops-docs (push) Has been cancelled

* logs : reduce

* args : fix envs

* server : fix build

* common : print verbosity level at start

* server : clean-up logs

* server : print prompt processing timings + sampling params

* minor : whitespaces
This commit is contained in:
Georgi Gerganov 2026-05-14 13:05:52 +03:00 committed by GitHub
parent 81b0d882ae
commit 67b2b7f2f2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 207 additions and 147 deletions

View file

@ -166,6 +166,7 @@ struct server_slot {
// stats
size_t n_sent_text = 0; // number of sent text character
int64_t t_print_last = 0;
int64_t t_start_process_prompt;
int64_t t_start_generation;
@ -233,7 +234,7 @@ struct server_slot {
}
}
SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
SLT_TRC(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
(ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
}
@ -417,6 +418,36 @@ struct server_slot {
return stop_pos;
}
void print_timings_tg() {
if (n_decoded < 100) {
return;
}
const int64_t t_now = ggml_time_us();
if (t_now - t_print_last < 3*1000*1000) {
return;
}
t_print_last = t_now;
const double n_gen_second = 1e3 / t_token_generation * n_decoded;
SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s\n", n_decoded, n_gen_second);
}
void print_timings_pp() const {
const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
const double f_progress = (float) prompt.n_tokens() / task->n_tokens();
if (t_prompt_processing < 3000.0) {
return;
}
SLT_INF(*this, "prompt processing, n_tokens = %6d, progress = %.2f, t = %6.2f s / %.2f tokens per second\n",
n_prompt_tokens_processed, f_progress, t_prompt_processing / 1e3, n_prompt_second);
}
void print_timings() const {
const double t_prompt = t_prompt_processing / n_prompt_tokens_processed;
const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
@ -588,6 +619,10 @@ public:
// note: chat_params must not be refreshed upon existing sleeping state
server_chat_params chat_params;
server_context_impl() {
mtmd_helper_log_set(common_log_default_callback, nullptr);
}
~server_context_impl() {
if (!sleeping) {
// destroy() is already called when entering sleeping state
@ -749,10 +784,6 @@ private:
std::string & mmproj_path = params_base.mmproj.path;
if (!mmproj_path.empty()) {
if (!is_resume) {
mtmd_helper_log_set(common_log_default_callback, nullptr);
}
mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params_base.mmproj_use_gpu;
@ -896,17 +927,17 @@ private:
if (params_base.cache_ram_mib != 0) {
if (params_base.cache_ram_mib < 0) {
SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit");
SRV_INF("prompt cache is enabled, size limit: %s\n", "no limit");
} else {
SRV_WRN("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
SRV_INF("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
}
SRV_WRN("%s", "use `--cache-ram 0` to disable the prompt cache\n");
SRV_INF("%s", "use `--cache-ram 0` to disable the prompt cache\n");
prompt_cache = std::make_unique<server_prompt_cache>(params_base.cache_ram_mib, n_ctx);
} else {
SRV_WRN("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
SRV_INF("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
}
SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
if (!params_base.model_alias.empty()) {
// backward compat: use first alias as model name
@ -954,13 +985,13 @@ private:
if (params_base.cache_idle_slots) {
if (!params_base.kv_unified) {
SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
SRV_WRN("%s", "--cache-idle-slots requires --kv-unified, disabling\n");
params_base.cache_idle_slots = false;
} else if (params_base.cache_ram_mib == 0) {
SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
SRV_WRN("%s", "--cache-idle-slots requires --cache-ram, disabling\n");
params_base.cache_idle_slots = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n");
SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
}
}
@ -1112,7 +1143,7 @@ private:
update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
if (update_cache) {
SRV_WRN("%s", "updating prompt cache\n");
SRV_INF("%s", "updating prompt cache\n");
const int64_t t_start = ggml_time_us();
@ -1127,7 +1158,7 @@ private:
prompt_cache->update();
SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
SRV_INF("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
}
}
@ -1186,10 +1217,10 @@ private:
if (!are_lora_equal(task_loras, slot.lora)) {
// if lora has changed, check to see if the cache should be cleared
if (lora_should_clear_cache(slot.lora, task_loras)) {
SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
SLT_TRC(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
slot.prompt.tokens.clear();
} else {
SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task_loras.size());
SLT_TRC(slot, "keeping cache for alora. %zu target loras\n", task_loras.size());
}
slot.lora = task_loras;
}
@ -1281,7 +1312,8 @@ private:
llama_set_sampler(ctx_tgt, slot.id, nullptr);
}
SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
SLT_TRC(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
SLT_TRC(slot, "sampler params: \n%s\n", task.params.sampling.print().c_str());
} else {
slot.smpl.reset();
}
@ -1800,7 +1832,7 @@ private:
cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
SLT_WRN(slot,
SLT_INF(slot,
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
cur.pos_max, cur.n_tokens, (float) cur.size() / 1024 / 1024);
@ -2339,7 +2371,7 @@ private:
slot.state = SLOT_STATE_PROCESSING_PROMPT;
SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
SLT_TRC(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
slot.n_ctx, slot.task->params.n_keep, slot.task->n_tokens());
// print prompt tokens (for debugging)
@ -2454,7 +2486,7 @@ private:
}
if (n_match >= (size_t) n_cache_reuse) {
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
SLT_TRC(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
//for (size_t i = head_p; i < head_p + n_match; i++) {
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx_tgt, prompt_tokens[i]).c_str());
//}
@ -2620,10 +2652,14 @@ private:
}
}
const int64_t t_current = ggml_time_us();
slot.t_prompt_processing = (t_current - slot.t_start_process_prompt) / 1e3;
slot.print_timings_pp();
// truncate any tokens that are beyond n_past for this slot
const llama_pos p0 = slot.prompt.tokens.pos_next();
SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
SLT_TRC(slot, "cached n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
if (!llama_memory_seq_rm(llama_get_memory(ctx_tgt), slot.id, p0, -1)) {
SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
@ -2764,7 +2800,6 @@ private:
slot.i_batch = batch.n_tokens - 1;
slot.init_sampler();
SLT_INF(slot, "prompt processing done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);
} else {
if (slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch) {
// near the end of the prompt
@ -2786,8 +2821,6 @@ private:
}
}
}
SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens());
}
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
@ -3084,6 +3117,8 @@ private:
continue;
}
slot.print_timings_tg();
}
// speculative decoding - main model sample and accept
@ -3196,6 +3231,8 @@ private:
}
}
slot.print_timings_tg();
SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) n_draft, slot.prompt.n_tokens());
}
}