From 67b2b7f2f2d6dac7962b219168a4c7a20c7359b7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 14 May 2026 13:05:52 +0300
Subject: [PATCH] logs : reduce (#23021)

* logs : reduce

* args : fix envs

* server : fix build

* common : print verbosity level at start

* server : clean-up logs

* server : print prompt processing timings + sampling params

* minor : whitespaces
---
 common/arg.cpp                          | 20 +++---
 common/common.cpp                       | 29 ++++++--
 common/common.h                         |  1 +
 common/download.cpp                     |  4 +-
 common/fit.cpp                          | 88 ++++++++++++-------------
 common/log.cpp                          |  4 +-
 common/log.h                            |  7 +-
 common/speculative.cpp                  |  2 +-
 ggml/src/ggml-metal/ggml-metal-device.m |  2 +-
 tools/mtmd/clip-impl.h                  |  2 +-
 tools/mtmd/clip.cpp                     |  2 +-
 tools/server/server-common.h            | 10 +--
 tools/server/server-context.cpp         | 87 +++++++++++++++++-------
 tools/server/server-http.cpp            | 38 +++++------
 tools/server/server-task.cpp            | 12 ++--
 tools/server/server.cpp                 | 44 ++++++-------
 tools/tts/tts.cpp                       |  2 +-
 17 files changed, 207 insertions(+), 147 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 75a4d38d3..15d5ad77a 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -308,12 +308,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
     common_download_opts opts;
     opts.bearer_token = params.hf_token;
     opts.offline = params.offline;
+
+    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
     const int status = common_download_file_single(preset_url, preset_path, opts);
     const bool has_preset = status >= 200 && status < 400;
 
     // remote preset is optional, so we don't error out if not found
     if (has_preset) {
-        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
         common_preset_context ctx(ex, /* only_remote_allowed */ true);
         common_preset global;
         auto remote_presets = ctx.load_from_ini(preset_path, global);
@@ -326,7 +328,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
             throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
         }
     } else {
-        LOG_INF("%s", "no remote preset found, skipping\n");
+        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
     }
 
     return has_preset;
@@ -3301,18 +3303,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
         {"--log-prefix"},
+        {"--no-log-prefix"},
         "Enable prefix in log messages",
-        [](common_params &) {
-            common_log_set_prefix(common_log_main(), true);
+        [](common_params &, bool value) {
+            common_log_set_prefix(common_log_main(), value);
         }
-    ).set_env("LLAMA_LOG_PREFIX"));
+    ).set_env("LLAMA_ARG_LOG_PREFIX"));
     add_opt(common_arg(
         {"--log-timestamps"},
+        {"--no-log-timestamps"},
         "Enable timestamps in log messages",
-        [](common_params &) {
-            common_log_set_timestamps(common_log_main(), true);
+        [](common_params &, bool value) {
+            common_log_set_timestamps(common_log_main(), value);
         }
-    ).set_env("LLAMA_LOG_TIMESTAMPS"));
+    ).set_env("LLAMA_ARG_LOG_TIMESTAMPS"));
 
     //
     // speculative parameters
diff --git a/common/common.cpp b/common/common.cpp
index 352af0b17..b701edddb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -366,15 +366,29 @@ void common_init() {
     SetConsoleCP(CP_UTF8);
 #endif
 
-    llama_log_set(common_log_default_callback, NULL);
+    common_log_set_prefix(common_log_main(), true);
+    common_log_set_timestamps(common_log_main(), true);
 
+    llama_log_set(common_log_default_callback, NULL);
+}
+
+void common_params_print_info(const common_params & params) {
 #ifdef NDEBUG
     const char * build_type = "";
 #else
     const char * build_type = " (debug)";
 #endif
+    LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
 
-    LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
+    LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
+    LOG_INF("device_info:\n");
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        auto * dev = ggml_backend_dev_get(i);
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+        LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+    }
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
 }
 
 std::string common_params_get_system_info(const common_params & params) {
@@ -1147,7 +1161,8 @@ common_init_result::common_init_result(common_params & params) :
     auto cparams = common_context_params_to_llama(params);
 
     if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        LOG_INF("%s: fitting params to device memory ...\n", __func__);
+        LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
         common_fit_params(params.model.path.c_str(), &mparams, &cparams,
             params.tensor_split,
             params.tensor_buft_overrides.data(),
@@ -1196,7 +1211,7 @@ common_init_result::common_init_result(common_params & params) :
     // initialize once
     for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
         if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
             params.sampling.logit_bias_eog.push_back({i, -INFINITY});
         }
     }
@@ -1209,12 +1224,12 @@ common_init_result::common_init_result(common_params & params) :
     }
 
     //if (params.sampling.penalty_last_n == -1) {
-    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_TRC("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
     //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
     //}
 
     //if (params.sampling.dry_penalty_last_n == -1) {
-    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_TRC("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
     //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
     //}
 
@@ -1422,7 +1437,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
 
     // try to remove the last tokens
     if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_WRN("%s: the context does not support partial sequence removal\n", __func__);
+        LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
         res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
         goto done;
     }
diff --git a/common/common.h b/common/common.h
index a3cd17439..c0a5a730d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -686,6 +686,7 @@ struct common_params {
 // initializes the logging system and prints info about the build
 void common_init();
 
+void common_params_print_info(const common_params & params);
 std::string common_params_get_system_info(const common_params & params);
 
 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
diff --git a/common/download.cpp b/common/download.cpp
index d6d47b2d2..0bf12ad4a 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -320,9 +320,9 @@ static int common_download_file_single_online(const std::string & url,
 
     auto head = cli.Head(parts.path);
     if (!head || head->status < 200 || head->status >= 300) {
-        LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
+        LOG_TRC("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
         if (file_exists) {
-            LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
+            LOG_TRC("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
             return 304; // 304 Not Modified - fake cached response
         }
         return head ? head->status : -1;
diff --git a/common/fit.cpp b/common/fit.cpp
index e66982df5..c10cb7f08 100644
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -168,7 +168,7 @@ static void common_params_fit_impl(
 
     // step 1: get data for default parameters and check whether any changes are necessary in the first place
 
-    LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
+    LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
     const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     const size_t nd = devs.size(); // number of devices
 
@@ -213,13 +213,13 @@ static void common_params_fit_impl(
         LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
             __func__, sum_projected_used/MiB, sum_free/MiB);
         if (sum_projected_free >= margins[0]) {
-            LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
+            LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
                 __func__, sum_projected_free/MiB, margins[0]/MiB);
             return;
         }
     } else {
         if (nd > 1) {
-            LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+            LOG_TRC("%s: projected memory use with initial parameters [MiB]:\n", __func__);
         }
         for (size_t id = 0; id < nd; id++) {
             const llama_device_memory_data & dmd = dmds_full[id];
@@ -234,16 +234,16 @@ static void common_params_fit_impl(
             sum_projected_model += dmd.mb.model;
 
             if (nd > 1) {
-                LOG_INF("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                LOG_TRC("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
                     __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
             }
         }
         assert(sum_free >= 0 && sum_projected_used >= 0);
-        LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+        LOG_TRC("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
             __func__, sum_projected_used/MiB, sum_free/MiB);
         if (nd == 1) {
             if (projected_free_per_device[0] >= margins[0]) {
-                LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
                     __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
                 return;
             }
@@ -256,7 +256,7 @@ static void common_params_fit_impl(
                 }
             }
             if (!changes_needed) {
-                LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                LOG_TRC("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
                 return;
             }
         }
@@ -275,10 +275,10 @@ static void common_params_fit_impl(
         }
         if (global_surplus < 0) {
             if (nd <= 1) {
-                LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+                LOG_TRC("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                     __func__, margins[0]/MiB, -global_surplus/MiB);
             } else {
-                LOG_INF(
+                LOG_TRC(
                     "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
                     __func__, -global_surplus/MiB);
             }
@@ -320,28 +320,28 @@ static void common_params_fit_impl(
 
                         const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
                         const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                             __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                         if (nd <= 1) {
-                            LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
+                            LOG_TRC("%s: entire model can be fit by reducing context\n", __func__);
                             return;
                         }
-                        LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
+                        LOG_TRC("%s: entire model should be fit across devices by reducing context\n", __func__);
                     } else {
                         const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                             __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                     }
                 } else {
                     if (n_ctx_min == UINT32_MAX) {
-                        LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
+                        LOG_TRC("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
                     } else {
-                        LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+                        LOG_TRC("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                             __func__, hp_nct, n_ctx_min);
                     }
                 }
             } else {
-                LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
+                LOG_TRC("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
             }
         }
     }
@@ -485,10 +485,10 @@ static void common_params_fit_impl(
         const dmds_t dmd_nl = common_get_device_memory_data(
             path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
 
-        LOG_INF("%s: memory for test allocation by device:\n", func_name);
+        LOG_TRC("%s: memory for test allocation by device:\n", func_name);
         for (size_t id = 0; id < nd; id++) {
             const ngl_t & n = ngl_per_device[id];
-            LOG_INF(
+            LOG_TRC(
                 "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
                 func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
         }
@@ -509,7 +509,7 @@ static void common_params_fit_impl(
         tensor_buft_overrides[1] = {nullptr, nullptr};
         mparams->tensor_buft_overrides = tensor_buft_overrides;
 
-        LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
+        LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
         const dmds_t dmds_cpu_moe = common_get_device_memory_data(
             path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
 
@@ -519,10 +519,10 @@ static void common_params_fit_impl(
         }
 
         if (global_surplus_cpu_moe > 0) {
-            LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
+            LOG_TRC("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
                 __func__, global_surplus_cpu_moe/MiB);
         } else {
-            LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
+            LOG_TRC("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
                 __func__, -global_surplus_cpu_moe/MiB);
         }
 
@@ -535,7 +535,7 @@ static void common_params_fit_impl(
     targets.reserve(nd);
     for (size_t id = 0; id < nd; id++) {
         targets.push_back(dmds_full[id].free - margins[id]);
-        LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
+        LOG_TRC("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
     }
 
     std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
@@ -555,9 +555,9 @@ static void common_params_fit_impl(
     //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
     //   - the last device has the output layer, which cannot be a partial layer
     if (hp_nex == 0) {
-        LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
+        LOG_TRC("%s: filling dense layers back-to-front:\n", __func__);
     } else {
-        LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
+        LOG_TRC("%s: filling dense-only layers back-to-front:\n", __func__);
     }
     for (int id = nd - 1; id >= 0; id--) {
         uint32_t n_unassigned = hp_ngl + 1;
@@ -576,7 +576,7 @@ static void common_params_fit_impl(
             if (mem_high[id] > targets[id]) {
                 assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                 uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
+                LOG_TRC("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
                 while (delta > 1) {
                     uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                     step_size = std::max(step_size, uint32_t(1));
@@ -593,11 +593,11 @@ static void common_params_fit_impl(
                     if (mem_test[id] <= targets[id]) {
                         ngl_per_device = ngl_per_device_test;
                         mem            = mem_test;
-                        LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                        LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                     } else {
                         ngl_per_device_high = ngl_per_device_test;
                         mem_high            = mem_test;
-                        LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
+                        LOG_TRC("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
                     }
                     delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                 }
@@ -605,12 +605,12 @@ static void common_params_fit_impl(
                 assert(ngl_per_device_high[id].n_layer == n_unassigned);
                 ngl_per_device = ngl_per_device_high;
                 mem            = mem_high;
-                LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
             }
         }
 
         const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
             "%s:   - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
     }
@@ -634,7 +634,7 @@ static void common_params_fit_impl(
     }
     assert(id_dense_start < nd);
 
-    LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
+    LOG_TRC("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
     for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         for (size_t jd = id_dense_start; jd < nd; jd++) {
@@ -674,13 +674,13 @@ static void common_params_fit_impl(
                     ngl_per_device = ngl_per_device_test;
                     mem            = mem_test;
                     id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                         __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                 } else {
                     ngl_per_device_high = ngl_per_device_test;
                     mem_high            = mem_test;
                     id_dense_start_high = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                         __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                 }
                 assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
@@ -690,7 +690,7 @@ static void common_params_fit_impl(
             ngl_per_device = ngl_per_device_high;
             mem            = mem_high;
             id_dense_start = id_dense_start_high;
-            LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+            LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                 __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
         }
 
@@ -710,44 +710,44 @@ static void common_params_fit_impl(
             if (id < nd - 1) {
                 overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
             }
-            LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
+            LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
             std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
             if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                 ngl_per_device = ngl_per_device_test;
                 overflow_bufts = overflow_bufts_test;
                 mem            = mem_test;
                 id_dense_start = id_dense_start_test;
-                LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
+                LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
                     __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
 
                 ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
+                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
                 mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                 if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                     ngl_per_device = ngl_per_device_test;
                     overflow_bufts = overflow_bufts_test;
                     mem            = mem_test;
                     id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
                         __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                 }
             } else {
                 ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
+                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
                 mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                 if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                     ngl_per_device = ngl_per_device_test;
                     overflow_bufts = overflow_bufts_test;
                     mem            = mem_test;
                     id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
                         __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                 }
             }
         }
 
         const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
             "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
     }
@@ -755,7 +755,7 @@ static void common_params_fit_impl(
     // print info for devices that were not changed during the conversion from dense only to full layers:
     for (size_t id = id_dense_start + 1; id < nd; id++) {
         const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
             "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
     }
@@ -776,7 +776,7 @@ enum common_params_fit_status common_fit_params(
     common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
     try {
         common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
-        LOG_INF("%s: successfully fit params to free device memory\n", __func__);
+        LOG_TRC("%s: successfully fit params to free device memory\n", __func__);
     } catch (const common_params_fit_exception & e) {
         LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
         status = COMMON_PARAMS_FIT_STATUS_FAILURE;
@@ -785,7 +785,7 @@ enum common_params_fit_status common_fit_params(
         status = COMMON_PARAMS_FIT_STATUS_ERROR;
     }
     const int64_t t1_us = llama_time_us();
-    LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
+    LOG_TRC("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
     return status;
 }
 
@@ -925,7 +925,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
         }
     }
     for (const auto & td : table_data) {
-        LOG_INF(td[0].c_str(),
+        LOG_TRC(td[0].c_str(),
             __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
             td[6].c_str(), td[7].c_str(), td[8].c_str());
     }
diff --git a/common/log.cpp b/common/log.cpp
index 4b4159db4..bd62616d8 100644
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -435,10 +435,10 @@ void common_log_flush(struct common_log * log) {
 static int common_get_verbosity(enum ggml_log_level level) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
-        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_INFO;
+        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_TRACE;
         case GGML_LOG_LEVEL_WARN:  return LOG_LEVEL_WARN;
         case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
-        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_INFO; // same as INFO
+        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_TRACE;
         case GGML_LOG_LEVEL_NONE:
         default:
             return LOG_LEVEL_OUTPUT;
diff --git a/common/log.h b/common/log.h
index 8ee3e6b13..45d82f4dd 100644
--- a/common/log.h
+++ b/common/log.h
@@ -21,7 +21,8 @@
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 
-#define LOG_LEVEL_DEBUG  4
+#define LOG_LEVEL_DEBUG  5
+#define LOG_LEVEL_TRACE  4
 #define LOG_LEVEL_INFO   3
 #define LOG_LEVEL_WARN   2
 #define LOG_LEVEL_ERROR  1
@@ -111,13 +112,15 @@ void common_log_flush         (struct common_log * log);                    // f
 #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity,        __VA_ARGS__)
 
 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG,  __VA_ARGS__)
+#define LOG_TRC(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_TRACE,  __VA_ARGS__)
 #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_INFO,   __VA_ARGS__)
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  LOG_LEVEL_WARN,   __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR,  __VA_ARGS__)
 #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  LOG_LEVEL_INFO,   __VA_ARGS__) // same as INFO
 
+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#define LOG_TRCV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_TRACE, verbosity, __VA_ARGS__)
 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
 #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
-#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
 #define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 0eebcb3dc..476e1398e 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -984,7 +984,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
     }
 
     if (impls.empty()) {
-        LOG_WRN("%s", "no implementations specified for speculative decoding\n");
+        LOG_WRN("%s: no implementations specified for speculative decoding\n", __func__);
         return nullptr;
     }
 
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index fab7891c0..780dfe81b 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -672,7 +672,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
                 ![[dev->mtl_device name] containsString:@"M6"] &&
                 ![[dev->mtl_device name] containsString:@"A19"] &&
                 ![[dev->mtl_device name] containsString:@"A20"]) {
-                GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
+                GGML_LOG_INFO("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__);
                 dev->props.has_tensor = false;
             }
 
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 8e09f26e9..c35985199 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -473,10 +473,10 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
     va_end(args);
 }
 
+#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO,  __VA_ARGS__)
 #define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN,  __VA_ARGS__)
 #define LOG_ERR(...) clip_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_CNT(...) clip_log_internal(GGML_LOG_LEVEL_CONT,  __VA_ARGS__)
 
 //
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 2e0cfa61f..02f098279 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -169,7 +169,7 @@ struct clip_ctx {
             throw std::runtime_error("failed to initialize CPU backend");
         }
         if (ctx_params.use_gpu) {
-            auto backend_name = std::getenv("MTMD_BACKEND_DEVICE");
+            auto * backend_name = std::getenv("MTMD_BACKEND_DEVICE");
             if (backend_name != nullptr) {
                 backend = ggml_backend_init_by_name(backend_name, nullptr);
                 if (!backend) {
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 4681f9c51..c28558d8b 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -15,17 +15,19 @@
 
 using json = nlohmann::ordered_json;
 
+#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_TRC(slot, fmt, ...) LOG_TRC("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
 #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
-#define SLT_CNT(slot, fmt, ...) LOG_CNT(""                                 fmt,                                                                __VA_ARGS__)
 #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
 #define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
-#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)
+#define SLT_CNT(slot, fmt, ...) LOG_CNT(""                                 fmt,                                                                __VA_ARGS__)
 
+#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_TRC(fmt, ...) LOG_TRC("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define SRV_INF(fmt, ...) LOG_INF("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define SRV_CNT(fmt, ...) LOG_CNT(""              fmt,               __VA_ARGS__)
 #define SRV_WRN(fmt, ...) LOG_WRN("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
 #define SRV_ERR(fmt, ...) LOG_ERR("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
-#define SRV_DBG(fmt, ...) LOG_DBG("srv  %12.*s: " fmt, 12, __func__, __VA_ARGS__)
+#define SRV_CNT(fmt, ...) LOG_CNT(""              fmt,               __VA_ARGS__)
 
 using raw_buffer = std::vector<uint8_t>;
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index ce743e665..d49c986fe 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -166,6 +166,7 @@ struct server_slot {
     // stats
     size_t n_sent_text = 0; // number of sent text character
 
+    int64_t t_print_last = 0;
     int64_t t_start_process_prompt;
     int64_t t_start_generation;
 
@@ -233,7 +234,7 @@ struct server_slot {
             }
         }
 
-        SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
+        SLT_TRC(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
                 (ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
     }
 
@@ -417,6 +418,36 @@ struct server_slot {
         return stop_pos;
     }
 
+    void print_timings_tg() {
+        if (n_decoded < 100) {
+            return;
+        }
+
+        const int64_t t_now = ggml_time_us();
+
+        if (t_now - t_print_last < 3*1000*1000) {
+            return;
+        }
+
+        t_print_last = t_now;
+
+        const double n_gen_second = 1e3 / t_token_generation * n_decoded;
+
+        SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s\n", n_decoded, n_gen_second);
+    }
+
+    void print_timings_pp() const {
+        const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
+        const double f_progress = (float) prompt.n_tokens() / task->n_tokens();
+
+        if (t_prompt_processing < 3000.0) {
+            return;
+        }
+
+        SLT_INF(*this, "prompt processing, n_tokens = %6d, progress = %.2f, t = %6.2f s / %.2f tokens per second\n",
+                n_prompt_tokens_processed, f_progress, t_prompt_processing / 1e3, n_prompt_second);
+    }
+
     void print_timings() const {
         const double t_prompt        =       t_prompt_processing / n_prompt_tokens_processed;
         const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
@@ -588,6 +619,10 @@ public:
     // note: chat_params must not be refreshed upon existing sleeping state
     server_chat_params chat_params;
 
+    server_context_impl() {
+        mtmd_helper_log_set(common_log_default_callback, nullptr);
+    }
+
     ~server_context_impl() {
         if (!sleeping) {
             // destroy() is already called when entering sleeping state
@@ -749,10 +784,6 @@ private:
 
         std::string & mmproj_path = params_base.mmproj.path;
         if (!mmproj_path.empty()) {
-            if (!is_resume) {
-                mtmd_helper_log_set(common_log_default_callback, nullptr);
-            }
-
             mtmd_context_params mparams = mtmd_context_params_default();
 
             mparams.use_gpu          = params_base.mmproj_use_gpu;
@@ -896,17 +927,17 @@ private:
 
         if (params_base.cache_ram_mib != 0) {
             if (params_base.cache_ram_mib < 0) {
-                SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit");
+                SRV_INF("prompt cache is enabled, size limit: %s\n", "no limit");
             } else {
-                SRV_WRN("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
+                SRV_INF("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
             }
-            SRV_WRN("%s", "use `--cache-ram 0` to disable the prompt cache\n");
+            SRV_INF("%s", "use `--cache-ram 0` to disable the prompt cache\n");
 
             prompt_cache = std::make_unique<server_prompt_cache>(params_base.cache_ram_mib, n_ctx);
         } else {
-            SRV_WRN("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
+            SRV_INF("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
         }
-        SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
+        SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
 
         if (!params_base.model_alias.empty()) {
             // backward compat: use first alias as model name
@@ -954,13 +985,13 @@ private:
 
         if (params_base.cache_idle_slots) {
             if (!params_base.kv_unified) {
-                SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
+                SRV_WRN("%s", "--cache-idle-slots requires --kv-unified, disabling\n");
                 params_base.cache_idle_slots = false;
             } else if (params_base.cache_ram_mib == 0) {
-                SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
+                SRV_WRN("%s", "--cache-idle-slots requires --cache-ram, disabling\n");
                 params_base.cache_idle_slots = false;
             } else {
-                SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
+                SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n");
                 SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
             }
         }
@@ -1112,7 +1143,7 @@ private:
             update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
 
             if (update_cache) {
-                SRV_WRN("%s", "updating prompt cache\n");
+                SRV_INF("%s", "updating prompt cache\n");
 
                 const int64_t t_start = ggml_time_us();
 
@@ -1127,7 +1158,7 @@ private:
 
                 prompt_cache->update();
 
-                SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
+                SRV_INF("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
             }
         }
 
@@ -1186,10 +1217,10 @@ private:
             if (!are_lora_equal(task_loras, slot.lora)) {
                 // if lora has changed, check to see if the cache should be cleared
                 if (lora_should_clear_cache(slot.lora, task_loras)) {
-                    SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
+                    SLT_TRC(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
                     slot.prompt.tokens.clear();
                 } else {
-                    SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task_loras.size());
+                    SLT_TRC(slot, "keeping cache for alora. %zu target loras\n", task_loras.size());
                 }
                 slot.lora = task_loras;
             }
@@ -1281,7 +1312,8 @@ private:
                 llama_set_sampler(ctx_tgt, slot.id, nullptr);
             }
 
-            SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
+            SLT_TRC(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
+            SLT_TRC(slot, "sampler params: \n%s\n", task.params.sampling.print().c_str());
         } else {
             slot.smpl.reset();
         }
@@ -1800,7 +1832,7 @@ private:
         cur.update_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
         cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
 
-        SLT_WRN(slot,
+        SLT_INF(slot,
                 "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
                 (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
                 cur.pos_max, cur.n_tokens, (float) cur.size() / 1024 / 1024);
@@ -2339,7 +2371,7 @@ private:
 
                         slot.state = SLOT_STATE_PROCESSING_PROMPT;
 
-                        SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
+                        SLT_TRC(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
                                 slot.n_ctx, slot.task->params.n_keep, slot.task->n_tokens());
 
                         // print prompt tokens (for debugging)
@@ -2454,7 +2486,7 @@ private:
                                         }
 
                                         if (n_match >= (size_t) n_cache_reuse) {
-                                            SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
+                                            SLT_TRC(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
                                             //for (size_t i = head_p; i < head_p + n_match; i++) {
                                             //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx_tgt, prompt_tokens[i]).c_str());
                                             //}
@@ -2620,10 +2652,14 @@ private:
                         }
                     }
 
+                    const int64_t t_current = ggml_time_us();
+                    slot.t_prompt_processing = (t_current - slot.t_start_process_prompt) / 1e3;
+                    slot.print_timings_pp();
+
                     // truncate any tokens that are beyond n_past for this slot
                     const llama_pos p0 = slot.prompt.tokens.pos_next();
 
-                    SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
+                    SLT_TRC(slot, "cached n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
 
                     if (!llama_memory_seq_rm(llama_get_memory(ctx_tgt), slot.id, p0, -1)) {
                         SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
@@ -2764,7 +2800,6 @@ private:
                         slot.i_batch   = batch.n_tokens - 1;
 
                         slot.init_sampler();
-                        SLT_INF(slot, "prompt processing done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);
                     } else {
                         if (slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch) {
                             // near the end of the prompt
@@ -2786,8 +2821,6 @@ private:
                                 }
                             }
                         }
-
-                        SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens());
                     }
 
                     const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
@@ -3084,6 +3117,8 @@ private:
 
                     continue;
                 }
+
+                slot.print_timings_tg();
             }
 
             // speculative decoding - main model sample and accept
@@ -3196,6 +3231,8 @@ private:
                     }
                 }
 
+                slot.print_timings_tg();
+
                 SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) n_draft, slot.prompt.n_tokens());
             }
         }
diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index 8f25743fc..dd5a6f9e7 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -47,7 +47,7 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
 
     // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
 
-    SRV_INF("done request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+    SRV_TRC("done request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
 
     SRV_DBG("request:  %s\n", req.body.c_str());
     SRV_DBG("response: %s\n", res.body.c_str());
@@ -89,10 +89,10 @@ bool server_http_context::init(const common_params & params) {
     hostname = params.hostname;
 
     if (gcp.enabled) {
-        LOG_INF("%s: Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", __func__, gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port);
+        SRV_INF("Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port);
 
         if (port != gcp.port) {
-            LOG_WRN("%s: Google Cloud Platform compat: overriding server port %d with AIP_HTTP_PORT %d\n", __func__, port, gcp.port);
+            SRV_WRN("Google Cloud Platform compat: overriding server port %d with AIP_HTTP_PORT %d\n", port, gcp.port);
         }
 
         port = gcp.port;
@@ -102,17 +102,17 @@ bool server_http_context::init(const common_params & params) {
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
     if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
-        LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
+        SRV_INF("running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
         srv.reset(
             new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
         );
     } else {
-        LOG_INF("Running without SSL\n");
+        SRV_INF("%s", "running without SSL\n");
         srv.reset(new httplib::Server());
     }
 #else
     if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
-        LOG_ERR("Server is built without SSL support\n");
+        SRV_ERR("%s", "the server is built without SSL support\n");
         return false;
     }
     srv.reset(new httplib::Server());
@@ -134,7 +134,7 @@ bool server_http_context::init(const common_params & params) {
 
         res.status = 500;
         res.set_content(message, "text/plain");
-        LOG_ERR("got exception: %s\n", message.c_str());
+        SRV_ERR("got exception: %s\n", message.c_str());
     });
 
     srv->set_error_handler([](const httplib::Request &, httplib::Response & res) {
@@ -162,7 +162,7 @@ bool server_http_context::init(const common_params & params) {
 #ifdef SO_REUSEPORT
             httplib::set_socket_opt(sock, SOL_SOCKET, SO_REUSEPORT, 1);
 #else
-            LOG_WRN("%s: SO_REUSEPORT is not supported\n", __func__);
+            SRV_WRN("%s", "SO_REUSEPORT is not supported\n");
 #endif
         }
     });
@@ -170,9 +170,9 @@ bool server_http_context::init(const common_params & params) {
     if (params.api_keys.size() == 1) {
         auto key = params.api_keys[0];
         std::string substr = key.substr(std::max((int)(key.length() - 4), 0));
-        LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str());
+        SRV_INF("api_keys: ****%s\n", substr.c_str());
     } else if (params.api_keys.size() > 1) {
-        LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size());
+        SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size());
     }
 
     //
@@ -232,7 +232,7 @@ bool server_http_context::init(const common_params & params) {
             "application/json; charset=utf-8"
         );
 
-        LOG_WRN("Unauthorized: Invalid API Key\n");
+        SRV_WRN("%s", "unauthorized: Invalid API Key\n");
 
         return false;
     };
@@ -292,7 +292,7 @@ bool server_http_context::init(const common_params & params) {
         // +4 threads for monitoring, health and some threads reserved for MCP and other tasks in the future
         n_threads_http = std::max(params.n_parallel + 4, (int32_t) std::thread::hardware_concurrency() - 1);
     }
-    LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http);
+    SRV_INF("using %d threads for HTTP server\n", n_threads_http);
     srv->new_task_queue = [n_threads_http] {
         // spawn n_threads_http fixed thread (always alive), while allow up to 1024 max possible additional threads
         // when n_threads_http is used, server will create new "dynamic" threads that will be destroyed after processing each request
@@ -306,14 +306,14 @@ bool server_http_context::init(const common_params & params) {
     //
 
     if (!params.webui) {
-        LOG_INF("Web UI is disabled\n");
+        SRV_INF("%s", "the WebUI is disabled\n");
     } else {
         // register static assets routes
         if (!params.public_path.empty()) {
             // Set the base directory for serving static files
             bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path);
             if (!is_found) {
-                LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
+                SRV_ERR("static assets path not found: %s\n", params.public_path.c_str());
                 return 1;
             }
         } else {
@@ -348,13 +348,13 @@ bool server_http_context::start() {
     bool is_sock = false;
     if (string_ends_with(std::string(hostname), ".sock")) {
         is_sock = true;
-        LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
+        SRV_INF("%s", "setting address family to AF_UNIX\n");
         srv->set_address_family(AF_UNIX);
         // bind_to_port requires a second arg, any value other than 0 should
         // simply get ignored
         was_bound = srv->bind_to_port(hostname, 8080);
     } else {
-        LOG_INF("%s: binding port with default address family\n", __func__);
+        SRV_INF("%s", "binding port with default address family\n");
         // bind HTTP listen port
         if (port == 0) {
             int bound_port = srv->bind_to_any_port(hostname);
@@ -368,7 +368,7 @@ bool server_http_context::start() {
     }
 
     if (!was_bound) {
-        LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port);
+        SRV_ERR("couldn't bind HTTP server socket, hostname: %s, port: %d\n", hostname.c_str(), port);
         return false;
     }
 
@@ -580,7 +580,7 @@ void server_http_context::register_gcp_compat() {
     }
 
     if (handlers.count(gcp.path_predict)) {
-        LOG_ERR("%s: AIP_PREDICT_ROUTE=%s conflicts with an existing llama-server route\n", __func__, gcp.path_predict.c_str());
+        SRV_ERR("AIP_PREDICT_ROUTE=%s conflicts with an existing llama-server route\n", gcp.path_predict.c_str());
         exit(1);
     }
 
@@ -651,7 +651,7 @@ void server_http_context::register_gcp_compat() {
                     payload.erase("@requestFormat");
 
                     if (payload.contains("stream")) {
-                        LOG_WRN("%s: ignoring client-provided stream field in instance, streaming is not supported in predict route\n", __func__);
+                        SRV_WRN("%s", "ignoring client-provided stream field in instance, streaming is not supported in predict route\n");
                         payload["stream"] = false;
                     }
 
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index b9b4a704a..cbc40a35f 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -1988,7 +1988,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
         const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens);
 
         if (cur_lcp_len == (int) prompt.tokens.size()) {
-            SRV_WRN("%s", " - prompt is already in the cache, skipping\n");
+            SRV_INF("%s", " - prompt is already in the cache, skipping\n");
             return nullptr;
         }
     }
@@ -2043,7 +2043,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
     float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins
     float sim_best    = float(lcp_best) / tokens_new.size();
 
-    SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
+    SRV_INF(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
 
     auto it_best = states.end();
 
@@ -2068,7 +2068,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
     }
 
     if (it_best != states.end()) {
-        SRV_WRN(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
+        SRV_INF(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
 
         {
             auto & data = it_best->data.main;
@@ -2076,7 +2076,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
             const size_t size = data.size();
             const size_t n = llama_state_seq_set_data_ext(ctx_tgt, data.data(), size, id_slot, 0);
             if (n != size) {
-                SRV_WRN("failed to restore state with size %zu\n", size);
+                SRV_ERR("failed to restore state with size %zu\n", size);
 
                 return false;
             }
@@ -2145,11 +2145,11 @@ void server_prompt_cache::update() {
         }
     }
 
-    SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
+    SRV_INF(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
             states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);
 
     for (const auto & state : states) {
-        SRV_WRN("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
+        SRV_INF("   - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
                 (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
     }
 }
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 371b13c44..823ae5bda 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -83,17 +83,22 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    common_params_print_info(params);
+
     // validate batch size for embeddings
     // embeddings require all tokens to be processed in a single ubatch
     // see https://github.com/ggml-org/llama.cpp/issues/12836
     if (params.embedding && params.n_batch > params.n_ubatch) {
-        LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params.n_batch, params.n_ubatch);
-        LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params.n_ubatch);
+        SRV_WRN("embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", params.n_batch, params.n_ubatch);
+        SRV_WRN("setting n_batch = n_ubatch = %d to avoid assertion failure\n", params.n_ubatch);
         params.n_batch = params.n_ubatch;
     }
 
     if (params.n_parallel < 0) {
-        LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__);
+        SRV_INF("%s", "n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n");
 
         params.n_parallel = 4;
         params.kv_unified = true;
@@ -107,15 +112,9 @@ int main(int argc, char ** argv) {
     // struct that contains llama context and inference
     server_context ctx_server;
 
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    LOG_INF("build_info: %s\n", llama_build_info());
-    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
-
     server_http_context ctx_http;
     if (!ctx_http.init(params)) {
-        LOG_ERR("%s: failed to initialize HTTP server\n", __func__);
+        SRV_ERR("%s", "failed to initialize HTTP server\n");
         return 1;
     }
 
@@ -134,7 +133,7 @@ int main(int argc, char ** argv) {
         try {
             models_routes.emplace(params, argc, argv);
         } catch (const std::exception & e) {
-            LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what());
+            SRV_ERR("failed to initialize router models: %s\n", e.what());
             return 1;
         }
 
@@ -222,7 +221,7 @@ int main(int argc, char ** argv) {
         try {
             tools.setup(params.server_tools);
         } catch (const std::exception & e) {
-            LOG_ERR("%s: tools setup failed: %s\n", __func__, e.what());
+            SRV_ERR("tools setup failed: %s\n", e.what());
             return 1;
         }
         SRV_WRN("%s", "-----------------\n");
@@ -240,7 +239,7 @@ int main(int argc, char ** argv) {
     std::function<void()> clean_up;
 
     if (is_router_server) {
-        LOG_INF("%s: starting router server, no model will be loaded in this process\n", __func__);
+        SRV_INF("%s", "starting router server, no model will be loaded in this process\n");
 
         clean_up = [&models_routes]() {
             SRV_INF("%s: cleaning up before exit...\n", __func__);
@@ -252,7 +251,7 @@ int main(int argc, char ** argv) {
 
         if (!ctx_http.start()) {
             clean_up();
-            LOG_ERR("%s: exiting due to HTTP server error\n", __func__);
+            SRV_ERR("%s", "exiting due to HTTP server error\n");
             return 1;
         }
         ctx_http.is_ready.store(true);
@@ -273,12 +272,12 @@ int main(int argc, char ** argv) {
         // start the HTTP server before loading the model to be able to serve /health requests
         if (!ctx_http.start()) {
             clean_up();
-            LOG_ERR("%s: exiting due to HTTP server error\n", __func__);
+            SRV_ERR("%s", "exiting due to HTTP server error\n");
             return 1;
         }
 
         // load the model
-        LOG_INF("%s: loading model\n", __func__);
+        SRV_INF("%s", "loading model\n");
 
         if (server_models::is_child_server()) {
             ctx_server.on_sleeping_changed([&](bool sleeping) {
@@ -291,14 +290,14 @@ int main(int argc, char ** argv) {
             if (ctx_http.thread.joinable()) {
                 ctx_http.thread.join();
             }
-            LOG_ERR("%s: exiting due to model loading error\n", __func__);
+            SRV_ERR("%s", "exiting due to model loading error\n");
             return 1;
         }
 
         routes.update_meta(ctx_server);
         ctx_http.is_ready.store(true);
 
-        LOG_INF("%s: model loaded\n", __func__);
+        SRV_INF("%s", "model loaded\n");
 
         shutdown_handler = [&](int) {
             // this will unblock start_loop()
@@ -322,9 +321,9 @@ int main(int argc, char ** argv) {
 #endif
 
     if (is_router_server) {
-        LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
-        LOG_INF("%s: NOTE: router mode is experimental\n", __func__);
-        LOG_INF("%s:       it is not recommended to use this mode in untrusted environments\n", __func__);
+        SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str());
+        SRV_WRN("%s", "NOTE: router mode is experimental\n");
+        SRV_WRN("%s", "      it is not recommended to use this mode in untrusted environments\n");
         if (ctx_http.thread.joinable()) {
             ctx_http.thread.join(); // keep the main thread alive
         }
@@ -332,8 +331,7 @@ int main(int argc, char ** argv) {
         // when the HTTP server stops, clean up and exit
         clean_up();
     } else {
-        LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
-        LOG_INF("%s: starting the main loop...\n", __func__);
+        SRV_INF("server is listening on %s\n", ctx_http.listening_address.c_str());
 
         // optionally, notify router server that this instance is ready
         std::thread monitor_thread;
diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp
index ce68213d1..2a1bdccc9 100644
--- a/tools/tts/tts.cpp
+++ b/tools/tts/tts.cpp
@@ -604,8 +604,8 @@ int main(int argc, char ** argv) {
     }
 
     LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl[0]));
-    LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str());
     LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl[0]).c_str());
+    LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str());
 
     LOG_INF("%s: loading done\n", __func__);