From 67b2b7f2f2d6dac7962b219168a4c7a20c7359b7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 14 May 2026 13:05:52 +0300 Subject: [PATCH] logs : reduce (#23021) * logs : reduce * args : fix envs * server : fix build * common : print verbosity level at start * server : clean-up logs * server : print prompt processing timings + sampling params * minor : whitespaces --- common/arg.cpp | 20 +++--- common/common.cpp | 29 ++++++-- common/common.h | 1 + common/download.cpp | 4 +- common/fit.cpp | 88 ++++++++++++------------- common/log.cpp | 4 +- common/log.h | 7 +- common/speculative.cpp | 2 +- ggml/src/ggml-metal/ggml-metal-device.m | 2 +- tools/mtmd/clip-impl.h | 2 +- tools/mtmd/clip.cpp | 2 +- tools/server/server-common.h | 10 +-- tools/server/server-context.cpp | 87 +++++++++++++++++------- tools/server/server-http.cpp | 38 +++++------ tools/server/server-task.cpp | 12 ++-- tools/server/server.cpp | 44 ++++++------- tools/tts/tts.cpp | 2 +- 17 files changed, 207 insertions(+), 147 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 75a4d38d3..15d5ad77a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -308,12 +308,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa common_download_opts opts; opts.bearer_token = params.hf_token; opts.offline = params.offline; + + LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str()); const int status = common_download_file_single(preset_url, preset_path, opts); const bool has_preset = status >= 200 && status < 400; // remote preset is optional, so we don't error out if not found if (has_preset) { - LOG_INF("applying remote preset from %s\n", preset_url.c_str()); + LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str()); common_preset_context ctx(ex, /* only_remote_allowed */ true); common_preset global; auto remote_presets = ctx.load_from_ini(preset_path, global); @@ -326,7 +328,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section"); } } else { - LOG_INF("%s", "no remote preset found, skipping\n"); + LOG_TRC("%s: no remote preset found, skipping\n", __func__); } return has_preset; @@ -3301,18 +3303,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_LOG_VERBOSITY")); add_opt(common_arg( {"--log-prefix"}, + {"--no-log-prefix"}, "Enable prefix in log messages", - [](common_params &) { - common_log_set_prefix(common_log_main(), true); + [](common_params &, bool value) { + common_log_set_prefix(common_log_main(), value); } - ).set_env("LLAMA_LOG_PREFIX")); + ).set_env("LLAMA_ARG_LOG_PREFIX")); add_opt(common_arg( {"--log-timestamps"}, + {"--no-log-timestamps"}, "Enable timestamps in log messages", - [](common_params &) { - common_log_set_timestamps(common_log_main(), true); + [](common_params &, bool value) { + common_log_set_timestamps(common_log_main(), value); } - ).set_env("LLAMA_LOG_TIMESTAMPS")); + ).set_env("LLAMA_ARG_LOG_TIMESTAMPS")); // // speculative parameters diff --git a/common/common.cpp b/common/common.cpp index 352af0b17..b701edddb 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -366,15 +366,29 @@ void common_init() { SetConsoleCP(CP_UTF8); #endif - llama_log_set(common_log_default_callback, NULL); + common_log_set_prefix(common_log_main(), true); + common_log_set_timestamps(common_log_main(), true); + llama_log_set(common_log_default_callback, NULL); +} + +void common_params_print_info(const common_params & params) { #ifdef NDEBUG const char * build_type = ""; #else const char * build_type = " (debug)"; #endif + LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type); - LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type); + LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold()); + LOG_INF("device_info:\n"); + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + LOG_INF(" - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + } + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } std::string common_params_get_system_info(const common_params & params) { @@ -1147,7 +1161,8 @@ common_init_result::common_init_result(common_params & params) : auto cparams = common_context_params_to_llama(params); if (params.fit_params) { - LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__); + LOG_INF("%s: fitting params to device memory ...\n", __func__); + LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__); common_fit_params(params.model.path.c_str(), &mparams, &cparams, params.tensor_split, params.tensor_buft_overrides.data(), @@ -1196,7 +1211,7 @@ common_init_result::common_init_result(common_params & params) : // initialize once for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) { if (llama_vocab_is_eog(vocab, i)) { - LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY); + LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY); params.sampling.logit_bias_eog.push_back({i, -INFINITY}); } } @@ -1209,12 +1224,12 @@ common_init_result::common_init_result(common_params & params) : } //if (params.sampling.penalty_last_n == -1) { - // LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + // LOG_TRC("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); // params.sampling.penalty_last_n = llama_n_ctx(lctx); //} //if (params.sampling.dry_penalty_last_n == -1) { - // LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + // LOG_TRC("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx); //} @@ -1422,7 +1437,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) { // try to remove the last tokens if (!llama_memory_seq_rm(mem, 0, 1, -1)) { - LOG_WRN("%s: the context does not support partial sequence removal\n", __func__); + LOG_TRC("%s: the context does not support partial sequence removal\n", __func__); res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL; goto done; } diff --git a/common/common.h b/common/common.h index a3cd17439..c0a5a730d 100644 --- a/common/common.h +++ b/common/common.h @@ -686,6 +686,7 @@ struct common_params { // initializes the logging system and prints info about the build void common_init(); +void common_params_print_info(const common_params & params); std::string common_params_get_system_info(const common_params & params); bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]); diff --git a/common/download.cpp b/common/download.cpp index d6d47b2d2..0bf12ad4a 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -320,9 +320,9 @@ static int common_download_file_single_online(const std::string & url, auto head = cli.Head(parts.path); if (!head || head->status < 200 || head->status >= 300) { - LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1); + LOG_TRC("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1); if (file_exists) { - LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str()); + LOG_TRC("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str()); return 304; // 304 Not Modified - fake cached response } return head ? head->status : -1; diff --git a/common/fit.cpp b/common/fit.cpp index e66982df5..c10cb7f08 100644 --- a/common/fit.cpp +++ b/common/fit.cpp @@ -168,7 +168,7 @@ static void common_params_fit_impl( // step 1: get data for default parameters and check whether any changes are necessary in the first place - LOG_INF("%s: getting device memory data for initial parameters:\n", __func__); + LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__); const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); const size_t nd = devs.size(); // number of devices @@ -213,13 +213,13 @@ static void common_params_fit_impl( LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n", __func__, sum_projected_used/MiB, sum_free/MiB); if (sum_projected_free >= margins[0]) { - LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n", + LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n", __func__, sum_projected_free/MiB, margins[0]/MiB); return; } } else { if (nd > 1) { - LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__); + LOG_TRC("%s: projected memory use with initial parameters [MiB]:\n", __func__); } for (size_t id = 0; id < nd; id++) { const llama_device_memory_data & dmd = dmds_full[id]; @@ -234,16 +234,16 @@ static void common_params_fit_impl( sum_projected_model += dmd.mb.model; if (nd > 1) { - LOG_INF("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", + LOG_TRC("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n", __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB); } } assert(sum_free >= 0 && sum_projected_used >= 0); - LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", + LOG_TRC("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n", __func__, sum_projected_used/MiB, sum_free/MiB); if (nd == 1) { if (projected_free_per_device[0] >= margins[0]) { - LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", + LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", __func__, projected_free_per_device[0]/MiB, margins[0]/MiB); return; } @@ -256,7 +256,7 @@ static void common_params_fit_impl( } } if (!changes_needed) { - LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__); + LOG_TRC("%s: targets for free memory can be met on all devices, no changes needed\n", __func__); return; } } @@ -275,10 +275,10 @@ static void common_params_fit_impl( } if (global_surplus < 0) { if (nd <= 1) { - LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n", + LOG_TRC("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n", __func__, margins[0]/MiB, -global_surplus/MiB); } else { - LOG_INF( + LOG_TRC( "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n", __func__, -global_surplus/MiB); } @@ -320,28 +320,28 @@ static void common_params_fit_impl( const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min); const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx; - LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", + LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); if (nd <= 1) { - LOG_INF("%s: entire model can be fit by reducing context\n", __func__); + LOG_TRC("%s: entire model can be fit by reducing context\n", __func__); return; } - LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__); + LOG_TRC("%s: entire model should be fit across devices by reducing context\n", __func__); } else { const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx; - LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", + LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); } } else { if (n_ctx_min == UINT32_MAX) { - LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct); + LOG_TRC("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct); } else { - LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", + LOG_TRC("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", __func__, hp_nct, n_ctx_min); } } } else { - LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); + LOG_TRC("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); } } } @@ -485,10 +485,10 @@ static void common_params_fit_impl( const dmds_t dmd_nl = common_get_device_memory_data( path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); - LOG_INF("%s: memory for test allocation by device:\n", func_name); + LOG_TRC("%s: memory for test allocation by device:\n", func_name); for (size_t id = 0; id < nd; id++) { const ngl_t & n = ngl_per_device[id]; - LOG_INF( + LOG_TRC( "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n", func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB); } @@ -509,7 +509,7 @@ static void common_params_fit_impl( tensor_buft_overrides[1] = {nullptr, nullptr}; mparams->tensor_buft_overrides = tensor_buft_overrides; - LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__); + LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__); const dmds_t dmds_cpu_moe = common_get_device_memory_data( path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); @@ -519,10 +519,10 @@ static void common_params_fit_impl( } if (global_surplus_cpu_moe > 0) { - LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n", + LOG_TRC("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n", __func__, global_surplus_cpu_moe/MiB); } else { - LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n", + LOG_TRC("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n", __func__, -global_surplus_cpu_moe/MiB); } @@ -535,7 +535,7 @@ static void common_params_fit_impl( targets.reserve(nd); for (size_t id = 0; id < nd; id++) { targets.push_back(dmds_full[id].free - margins[id]); - LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB); + LOG_TRC("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB); } std::vector overflow_bufts; // which bufts the first partial layer of a device overflows to: @@ -555,9 +555,9 @@ static void common_params_fit_impl( // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits // - the last device has the output layer, which cannot be a partial layer if (hp_nex == 0) { - LOG_INF("%s: filling dense layers back-to-front:\n", __func__); + LOG_TRC("%s: filling dense layers back-to-front:\n", __func__); } else { - LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__); + LOG_TRC("%s: filling dense-only layers back-to-front:\n", __func__); } for (int id = nd - 1; id >= 0; id--) { uint32_t n_unassigned = hp_ngl + 1; @@ -576,7 +576,7 @@ static void common_params_fit_impl( if (mem_high[id] > targets[id]) { assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer); uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; - LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta); + LOG_TRC("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta); while (delta > 1) { uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); step_size = std::max(step_size, uint32_t(1)); @@ -593,11 +593,11 @@ static void common_params_fit_impl( if (mem_test[id] <= targets[id]) { ngl_per_device = ngl_per_device_test; mem = mem_test; - LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); + LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); } else { ngl_per_device_high = ngl_per_device_test; mem_high = mem_test; - LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer); + LOG_TRC("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer); } delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer; } @@ -605,12 +605,12 @@ static void common_params_fit_impl( assert(ngl_per_device_high[id].n_layer == n_unassigned); ngl_per_device = ngl_per_device_high; mem = mem_high; - LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); + LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer); } } const int64_t projected_margin = dmds_full[id].free - mem[id]; - LOG_INF( + LOG_TRC( "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB); } @@ -634,7 +634,7 @@ static void common_params_fit_impl( } assert(id_dense_start < nd); - LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__); + LOG_TRC("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__); for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) { std::vector ngl_per_device_high = ngl_per_device; for (size_t jd = id_dense_start; jd < nd; jd++) { @@ -674,13 +674,13 @@ static void common_params_fit_impl( ngl_per_device = ngl_per_device_test; mem = mem_test; id_dense_start = id_dense_start_test; - LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", + LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); } else { ngl_per_device_high = ngl_per_device_test; mem_high = mem_test; id_dense_start_high = id_dense_start_test; - LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n", + LOG_TRC("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n", __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high); } assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full()); @@ -690,7 +690,7 @@ static void common_params_fit_impl( ngl_per_device = ngl_per_device_high; mem = mem_high; id_dense_start = id_dense_start_high; - LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", + LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n", __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); } @@ -710,44 +710,44 @@ static void common_params_fit_impl( if (id < nd - 1) { overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]); } - LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__); + LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__); std::vector mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { ngl_per_device = ngl_per_device_test; overflow_bufts = overflow_bufts_test; mem = mem_test; id_dense_start = id_dense_start_test; - LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n", + LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n", __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE; - LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__); + LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__); mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { ngl_per_device = ngl_per_device_test; overflow_bufts = overflow_bufts_test; mem = mem_test; id_dense_start = id_dense_start_test; - LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n", + LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n", __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); } } else { ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN; - LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__); + LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__); mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test); if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) { ngl_per_device = ngl_per_device_test; overflow_bufts = overflow_bufts_test; mem = mem_test; id_dense_start = id_dense_start_test; - LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n", + LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n", __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start); } } } const int64_t projected_margin = dmds_full[id].free - mem[id]; - LOG_INF( + LOG_TRC( "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); } @@ -755,7 +755,7 @@ static void common_params_fit_impl( // print info for devices that were not changed during the conversion from dense only to full layers: for (size_t id = id_dense_start + 1; id < nd; id++) { const int64_t projected_margin = dmds_full[id].free - mem[id]; - LOG_INF( + LOG_TRC( "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB); } @@ -776,7 +776,7 @@ enum common_params_fit_status common_fit_params( common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS; try { common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level); - LOG_INF("%s: successfully fit params to free device memory\n", __func__); + LOG_TRC("%s: successfully fit params to free device memory\n", __func__); } catch (const common_params_fit_exception & e) { LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what()); status = COMMON_PARAMS_FIT_STATUS_FAILURE; @@ -785,7 +785,7 @@ enum common_params_fit_status common_fit_params( status = COMMON_PARAMS_FIT_STATUS_ERROR; } const int64_t t1_us = llama_time_us(); - LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6); + LOG_TRC("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6); return status; } @@ -925,7 +925,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) { } } for (const auto & td : table_data) { - LOG_INF(td[0].c_str(), + LOG_TRC(td[0].c_str(), __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(), td[6].c_str(), td[7].c_str(), td[8].c_str()); } diff --git a/common/log.cpp b/common/log.cpp index 4b4159db4..bd62616d8 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -435,10 +435,10 @@ void common_log_flush(struct common_log * log) { static int common_get_verbosity(enum ggml_log_level level) { switch (level) { case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG; - case GGML_LOG_LEVEL_INFO: return LOG_LEVEL_INFO; + case GGML_LOG_LEVEL_INFO: return LOG_LEVEL_TRACE; case GGML_LOG_LEVEL_WARN: return LOG_LEVEL_WARN; case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR; - case GGML_LOG_LEVEL_CONT: return LOG_LEVEL_INFO; // same as INFO + case GGML_LOG_LEVEL_CONT: return LOG_LEVEL_TRACE; case GGML_LOG_LEVEL_NONE: default: return LOG_LEVEL_OUTPUT; diff --git a/common/log.h b/common/log.h index 8ee3e6b13..45d82f4dd 100644 --- a/common/log.h +++ b/common/log.h @@ -21,7 +21,8 @@ # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) #endif -#define LOG_LEVEL_DEBUG 4 +#define LOG_LEVEL_DEBUG 5 +#define LOG_LEVEL_TRACE 4 #define LOG_LEVEL_INFO 3 #define LOG_LEVEL_WARN 2 #define LOG_LEVEL_ERROR 1 @@ -111,13 +112,15 @@ void common_log_flush (struct common_log * log); // f #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG, __VA_ARGS__) +#define LOG_TRC(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, LOG_LEVEL_TRACE, __VA_ARGS__) #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, LOG_LEVEL_INFO, __VA_ARGS__) #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, LOG_LEVEL_WARN, __VA_ARGS__) #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR, __VA_ARGS__) #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, LOG_LEVEL_INFO, __VA_ARGS__) // same as INFO +#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__) +#define LOG_TRCV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_TRACE, verbosity, __VA_ARGS__) #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__) #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__) #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__) -#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__) #define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT, verbosity, __VA_ARGS__) diff --git a/common/speculative.cpp b/common/speculative.cpp index 0eebcb3dc..476e1398e 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -984,7 +984,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, } if (impls.empty()) { - LOG_WRN("%s", "no implementations specified for speculative decoding\n"); + LOG_WRN("%s: no implementations specified for speculative decoding\n", __func__); return nullptr; } diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index fab7891c0..780dfe81b 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -672,7 +672,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) { ![[dev->mtl_device name] containsString:@"M6"] && ![[dev->mtl_device name] containsString:@"A19"] && ![[dev->mtl_device name] containsString:@"A20"]) { - GGML_LOG_WARN("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__); + GGML_LOG_INFO("%s: tensor API disabled for pre-M5 and pre-A19 devices\n", __func__); dev->props.has_tensor = false; } diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 8e09f26e9..c35985199 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -473,10 +473,10 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, .. va_end(args); } +#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LOG_INF(...) clip_log_internal(GGML_LOG_LEVEL_INFO, __VA_ARGS__) #define LOG_WRN(...) clip_log_internal(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define LOG_ERR(...) clip_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) -#define LOG_DBG(...) clip_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #define LOG_CNT(...) clip_log_internal(GGML_LOG_LEVEL_CONT, __VA_ARGS__) // diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 2e0cfa61f..02f098279 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -169,7 +169,7 @@ struct clip_ctx { throw std::runtime_error("failed to initialize CPU backend"); } if (ctx_params.use_gpu) { - auto backend_name = std::getenv("MTMD_BACKEND_DEVICE"); + auto * backend_name = std::getenv("MTMD_BACKEND_DEVICE"); if (backend_name != nullptr) { backend = ggml_backend_init_by_name(backend_name, nullptr); if (!backend) { diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 4681f9c51..c28558d8b 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -15,17 +15,19 @@ using json = nlohmann::ordered_json; +#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) +#define SLT_TRC(slot, fmt, ...) LOG_TRC("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) -#define SLT_CNT(slot, fmt, ...) LOG_CNT("" fmt, __VA_ARGS__) #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) #define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) -#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__) +#define SLT_CNT(slot, fmt, ...) LOG_CNT("" fmt, __VA_ARGS__) +#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_TRC(fmt, ...) LOG_TRC("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_CNT(fmt, ...) LOG_CNT("" fmt, __VA_ARGS__) #define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) -#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_CNT(fmt, ...) LOG_CNT("" fmt, __VA_ARGS__) using raw_buffer = std::vector; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ce743e665..d49c986fe 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -166,6 +166,7 @@ struct server_slot { // stats size_t n_sent_text = 0; // number of sent text character + int64_t t_print_last = 0; int64_t t_start_process_prompt; int64_t t_start_generation; @@ -233,7 +234,7 @@ struct server_slot { } } - SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n", + SLT_TRC(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n", (ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size()); } @@ -417,6 +418,36 @@ struct server_slot { return stop_pos; } + void print_timings_tg() { + if (n_decoded < 100) { + return; + } + + const int64_t t_now = ggml_time_us(); + + if (t_now - t_print_last < 3*1000*1000) { + return; + } + + t_print_last = t_now; + + const double n_gen_second = 1e3 / t_token_generation * n_decoded; + + SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s\n", n_decoded, n_gen_second); + } + + void print_timings_pp() const { + const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + const double f_progress = (float) prompt.n_tokens() / task->n_tokens(); + + if (t_prompt_processing < 3000.0) { + return; + } + + SLT_INF(*this, "prompt processing, n_tokens = %6d, progress = %.2f, t = %6.2f s / %.2f tokens per second\n", + n_prompt_tokens_processed, f_progress, t_prompt_processing / 1e3, n_prompt_second); + } + void print_timings() const { const double t_prompt = t_prompt_processing / n_prompt_tokens_processed; const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; @@ -588,6 +619,10 @@ public: // note: chat_params must not be refreshed upon existing sleeping state server_chat_params chat_params; + server_context_impl() { + mtmd_helper_log_set(common_log_default_callback, nullptr); + } + ~server_context_impl() { if (!sleeping) { // destroy() is already called when entering sleeping state @@ -749,10 +784,6 @@ private: std::string & mmproj_path = params_base.mmproj.path; if (!mmproj_path.empty()) { - if (!is_resume) { - mtmd_helper_log_set(common_log_default_callback, nullptr); - } - mtmd_context_params mparams = mtmd_context_params_default(); mparams.use_gpu = params_base.mmproj_use_gpu; @@ -896,17 +927,17 @@ private: if (params_base.cache_ram_mib != 0) { if (params_base.cache_ram_mib < 0) { - SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit"); + SRV_INF("prompt cache is enabled, size limit: %s\n", "no limit"); } else { - SRV_WRN("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib); + SRV_INF("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib); } - SRV_WRN("%s", "use `--cache-ram 0` to disable the prompt cache\n"); + SRV_INF("%s", "use `--cache-ram 0` to disable the prompt cache\n"); prompt_cache = std::make_unique(params_base.cache_ram_mib, n_ctx); } else { - SRV_WRN("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n"); + SRV_INF("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n"); } - SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); + SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); if (!params_base.model_alias.empty()) { // backward compat: use first alias as model name @@ -954,13 +985,13 @@ private: if (params_base.cache_idle_slots) { if (!params_base.kv_unified) { - SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__); + SRV_WRN("%s", "--cache-idle-slots requires --kv-unified, disabling\n"); params_base.cache_idle_slots = false; } else if (params_base.cache_ram_mib == 0) { - SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__); + SRV_WRN("%s", "--cache-idle-slots requires --cache-ram, disabling\n"); params_base.cache_idle_slots = false; } else { - SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__); + SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n"); SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n"); } } @@ -1112,7 +1143,7 @@ private: update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION; if (update_cache) { - SRV_WRN("%s", "updating prompt cache\n"); + SRV_INF("%s", "updating prompt cache\n"); const int64_t t_start = ggml_time_us(); @@ -1127,7 +1158,7 @@ private: prompt_cache->update(); - SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0); + SRV_INF("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0); } } @@ -1186,10 +1217,10 @@ private: if (!are_lora_equal(task_loras, slot.lora)) { // if lora has changed, check to see if the cache should be cleared if (lora_should_clear_cache(slot.lora, task_loras)) { - SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size()); + SLT_TRC(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size()); slot.prompt.tokens.clear(); } else { - SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task_loras.size()); + SLT_TRC(slot, "keeping cache for alora. %zu target loras\n", task_loras.size()); } slot.lora = task_loras; } @@ -1281,7 +1312,8 @@ private: llama_set_sampler(ctx_tgt, slot.id, nullptr); } - SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str()); + SLT_TRC(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str()); + SLT_TRC(slot, "sampler params: \n%s\n", task.params.sampling.print().c_str()); } else { slot.smpl.reset(); } @@ -1800,7 +1832,7 @@ private: cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - SLT_WRN(slot, + SLT_INF(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", (int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min, cur.pos_max, cur.n_tokens, (float) cur.size() / 1024 / 1024); @@ -2339,7 +2371,7 @@ private: slot.state = SLOT_STATE_PROCESSING_PROMPT; - SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n", + SLT_TRC(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n", slot.n_ctx, slot.task->params.n_keep, slot.task->n_tokens()); // print prompt tokens (for debugging) @@ -2454,7 +2486,7 @@ private: } if (n_match >= (size_t) n_cache_reuse) { - SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); + SLT_TRC(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match); //for (size_t i = head_p; i < head_p + n_match; i++) { // SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx_tgt, prompt_tokens[i]).c_str()); //} @@ -2620,10 +2652,14 @@ private: } } + const int64_t t_current = ggml_time_us(); + slot.t_prompt_processing = (t_current - slot.t_start_process_prompt) / 1e3; + slot.print_timings_pp(); + // truncate any tokens that are beyond n_past for this slot const llama_pos p0 = slot.prompt.tokens.pos_next(); - SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); + SLT_TRC(slot, "cached n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); if (!llama_memory_seq_rm(llama_get_memory(ctx_tgt), slot.id, p0, -1)) { SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); @@ -2764,7 +2800,6 @@ private: slot.i_batch = batch.n_tokens - 1; slot.init_sampler(); - SLT_INF(slot, "prompt processing done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens); } else { if (slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch) { // near the end of the prompt @@ -2786,8 +2821,6 @@ private: } } } - - SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens()); } const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); @@ -3084,6 +3117,8 @@ private: continue; } + + slot.print_timings_tg(); } // speculative decoding - main model sample and accept @@ -3196,6 +3231,8 @@ private: } } + slot.print_timings_tg(); + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) n_draft, slot.prompt.n_tokens()); } } diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 8f25743fc..dd5a6f9e7 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -47,7 +47,7 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch - SRV_INF("done request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); + SRV_TRC("done request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); SRV_DBG("request: %s\n", req.body.c_str()); SRV_DBG("response: %s\n", res.body.c_str()); @@ -89,10 +89,10 @@ bool server_http_context::init(const common_params & params) { hostname = params.hostname; if (gcp.enabled) { - LOG_INF("%s: Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", __func__, gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port); + SRV_INF("Google Cloud Platform compat: health route = %s, predict route = %s, port = %d\n", gcp.path_health.c_str(), gcp.path_predict.c_str(), gcp.port); if (port != gcp.port) { - LOG_WRN("%s: Google Cloud Platform compat: overriding server port %d with AIP_HTTP_PORT %d\n", __func__, port, gcp.port); + SRV_WRN("Google Cloud Platform compat: overriding server port %d with AIP_HTTP_PORT %d\n", port, gcp.port); } port = gcp.port; @@ -102,17 +102,17 @@ bool server_http_context::init(const common_params & params) { #ifdef CPPHTTPLIB_OPENSSL_SUPPORT if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); + SRV_INF("running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); srv.reset( new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) ); } else { - LOG_INF("Running without SSL\n"); + SRV_INF("%s", "running without SSL\n"); srv.reset(new httplib::Server()); } #else if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_ERR("Server is built without SSL support\n"); + SRV_ERR("%s", "the server is built without SSL support\n"); return false; } srv.reset(new httplib::Server()); @@ -134,7 +134,7 @@ bool server_http_context::init(const common_params & params) { res.status = 500; res.set_content(message, "text/plain"); - LOG_ERR("got exception: %s\n", message.c_str()); + SRV_ERR("got exception: %s\n", message.c_str()); }); srv->set_error_handler([](const httplib::Request &, httplib::Response & res) { @@ -162,7 +162,7 @@ bool server_http_context::init(const common_params & params) { #ifdef SO_REUSEPORT httplib::set_socket_opt(sock, SOL_SOCKET, SO_REUSEPORT, 1); #else - LOG_WRN("%s: SO_REUSEPORT is not supported\n", __func__); + SRV_WRN("%s", "SO_REUSEPORT is not supported\n"); #endif } }); @@ -170,9 +170,9 @@ bool server_http_context::init(const common_params & params) { if (params.api_keys.size() == 1) { auto key = params.api_keys[0]; std::string substr = key.substr(std::max((int)(key.length() - 4), 0)); - LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str()); + SRV_INF("api_keys: ****%s\n", substr.c_str()); } else if (params.api_keys.size() > 1) { - LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size()); + SRV_INF("api_keys: %zu keys loaded\n", params.api_keys.size()); } // @@ -232,7 +232,7 @@ bool server_http_context::init(const common_params & params) { "application/json; charset=utf-8" ); - LOG_WRN("Unauthorized: Invalid API Key\n"); + SRV_WRN("%s", "unauthorized: Invalid API Key\n"); return false; }; @@ -292,7 +292,7 @@ bool server_http_context::init(const common_params & params) { // +4 threads for monitoring, health and some threads reserved for MCP and other tasks in the future n_threads_http = std::max(params.n_parallel + 4, (int32_t) std::thread::hardware_concurrency() - 1); } - LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http); + SRV_INF("using %d threads for HTTP server\n", n_threads_http); srv->new_task_queue = [n_threads_http] { // spawn n_threads_http fixed thread (always alive), while allow up to 1024 max possible additional threads // when n_threads_http is used, server will create new "dynamic" threads that will be destroyed after processing each request @@ -306,14 +306,14 @@ bool server_http_context::init(const common_params & params) { // if (!params.webui) { - LOG_INF("Web UI is disabled\n"); + SRV_INF("%s", "the WebUI is disabled\n"); } else { // register static assets routes if (!params.public_path.empty()) { // Set the base directory for serving static files bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path); if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); + SRV_ERR("static assets path not found: %s\n", params.public_path.c_str()); return 1; } } else { @@ -348,13 +348,13 @@ bool server_http_context::start() { bool is_sock = false; if (string_ends_with(std::string(hostname), ".sock")) { is_sock = true; - LOG_INF("%s: setting address family to AF_UNIX\n", __func__); + SRV_INF("%s", "setting address family to AF_UNIX\n"); srv->set_address_family(AF_UNIX); // bind_to_port requires a second arg, any value other than 0 should // simply get ignored was_bound = srv->bind_to_port(hostname, 8080); } else { - LOG_INF("%s: binding port with default address family\n", __func__); + SRV_INF("%s", "binding port with default address family\n"); // bind HTTP listen port if (port == 0) { int bound_port = srv->bind_to_any_port(hostname); @@ -368,7 +368,7 @@ bool server_http_context::start() { } if (!was_bound) { - LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port); + SRV_ERR("couldn't bind HTTP server socket, hostname: %s, port: %d\n", hostname.c_str(), port); return false; } @@ -580,7 +580,7 @@ void server_http_context::register_gcp_compat() { } if (handlers.count(gcp.path_predict)) { - LOG_ERR("%s: AIP_PREDICT_ROUTE=%s conflicts with an existing llama-server route\n", __func__, gcp.path_predict.c_str()); + SRV_ERR("AIP_PREDICT_ROUTE=%s conflicts with an existing llama-server route\n", gcp.path_predict.c_str()); exit(1); } @@ -651,7 +651,7 @@ void server_http_context::register_gcp_compat() { payload.erase("@requestFormat"); if (payload.contains("stream")) { - LOG_WRN("%s: ignoring client-provided stream field in instance, streaming is not supported in predict route\n", __func__); + SRV_WRN("%s", "ignoring client-provided stream field in instance, streaming is not supported in predict route\n"); payload["stream"] = false; } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index b9b4a704a..cbc40a35f 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -1988,7 +1988,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t const int cur_lcp_len = it->tokens.get_common_prefix(prompt.tokens); if (cur_lcp_len == (int) prompt.tokens.size()) { - SRV_WRN("%s", " - prompt is already in the cache, skipping\n"); + SRV_INF("%s", " - prompt is already in the cache, skipping\n"); return nullptr; } } @@ -2043,7 +2043,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins float sim_best = float(lcp_best) / tokens_new.size(); - SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); + SRV_INF(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); auto it_best = states.end(); @@ -2068,7 +2068,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok } if (it_best != states.end()) { - SRV_WRN(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); + SRV_INF(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); { auto & data = it_best->data.main; @@ -2076,7 +2076,7 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok const size_t size = data.size(); const size_t n = llama_state_seq_set_data_ext(ctx_tgt, data.data(), size, id_slot, 0); if (n != size) { - SRV_WRN("failed to restore state with size %zu\n", size); + SRV_ERR("failed to restore state with size %zu\n", size); return false; } @@ -2145,11 +2145,11 @@ void server_prompt_cache::update() { } } - SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n", + SRV_INF(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n", states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur); for (const auto & state : states) { - SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", + SRV_INF(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0)); } } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 371b13c44..823ae5bda 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -83,17 +83,22 @@ int main(int argc, char ** argv) { return 1; } + llama_backend_init(); + llama_numa_init(params.numa); + + common_params_print_info(params); + // validate batch size for embeddings // embeddings require all tokens to be processed in a single ubatch // see https://github.com/ggml-org/llama.cpp/issues/12836 if (params.embedding && params.n_batch > params.n_ubatch) { - LOG_WRN("%s: embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", __func__, params.n_batch, params.n_ubatch); - LOG_WRN("%s: setting n_batch = n_ubatch = %d to avoid assertion failure\n", __func__, params.n_ubatch); + SRV_WRN("embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", params.n_batch, params.n_ubatch); + SRV_WRN("setting n_batch = n_ubatch = %d to avoid assertion failure\n", params.n_ubatch); params.n_batch = params.n_ubatch; } if (params.n_parallel < 0) { - LOG_INF("%s: n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n", __func__); + SRV_INF("%s", "n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n"); params.n_parallel = 4; params.kv_unified = true; @@ -107,15 +112,9 @@ int main(int argc, char ** argv) { // struct that contains llama context and inference server_context ctx_server; - llama_backend_init(); - llama_numa_init(params.numa); - - LOG_INF("build_info: %s\n", llama_build_info()); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - server_http_context ctx_http; if (!ctx_http.init(params)) { - LOG_ERR("%s: failed to initialize HTTP server\n", __func__); + SRV_ERR("%s", "failed to initialize HTTP server\n"); return 1; } @@ -134,7 +133,7 @@ int main(int argc, char ** argv) { try { models_routes.emplace(params, argc, argv); } catch (const std::exception & e) { - LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what()); + SRV_ERR("failed to initialize router models: %s\n", e.what()); return 1; } @@ -222,7 +221,7 @@ int main(int argc, char ** argv) { try { tools.setup(params.server_tools); } catch (const std::exception & e) { - LOG_ERR("%s: tools setup failed: %s\n", __func__, e.what()); + SRV_ERR("tools setup failed: %s\n", e.what()); return 1; } SRV_WRN("%s", "-----------------\n"); @@ -240,7 +239,7 @@ int main(int argc, char ** argv) { std::function clean_up; if (is_router_server) { - LOG_INF("%s: starting router server, no model will be loaded in this process\n", __func__); + SRV_INF("%s", "starting router server, no model will be loaded in this process\n"); clean_up = [&models_routes]() { SRV_INF("%s: cleaning up before exit...\n", __func__); @@ -252,7 +251,7 @@ int main(int argc, char ** argv) { if (!ctx_http.start()) { clean_up(); - LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + SRV_ERR("%s", "exiting due to HTTP server error\n"); return 1; } ctx_http.is_ready.store(true); @@ -273,12 +272,12 @@ int main(int argc, char ** argv) { // start the HTTP server before loading the model to be able to serve /health requests if (!ctx_http.start()) { clean_up(); - LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + SRV_ERR("%s", "exiting due to HTTP server error\n"); return 1; } // load the model - LOG_INF("%s: loading model\n", __func__); + SRV_INF("%s", "loading model\n"); if (server_models::is_child_server()) { ctx_server.on_sleeping_changed([&](bool sleeping) { @@ -291,14 +290,14 @@ int main(int argc, char ** argv) { if (ctx_http.thread.joinable()) { ctx_http.thread.join(); } - LOG_ERR("%s: exiting due to model loading error\n", __func__); + SRV_ERR("%s", "exiting due to model loading error\n"); return 1; } routes.update_meta(ctx_server); ctx_http.is_ready.store(true); - LOG_INF("%s: model loaded\n", __func__); + SRV_INF("%s", "model loaded\n"); shutdown_handler = [&](int) { // this will unblock start_loop() @@ -322,9 +321,9 @@ int main(int argc, char ** argv) { #endif if (is_router_server) { - LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); - LOG_INF("%s: NOTE: router mode is experimental\n", __func__); - LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__); + SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str()); + SRV_WRN("%s", "NOTE: router mode is experimental\n"); + SRV_WRN("%s", " it is not recommended to use this mode in untrusted environments\n"); if (ctx_http.thread.joinable()) { ctx_http.thread.join(); // keep the main thread alive } @@ -332,8 +331,7 @@ int main(int argc, char ** argv) { // when the HTTP server stops, clean up and exit clean_up(); } else { - LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); - LOG_INF("%s: starting the main loop...\n", __func__); + SRV_INF("server is listening on %s\n", ctx_http.listening_address.c_str()); // optionally, notify router server that this instance is ready std::thread monitor_thread; diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp index ce68213d1..2a1bdccc9 100644 --- a/tools/tts/tts.cpp +++ b/tools/tts/tts.cpp @@ -604,8 +604,8 @@ int main(int argc, char ** argv) { } LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl[0])); - LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str()); LOG_INF("sampler chain: %s\n", common_sampler_print(smpl[0]).c_str()); + LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str()); LOG_INF("%s: loading done\n", __func__);