mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
logs : reduce (#23021)
Some checks failed
Python Type-Check / python type-check (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Has been cancelled
Python check requirements.txt / check-requirements (push) Has been cancelled
Update Operations Documentation / update-ops-docs (push) Has been cancelled
Some checks failed
Python Type-Check / python type-check (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Has been cancelled
Python check requirements.txt / check-requirements (push) Has been cancelled
Update Operations Documentation / update-ops-docs (push) Has been cancelled
* logs : reduce * args : fix envs * server : fix build * common : print verbosity level at start * server : clean-up logs * server : print prompt processing timings + sampling params * minor : whitespaces
This commit is contained in:
parent
81b0d882ae
commit
67b2b7f2f2
17 changed files with 207 additions and 147 deletions
|
|
@ -166,6 +166,7 @@ struct server_slot {
|
|||
// stats
|
||||
size_t n_sent_text = 0; // number of sent text character
|
||||
|
||||
int64_t t_print_last = 0;
|
||||
int64_t t_start_process_prompt;
|
||||
int64_t t_start_generation;
|
||||
|
||||
|
|
@ -233,7 +234,7 @@ struct server_slot {
|
|||
}
|
||||
}
|
||||
|
||||
SLT_INF(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
|
||||
SLT_TRC(*this, "init sampler, took %0.2f ms, tokens: text = %d, total = %d\n",
|
||||
(ggml_time_us() - t_start) / 1000.0, n_text, (int) prompt.tokens.size());
|
||||
}
|
||||
|
||||
|
|
@ -417,6 +418,36 @@ struct server_slot {
|
|||
return stop_pos;
|
||||
}
|
||||
|
||||
void print_timings_tg() {
|
||||
if (n_decoded < 100) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t t_now = ggml_time_us();
|
||||
|
||||
if (t_now - t_print_last < 3*1000*1000) {
|
||||
return;
|
||||
}
|
||||
|
||||
t_print_last = t_now;
|
||||
|
||||
const double n_gen_second = 1e3 / t_token_generation * n_decoded;
|
||||
|
||||
SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s\n", n_decoded, n_gen_second);
|
||||
}
|
||||
|
||||
void print_timings_pp() const {
|
||||
const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
||||
const double f_progress = (float) prompt.n_tokens() / task->n_tokens();
|
||||
|
||||
if (t_prompt_processing < 3000.0) {
|
||||
return;
|
||||
}
|
||||
|
||||
SLT_INF(*this, "prompt processing, n_tokens = %6d, progress = %.2f, t = %6.2f s / %.2f tokens per second\n",
|
||||
n_prompt_tokens_processed, f_progress, t_prompt_processing / 1e3, n_prompt_second);
|
||||
}
|
||||
|
||||
void print_timings() const {
|
||||
const double t_prompt = t_prompt_processing / n_prompt_tokens_processed;
|
||||
const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
||||
|
|
@ -588,6 +619,10 @@ public:
|
|||
// note: chat_params must not be refreshed upon existing sleeping state
|
||||
server_chat_params chat_params;
|
||||
|
||||
server_context_impl() {
|
||||
mtmd_helper_log_set(common_log_default_callback, nullptr);
|
||||
}
|
||||
|
||||
~server_context_impl() {
|
||||
if (!sleeping) {
|
||||
// destroy() is already called when entering sleeping state
|
||||
|
|
@ -749,10 +784,6 @@ private:
|
|||
|
||||
std::string & mmproj_path = params_base.mmproj.path;
|
||||
if (!mmproj_path.empty()) {
|
||||
if (!is_resume) {
|
||||
mtmd_helper_log_set(common_log_default_callback, nullptr);
|
||||
}
|
||||
|
||||
mtmd_context_params mparams = mtmd_context_params_default();
|
||||
|
||||
mparams.use_gpu = params_base.mmproj_use_gpu;
|
||||
|
|
@ -896,17 +927,17 @@ private:
|
|||
|
||||
if (params_base.cache_ram_mib != 0) {
|
||||
if (params_base.cache_ram_mib < 0) {
|
||||
SRV_WRN("prompt cache is enabled, size limit: %s\n", "no limit");
|
||||
SRV_INF("prompt cache is enabled, size limit: %s\n", "no limit");
|
||||
} else {
|
||||
SRV_WRN("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
|
||||
SRV_INF("prompt cache is enabled, size limit: %d MiB\n", params_base.cache_ram_mib);
|
||||
}
|
||||
SRV_WRN("%s", "use `--cache-ram 0` to disable the prompt cache\n");
|
||||
SRV_INF("%s", "use `--cache-ram 0` to disable the prompt cache\n");
|
||||
|
||||
prompt_cache = std::make_unique<server_prompt_cache>(params_base.cache_ram_mib, n_ctx);
|
||||
} else {
|
||||
SRV_WRN("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
|
||||
SRV_INF("%s", "prompt cache is disabled - use `--cache-ram N` to enable it\n");
|
||||
}
|
||||
SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
|
||||
SRV_INF("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
|
||||
|
||||
if (!params_base.model_alias.empty()) {
|
||||
// backward compat: use first alias as model name
|
||||
|
|
@ -954,13 +985,13 @@ private:
|
|||
|
||||
if (params_base.cache_idle_slots) {
|
||||
if (!params_base.kv_unified) {
|
||||
SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
|
||||
SRV_WRN("%s", "--cache-idle-slots requires --kv-unified, disabling\n");
|
||||
params_base.cache_idle_slots = false;
|
||||
} else if (params_base.cache_ram_mib == 0) {
|
||||
SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
|
||||
SRV_WRN("%s", "--cache-idle-slots requires --cache-ram, disabling\n");
|
||||
params_base.cache_idle_slots = false;
|
||||
} else {
|
||||
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
|
||||
SRV_INF("%s", "idle slots will be saved to prompt cache and cleared upon starting a new task\n");
|
||||
SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
|
||||
}
|
||||
}
|
||||
|
|
@ -1112,7 +1143,7 @@ private:
|
|||
update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
|
||||
|
||||
if (update_cache) {
|
||||
SRV_WRN("%s", "updating prompt cache\n");
|
||||
SRV_INF("%s", "updating prompt cache\n");
|
||||
|
||||
const int64_t t_start = ggml_time_us();
|
||||
|
||||
|
|
@ -1127,7 +1158,7 @@ private:
|
|||
|
||||
prompt_cache->update();
|
||||
|
||||
SRV_WRN("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
|
||||
SRV_INF("prompt cache update took %.2f ms\n", (ggml_time_us() - t_start) / 1000.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1186,10 +1217,10 @@ private:
|
|||
if (!are_lora_equal(task_loras, slot.lora)) {
|
||||
// if lora has changed, check to see if the cache should be cleared
|
||||
if (lora_should_clear_cache(slot.lora, task_loras)) {
|
||||
SLT_INF(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
|
||||
SLT_TRC(slot, "clearing cache for lora change. %zu loras -> %zu loras\n", slot.lora.size(), task.params.lora.size());
|
||||
slot.prompt.tokens.clear();
|
||||
} else {
|
||||
SLT_INF(slot, "keeping cache for alora. %zu target loras\n", task_loras.size());
|
||||
SLT_TRC(slot, "keeping cache for alora. %zu target loras\n", task_loras.size());
|
||||
}
|
||||
slot.lora = task_loras;
|
||||
}
|
||||
|
|
@ -1281,7 +1312,8 @@ private:
|
|||
llama_set_sampler(ctx_tgt, slot.id, nullptr);
|
||||
}
|
||||
|
||||
SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
|
||||
SLT_TRC(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
|
||||
SLT_TRC(slot, "sampler params: \n%s\n", task.params.sampling.print().c_str());
|
||||
} else {
|
||||
slot.smpl.reset();
|
||||
}
|
||||
|
|
@ -1800,7 +1832,7 @@ private:
|
|||
cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
|
||||
SLT_WRN(slot,
|
||||
SLT_INF(slot,
|
||||
"created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
|
||||
(int) slot.prompt.checkpoints.size(), params_base.n_ctx_checkpoints, cur.pos_min,
|
||||
cur.pos_max, cur.n_tokens, (float) cur.size() / 1024 / 1024);
|
||||
|
|
@ -2339,7 +2371,7 @@ private:
|
|||
|
||||
slot.state = SLOT_STATE_PROCESSING_PROMPT;
|
||||
|
||||
SLT_INF(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
|
||||
SLT_TRC(slot, "new prompt, n_ctx_slot = %d, n_keep = %d, task.n_tokens = %d\n",
|
||||
slot.n_ctx, slot.task->params.n_keep, slot.task->n_tokens());
|
||||
|
||||
// print prompt tokens (for debugging)
|
||||
|
|
@ -2454,7 +2486,7 @@ private:
|
|||
}
|
||||
|
||||
if (n_match >= (size_t) n_cache_reuse) {
|
||||
SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
|
||||
SLT_TRC(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
|
||||
//for (size_t i = head_p; i < head_p + n_match; i++) {
|
||||
// SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx_tgt, prompt_tokens[i]).c_str());
|
||||
//}
|
||||
|
|
@ -2620,10 +2652,14 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
const int64_t t_current = ggml_time_us();
|
||||
slot.t_prompt_processing = (t_current - slot.t_start_process_prompt) / 1e3;
|
||||
slot.print_timings_pp();
|
||||
|
||||
// truncate any tokens that are beyond n_past for this slot
|
||||
const llama_pos p0 = slot.prompt.tokens.pos_next();
|
||||
|
||||
SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
|
||||
SLT_TRC(slot, "cached n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
|
||||
|
||||
if (!llama_memory_seq_rm(llama_get_memory(ctx_tgt), slot.id, p0, -1)) {
|
||||
SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
|
||||
|
|
@ -2764,7 +2800,6 @@ private:
|
|||
slot.i_batch = batch.n_tokens - 1;
|
||||
|
||||
slot.init_sampler();
|
||||
SLT_INF(slot, "prompt processing done, n_tokens = %d, batch.n_tokens = %d\n", slot.prompt.n_tokens(), batch.n_tokens);
|
||||
} else {
|
||||
if (slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch) {
|
||||
// near the end of the prompt
|
||||
|
|
@ -2786,8 +2821,6 @@ private:
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
SLT_INF(slot, "prompt processing progress, n_tokens = %d, batch.n_tokens = %d, progress = %f\n", slot.prompt.n_tokens(), batch.n_tokens, (float) slot.prompt.n_tokens() / slot.task->n_tokens());
|
||||
}
|
||||
|
||||
const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
|
||||
|
|
@ -3084,6 +3117,8 @@ private:
|
|||
|
||||
continue;
|
||||
}
|
||||
|
||||
slot.print_timings_tg();
|
||||
}
|
||||
|
||||
// speculative decoding - main model sample and accept
|
||||
|
|
@ -3196,6 +3231,8 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
slot.print_timings_tg();
|
||||
|
||||
SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) n_draft, slot.prompt.n_tokens());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue