mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-06 21:59:03 +00:00
ignore the first eval to make time test more accurate
This commit is contained in:
parent
cd758247e6
commit
f9b4c46b74
1 changed files with 5 additions and 4 deletions
|
@ -111,7 +111,7 @@ struct Timer {
|
||||||
~Timer() {
|
~Timer() {
|
||||||
if (enable_timer) {
|
if (enable_timer) {
|
||||||
int64_t end_time = ggml_time_us();
|
int64_t end_time = ggml_time_us();
|
||||||
LLAMA_LOG_INFO("Time to run %s: %" PRId64 " ms\n", name, (end_time - start_time) / 1000);
|
LLAMA_LOG_INFO("\nTime to run %s: %" PRId64 " ms: ", name, (end_time - start_time) / 1000);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -3344,7 +3344,7 @@ struct llama_context {
|
||||||
mutable int64_t n_queued_tokens = 0;
|
mutable int64_t n_queued_tokens = 0;
|
||||||
|
|
||||||
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
||||||
mutable int32_t n_eval = 0; // number of eval calls
|
mutable int32_t n_eval = -1; // number of eval calls, set to -1 to ignore the first eval
|
||||||
|
|
||||||
// host buffer for the model output (logits and embeddings)
|
// host buffer for the model output (logits and embeddings)
|
||||||
ggml_backend_buffer_t buf_output = nullptr;
|
ggml_backend_buffer_t buf_output = nullptr;
|
||||||
|
@ -22715,7 +22715,7 @@ void llama_synchronize(struct llama_context * ctx) {
|
||||||
|
|
||||||
// add the evaluation to the stats
|
// add the evaluation to the stats
|
||||||
if (ctx->n_queued_tokens == 1) {
|
if (ctx->n_queued_tokens == 1) {
|
||||||
if (!ctx->cparams.no_perf) {
|
if (!ctx->cparams.no_perf && ctx->n_eval >= 0) { // ignore the first eval due to preheat (n_eval starts at -1)
|
||||||
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
|
||||||
}
|
}
|
||||||
ctx->n_eval++;
|
ctx->n_eval++;
|
||||||
|
@ -23362,7 +23362,8 @@ void llama_perf_context_print(const struct llama_context * ctx) {
|
||||||
|
|
||||||
void llama_perf_context_reset(struct llama_context * ctx) {
|
void llama_perf_context_reset(struct llama_context * ctx) {
|
||||||
ctx->t_start_us = ggml_time_us();
|
ctx->t_start_us = ggml_time_us();
|
||||||
ctx->t_eval_us = ctx->n_eval = 0;
|
ctx->t_eval_us = 0;
|
||||||
|
ctx->n_eval = -1; // set to -1 to ignore the first eval due to preheat
|
||||||
ctx->t_p_eval_us = ctx->n_p_eval = 0;
|
ctx->t_p_eval_us = ctx->n_p_eval = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue