diff --git a/common/profiler.cpp b/common/profiler.cpp
index eac53110..14b324fd 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -525,12 +525,14 @@ static uint64_t device_host_physical_memory(bool available) {
             // because GPU is more likely to use the inactive memory
             memory += vm_stats.active_count * 0.2 * page_size;
         } else {
-            // assume 50% of active pages can be compressed on macOS x86_64 (an empirical value)
-            memory += vm_stats.active_count * 0.6 * page_size;
+            // assume 50% of active pages can be compressed on non-UMA (x86_64) macOS (an empirical value)
+            memory += vm_stats.active_count * 0.5 * page_size;
         }
-
-        if (!is_uma_arch()) memory += vm_stats.speculative_count * page_size;
+        if (!is_uma_arch()) {
+            memory += vm_stats.speculative_count * page_size;
+            memory += vm_stats.compressor_page_count * page_size;
+        }
     } else {
         LOG_INF("host_statistics64 failed\n");
     }
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 5be02d68..741d3063 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3344,7 +3344,7 @@ struct llama_context {
     mutable int64_t n_queued_tokens = 0;
 
     mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-    mutable int32_t n_eval = -1; // number of eval calls, set to -1 to ignore the first eval
+    mutable int32_t n_eval = -5; // number of eval calls, set to -5 to ignore the first 5 evals
 
     // host buffer for the model output (logits and embeddings)
     ggml_backend_buffer_t buf_output = nullptr;
@@ -22716,7 +22716,7 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        if (!ctx->cparams.no_perf && ctx->n_eval >= 0) { // ignore the first two evals due to preheat
+        if (!ctx->cparams.no_perf && ctx->n_eval >= 0) { // ignore the first 5 evals due to preheat
             ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
         }
         ctx->n_eval++;
@@ -23364,7 +23364,7 @@ void llama_perf_context_print(const struct llama_context * ctx) {
 void llama_perf_context_reset(struct llama_context * ctx) {
     ctx->t_start_us = ggml_time_us();
     ctx->t_eval_us = 0;
-    ctx->n_eval = -1; // set to -1 to ignore the first eval due to preheat
+    ctx->n_eval = -5; // set to -5 to ignore the first 5 evals due to preheat
     ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }
 