Merge commit '467576b6cc' into concedo_experimental

# Conflicts: # .gitignore # Makefile # README.md # common/common.h # docs/build.md # examples/infill/infill.cpp # examples/perplexity/perplexity.cpp # examples/server/README.md # ggml/CMakeLists.txt # ggml/src/CMakeLists.txt # ggml/src/ggml-cuda/CMakeLists.txt # scripts/sync-ggml-am.sh # scripts/sync-ggml.sh # tests/CMakeLists.txt # tests/test-backend-ops.cpp # tests/test-opt.cpp # tests/test-quantize-perf.cpp
2026-05-09 19:46:11 +00:00 · 2024-11-21 16:05:21 +08:00 · 2024-11-21 16:05:21 +08:00 · 282a647689
commit 282a647689
parent 272828cab0 467576b6cc
21 changed files with 1788 additions and 1788 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -22252,28 +22252,6 @@ void llama_perf_context_reset(struct llama_context * ctx) {
    ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }

-void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
-    fprintf(stream, "\n");
-    fprintf(stream, "###########\n");
-    fprintf(stream, "# Timings #\n");
-    fprintf(stream, "###########\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n",
-            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
-    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
-            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
-    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
-    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
-    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
-    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
-    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
-    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
-            1.0e6 * ctx->n_eval / ctx->t_eval_us);
-    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
-            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
-}
-
 // For internal test use
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx