Merge commit 'df270ef745' into concedo_experimental

# Conflicts: # Makefile # common/CMakeLists.txt # common/common.h # common/sampling.cpp # common/sampling.h # examples/infill/infill.cpp # examples/llama-bench/llama-bench.cpp # examples/quantize-stats/quantize-stats.cpp # examples/server/server.cpp # include/llama.h # src/llama-sampling.cpp # src/llama-sampling.h # src/llama.cpp # tests/test-grammar-integration.cpp # tests/test-grammar-parser.cpp # tests/test-json-schema-to-grammar.cpp # tests/test-llama-grammar.cpp # tests/test-sampling.cpp
2025-09-12 18:09:42 +00:00 · 2024-09-09 17:10:08 +08:00 · 2024-09-09 17:10:08 +08:00 · 12fd16bfd4
commit 12fd16bfd4
parent 70cdb55cc9 df270ef745
86 changed files with 3406 additions and 7795 deletions
--- a/examples/eval-callback/README.md
+++ b/examples/eval-callback/README.md
@ -1,95 +0,0 @@
-# llama.cpp/examples/eval-callback
-
-A simple example that demonstrates how to use a callback during inference.
-It prints all operations and tensor data to the console.
-
-Usage:
-
-```shell
-llama-eval-callback \
-  --hf-repo ggml-org/models \
-  --hf-file phi-2/ggml-model-q4_0.gguf \
-  --model phi-2-q4_0.gguf \
-  --prompt hello \
-  --seed 42 \
-  -ngl 33
-```
-
-Will print:
-
-```shell
-llm_load_tensors: offloaded 33/33 layers to GPU
-...
-llama_new_context_with_model: n_ctx      = 512
-...
-llama_new_context_with_model:      CUDA0 compute buffer size =   105.00 MiB
-llama_new_context_with_model:  CUDA_Host compute buffer size =     6.01 MiB
-llama_new_context_with_model: graph nodes  = 1225
-llama_new_context_with_model: graph splits = 2
-ggml_debug:                 inp_embd = (f32)   GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
-                                     [
-                                      [
-                                       [ -0.0181,   0.0272,   0.0272, ...],
-                                      ],
-                                     ]
-ggml_debug:                   norm-0 = (f32)       NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
-                                     [
-                                      [
-                                       [ -0.6989,   1.0636,   1.0636, ...],
-                                      ],
-                                     ]
-ggml_debug:                 norm_w-0 = (f32)        MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
-                                     [
-                                      [
-                                       [ -0.1800,   0.2817,   0.2632, ...],
-                                      ],
-                                     ]
-ggml_debug:              attn_norm-0 = (f32)        ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
-                                     [
-                                      [
-                                       [ -0.1863,   0.2970,   0.2604, ...],
-                                      ],
-                                     ]
-ggml_debug:                   wqkv-0 = (f32)    MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
-                                     [
-                                      [
-                                       [ -1.1238,   1.2876,  -1.8086, ...],
-                                      ],
-                                     ]
-ggml_debug:                   bqkv-0 = (f32)        ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
-                                     [
-                                      [
-                                       [ -1.1135,   1.4604,  -1.9226, ...],
-                                      ],
-                                     ]
-ggml_debug:            bqkv-0 (view) = (f32)       VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
-                                     [
-                                      [
-                                       [ -1.1135,   1.4604,  -1.9226, ...],
-                                      ],
-                                     ]
-ggml_debug:                   Qcur-0 = (f32)       CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
-                                     [
-                                      [
-                                       [ -1.1135,   1.4604,  -1.9226, ...],
-                                      ],
-                                     ]
-ggml_debug:        Qcur-0 (reshaped) = (f32)    RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
-                                     [
-                                      [
-                                       [ -1.1135,   1.4604,  -1.9226, ...],
-                                       [ -0.3608,   0.5076,  -1.8866, ...],
-                                       [  1.7643,   0.0273,  -2.1065, ...],
-                                       ...
-                                      ],
-                                     ]
-ggml_debug:                   Qcur-0 = (f32)       ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
-                                     [
-                                      [
-                                       [ -1.1135,   1.4604,  -1.9226, ...],
-                                       [ -0.3608,   0.5076,  -1.8866, ...],
-                                       [  1.7643,   0.0273,  -2.1065, ...],
-                                       ...
-                                      ],
-                                     ]
-```
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@ -151,8 +151,6 @@ int main(int argc, char ** argv) {

    print_build_info();

-    std::mt19937 rng(params.seed);
-
    llama_backend_init();
    llama_numa_init(params.numa);

@ -183,7 +181,8 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    llama_print_timings(ctx);
+    LOG_TEE("\n");
+    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);

    llama_free(ctx);
    llama_free_model(model);