Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/server.yml
#	CMakeLists.txt
#	Makefile
#	examples/embedding/embedding.cpp
#	examples/imatrix/imatrix.cpp
#	examples/llama-bench/llama-bench.cpp
#	examples/llava/MobileVLM-README.md
#	examples/parallel/parallel.cpp
#	examples/perplexity/perplexity.cpp
#	examples/quantize/CMakeLists.txt
#	examples/server/README.md
#	examples/speculative/speculative.cpp
#	tests/test-backend-ops.cpp
2025-09-11 17:44:38 +00:00 · 2024-09-13 16:17:24 +08:00 · 2024-09-13 16:17:24 +08:00 · e44ddf26ef
commit e44ddf26ef
parent 0fd85c3940 0abc6a2c25
47 changed files with 117978 additions and 117646 deletions
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -184,7 +184,7 @@ int main(int argc, char ** argv)  {

    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);

-    TENSOR_DUMP(gf->nodes[0]);
+    TENSOR_DUMP(ggml_graph_node(gf, 0));

    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

@@ -225,7 +225,7 @@ int main(int argc, char ** argv)  {


    // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
+    float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));

    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
    printf("=====================================================================================\n");
@@ -253,7 +253,7 @@ int main(int argc, char ** argv)  {

        // Check that the matrix multiplication result is in the right ballpark
        // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
-        float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
+        float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6