Merge branch 'master' into concedo_experimental

# Conflicts:
#	README.md
#	scripts/sync-ggml.sh
2025-09-11 01:24:36 +00:00 · 2024-02-11 15:18:46 +08:00 · 2024-02-11 15:18:46 +08:00 · ea3fd87f68
commit ea3fd87f68
parent 038779af41 f026f8120f
14 changed files with 93 additions and 24 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -7360,7 +7360,9 @@ static int llama_decode_internal(
    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
    //       with the BLAS calls. need a better solution
-    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    // MoE special case: the thread cap below is applied only when hparams.n_expert == 0, i.e. the model is NOT an MoE
+    //                   model. MoE models do not go through Accelerate/BLAS, so capping would only limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
        n_threads = std::min(4, n_threads);
    }