Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/llama-cli-cuda.Dockerfile
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-intel.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	.github/workflows/docker.yml
#	docs/docker.md
#	examples/llama-bench/llama-bench.cpp
#	flake.lock
#	ggml/include/ggml.h
#	ggml/src/CMakeLists.txt
#	scripts/sync-ggml.last
#	src/llama.cpp
#	tests/test-backend-ops.cpp
#	tests/test-grad0.cpp
#	tests/test-rope.cpp
commit d220495dd4
Concedo 2024-08-30 10:37:39 +08:00
42 changed files with 100585 additions and 99448 deletions

src/llama.cpp

@@ -2386,8 +2386,8 @@ struct llama_cparams {
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
-    uint32_t n_threads;       // number of threads to use for generation
-    uint32_t n_threads_batch; // number of threads to use for batch processing
+    int      n_threads;       // number of threads to use for generation
+    int      n_threads_batch; // number of threads to use for batch processing
     float rope_freq_base;
     float rope_freq_scale;
@@ -3104,6 +3104,9 @@ struct llama_context {
 #endif
     ggml_backend_t backend_cpu = nullptr;
+
+    ggml_threadpool_t threadpool       = nullptr;
+    ggml_threadpool_t threadpool_batch = nullptr;
     bool has_evaluated_once = false;
     int64_t t_start_us;
@@ -15570,9 +15573,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
 }
 static void llama_graph_compute(
-        llama_context & lctx,
-          ggml_cgraph * gf,
-                  int   n_threads) {
+          llama_context & lctx,
+            ggml_cgraph * gf,
+                    int   n_threads,
+        ggml_threadpool * threadpool) {
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -15581,6 +15585,7 @@ static void llama_graph_compute(
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 #ifdef GGML_USE_BLAS
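
Taken together, the two hunks above thread the pool through llama_graph_compute and into the CPU backend via ggml_backend_cpu_set_threadpool. A self-contained sketch of that same backend-level wiring, assuming the ggml threadpool helpers that arrive with this sync (ggml_threadpool_params_default, ggml_threadpool_new, ggml_threadpool_free) and reducing the graph to a single f32 add:

    #include "ggml.h"
    #include "ggml-backend.h"

    int main(void) {
        // build a trivial graph: c = a + b
        struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_f32(a, 1.0f);
        ggml_set_f32(b, 2.0f);
        struct ggml_tensor * c = ggml_add(ctx, a, b);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);

        // create a pool and hand it to the CPU backend, mirroring what the patched
        // llama_graph_compute does before the graph is computed
        ggml_backend_t cpu = ggml_backend_cpu_init();
        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(4);
        ggml_threadpool_t tp = ggml_threadpool_new(&tpp);

        ggml_backend_cpu_set_n_threads (cpu, 4);
        ggml_backend_cpu_set_threadpool(cpu, tp);

        ggml_backend_graph_compute(cpu, gf);

        ggml_backend_free(cpu);
        ggml_threadpool_free(tp);
        ggml_free(ctx);
        return 0;
    }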
@@ -15701,6 +15706,8 @@ static int llama_decode_internal(
     }
     int n_threads = (n_tokens < 32) ? cparams.n_threads : cparams.n_threads_batch;
+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+    GGML_ASSERT(n_threads > 0);
     // non-causal masks do not use the KV cache
@@ -15762,7 +15769,7 @@ static int llama_decode_internal(
         llama_set_inputs(lctx, ubatch);
-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);
         // update the kv ring buffer
         {
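
The decode-side hunks pick which pool a call runs on: a single-token ubatch uses the generation pool, anything larger the batch pool, and the chosen pool now rides along with n_threads into llama_graph_compute. The same rule as a standalone helper, with the name pick_threadpool being illustrative rather than taken from the patch:

    #include "ggml.h"
    #include <stdint.h>

    // mirrors the selection the patched llama_decode_internal performs
    static ggml_threadpool_t pick_threadpool(uint32_t n_tokens,
                                             ggml_threadpool_t generation_pool,
                                             ggml_threadpool_t batch_pool) {
        return n_tokens == 1 ? generation_pool : batch_pool;
    }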
@@ -15939,7 +15946,9 @@ static int llama_encode_internal(
     lctx.inp_embd_enc = NULL;
     lctx.n_outputs = n_tokens;
-    const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+    GGML_ASSERT(n_threads > 0);
     ggml_backend_sched_reset(lctx.sched);
@@ -15971,7 +15980,7 @@ static int llama_encode_internal(
         llama_set_inputs(lctx, ubatch);
-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);
         // extract embeddings
         if (embd) {
@@ -16253,7 +16262,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 #endif
     //const int64_t t_end = ggml_time_us();
@@ -16279,7 +16288,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         llama_set_k_shift(lctx);
-        llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
         need_reserve = true;
     }
@@ -16898,7 +16907,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos) {
+            name.find("attn_qkv.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight")!= std::string::npos) {
             ++qs.n_attention_wv;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
@@ -17512,6 +17522,19 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
     }
 }
+void llama_attach_threadpool(
+        struct llama_context * ctx,
+        ggml_threadpool_t      threadpool,
+        ggml_threadpool_t      threadpool_batch) {
+    ctx->threadpool       = threadpool;
+    ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+    ctx->threadpool       = nullptr;
+    ctx->threadpool_batch = nullptr;
+}
+
 void llama_backend_free(void) {
     ggml_quantize_free();
 }
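
llama_attach_threadpool and llama_detach_threadpool are the new public entry points for the fields added to llama_context earlier in the diff. A minimal usage sketch, assuming the ggml threadpool helpers that ship with this sync (ggml_threadpool_params_default, ggml_threadpool_new, ggml_threadpool_free) and an already-created context; error handling omitted:

    #include "llama.h"
    #include "ggml.h"

    void attach_pools(struct llama_context * ctx) {
        // a small pool for token-by-token generation, a bigger one for prompt/batch processing
        struct ggml_threadpool_params tpp       = ggml_threadpool_params_default(8);
        struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_default(16);

        ggml_threadpool_t tp       = ggml_threadpool_new(&tpp);
        ggml_threadpool_t tp_batch = ggml_threadpool_new(&tpp_batch);

        // passing nullptr for the batch pool would make the context fall back to tp for batches
        llama_attach_threadpool(ctx, tp, tp_batch);

        // ... llama_decode() calls happen here ...

        // detach before freeing so the context is not left holding dangling pointers
        llama_detach_threadpool(ctx);
        ggml_threadpool_free(tp_batch);
        ggml_threadpool_free(tp);
    }

The fallback to the generation pool when threadpool_batch is null mirrors the ternary in the hunk above, so callers that only care about one pool can pass a single handle.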
@@ -19428,7 +19451,6 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     }
 }
 void printcache(struct llama_context * ctx)
 {
     struct llama_kv_cache & cache = ctx->kv_self;
@@ -19439,16 +19461,16 @@ void printcache(struct llama_context * ctx)
     printf("%s",vals.c_str());
 }
-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
     ctx->cparams.n_threads       = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
-uint32_t llama_n_threads(struct llama_context * ctx) {
+int32_t llama_n_threads(struct llama_context * ctx) {
     return ctx->cparams.n_threads;
 }
-uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+int32_t llama_n_threads_batch(struct llama_context * ctx) {
     return ctx->cparams.n_threads_batch;
 }
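
With n_threads now signed, the thread-count setter and getters switch to int32_t as well. A short sketch of adjusting them at runtime, assuming an existing context:

    #include "llama.h"
    #include <cstdio>

    void tune_threads(struct llama_context * ctx) {
        // 6 threads for generation, 12 for batch/prompt processing
        llama_set_n_threads(ctx, 6, 12);

        printf("gen threads: %d, batch threads: %d\n",
               llama_n_threads(ctx), llama_n_threads_batch(ctx));
    }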