From 2819f784d4c0073d176b8ac3bb633f5f3ae9570f Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 12 May 2025 18:06:10 +0800 Subject: [PATCH] use a threadpool, seems to improve tg performance --- gpttype_adapter.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index e56c1802b..7cc574c4d 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -2343,6 +2343,21 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, kcpp_data->model_filename.c_str()); return ModelLoadResult::FAIL; } + + //we use a threadpool, greatly speeds up qwen3moe tg (presumably token generation - confirm): one pool is sized by kcpp_data->n_threads and a second by kcpp_data->n_blasthreads, and both are attached to llama_ctx_v4 below. NOTE(review): if only one of the two pools is created, the FAIL return leaves the other one unreleased - confirm whether ggml_threadpool_free is needed on that path + ggml_threadpool_params threadpool1_params, threadpool2_params; + ggml_threadpool_params_init(&threadpool1_params,kcpp_data->n_threads); + ggml_threadpool_params_init(&threadpool2_params,kcpp_data->n_blasthreads); + + printf("Threadpool set to %d threads and %d blasthreads...\n", kcpp_data->n_threads,kcpp_data->n_blasthreads); + struct ggml_threadpool * threadpool1 = ggml_threadpool_new(&threadpool1_params); + struct ggml_threadpool * threadpool2 = ggml_threadpool_new(&threadpool2_params); + if (!threadpool1 || !threadpool2) { + fprintf(stderr, "%s: error: failed to create threadpool.\n", __func__); + return ModelLoadResult::FAIL; + } + llama_attach_threadpool(llama_ctx_v4, threadpool1, threadpool2); + if (lora_filename != "") { printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());