allow specifying a different thread count for GPU blas

Concedo 2023-05-03 21:19:59 +08:00
parent 89044502fe
commit 4857739ab5
3 changed files with 15 additions and 2 deletions


@@ -41,6 +41,7 @@ static llama_context * llama_ctx_v1;
 static gpt_params params;
 static int n_past = 0;
 static int n_threads = 4;
+static int n_blasthreads = 4;
 static int n_batch = 8;
 static bool useSmartContext = false;
 static bool unbanTokens = false;
@@ -137,6 +138,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     file_format = in_file_format;
     n_threads = params.n_threads = inputs.threads;
+    n_blasthreads = inputs.blasthreads;
     n_batch = params.n_batch = inputs.batch_size;
     modelname = params.model = inputs.model_filename;
     useSmartContext = inputs.use_smartcontext;
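
The hunk above reads a blasthreads field off the FFI input struct, so load_model_inputs must have gained a matching member in one of the other changed files not shown in this diff. A minimal sketch of the relevant part of that struct, assuming types and names that simply mirror the assignments above:

// Illustrative only: the real definition lives in another changed file,
// so the types, names, and ordering here are assumptions.
struct load_model_inputs
{
    int threads;                  // CPU threads for normal generation
    int blasthreads;              // assumed new field: threads used while BLAS is active
    int batch_size;               // prompt-processing batch size
    const char * model_filename;  // path to the model on disk
    bool use_smartcontext;
    // ...remaining fields omitted
};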
@@ -460,6 +462,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
             params.n_threads = 1; //do not limit here anymore.
         }
+        else
+        {
+            params.n_threads = n_blasthreads;
+        }
     }
     current_context_tokens.resize(n_past);
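
Taken together, the three hunks reduce to a three-way thread-count choice at generation time. The following condensed sketch is not the committed code; blas_active and pin_to_one stand in for conditions that sit outside the shown hunks:

// Sketch of the resulting selection logic, under the assumptions above.
static int pick_thread_count(bool blas_active, bool pin_to_one,
                             int n_threads, int n_blasthreads)
{
    if (!blas_active)
    {
        return n_threads;     // normal generation keeps the regular CPU count
    }
    if (pin_to_one)
    {
        return 1;             // pre-existing path ("do not limit here anymore.")
    }
    return n_blasthreads;     // new: dedicated thread count while BLAS runs
}

Before this commit the BLAS branch could only drop to the single-thread path; exposing n_blasthreads separately lets a user tune the thread count for the BLAS-accelerated prompt pass independently of the count used for ordinary token generation.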