Mirror of https://github.com/LostRuins/koboldcpp.git
fix to allow clblast to work even after blas backend splitoff
Commit ba9ef4d01b (parent b53e760557)
20 changed files with 802 additions and 466 deletions
llama.cpp (37 changed lines)
@@ -26,6 +26,10 @@
 #  include "ggml-kompute.h"
 #endif
 
+#ifdef GGML_USE_BLAS
+#  include "ggml-blas.h"
+#endif
+
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
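The added include follows the same compile-time guard pattern as the other optional backends, so a build configured for CLBlast but without the new BLAS backend still compiles. A minimal standalone illustration of that pattern (not part of the patch; macro names as used in ggml):

#include <cstdio>

// Report which optional backends this binary was compiled with, using the same
// GGML_USE_* guards as llama.cpp; blocks for undefined macros are simply dropped.
static void print_compiled_backends(void) {
#ifdef GGML_USE_CLBLAST
    std::puts("CLBlast backend compiled in");
#endif
#ifdef GGML_USE_BLAS
    std::puts("BLAS backend compiled in");
#endif
#ifdef GGML_USE_METAL
    std::puts("Metal backend compiled in");
#endif
}

int main(void) {
    print_compiled_backends();
    return 0;
}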
@@ -2331,9 +2335,13 @@ struct llama_context {
     std::vector<ggml_backend_t> backends;
 #ifdef GGML_USE_METAL
     ggml_backend_t backend_metal = nullptr;
 #endif
+#ifdef GGML_USE_BLAS
+    ggml_backend_t backend_blas = nullptr;
+#endif
     ggml_backend_t backend_cpu = nullptr;
 
     const llama_model & model;
 
     // key + value cache for the self attention
@@ -11607,7 +11615,8 @@ static struct ggml_cgraph * llama_build_graph(
         if (batch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 for (auto * backend : lctx.backends) {
-                    if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+                    if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
+                        (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
                         ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
                         break;
                     }
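The condition changes from a buffer-type check alone to requiring both buffer-type support and op support (or offload support) on the candidate backend. A sketch of that predicate as a standalone helper (helper name hypothetical; the ggml-backend calls are the ones used in the hunk above):

#include "ggml-backend.h"

// True when `backend` can both allocate the layer's buffer type and either run
// or offload the given tensor; this is the test applied to the "norm" tensors above.
static bool backend_can_take_tensor(ggml_backend_t backend,
                                    ggml_backend_buffer_type_t layer_buft,
                                    const struct ggml_tensor * cur) {
    return ggml_backend_supports_buft(backend, layer_buft) &&
           (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur));
}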
@@ -12104,6 +12113,11 @@ static void llama_graph_compute(
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
+#ifdef GGML_USE_BLAS
+    if (lctx.backend_blas != nullptr) {
+        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
+    }
+#endif
 
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
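With BLAS now living in its own backend, its thread count has to be set explicitly alongside the CPU backend before the graph is dispatched to the scheduler. A condensed sketch of that setup (helper name hypothetical; the ggml_backend_* setters are the ones used above):

#include "ggml-backend.h"
#ifdef GGML_USE_BLAS
#  include "ggml-blas.h"
#endif

// Apply the same thread count to the CPU backend and, when compiled in, the BLAS backend.
static void set_compute_threads(ggml_backend_t backend_cpu, ggml_backend_t backend_blas, int n_threads) {
    if (backend_cpu != nullptr) {
        ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
    }
#ifdef GGML_USE_BLAS
    if (backend_blas != nullptr) {
        ggml_backend_blas_set_n_threads(backend_blas, n_threads);
    }
#else
    (void) backend_blas; // unused in builds without the BLAS backend
#endif
}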
@@ -12326,17 +12340,6 @@ static int llama_decode_internal(
         }
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
-        // for big prompts, if BLAS is enabled, it is better to use only one thread
-        // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-        // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
-        //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
-        //       with the BLAS calls. need a better solution
-        // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
-        //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
-        if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
-            n_threads = std::min(4, n_threads);
-        }
-
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
 
         llama_set_inputs(lctx, u_batch);
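The deleted block was the old heuristic that capped CPU threads for large non-MoE prompts when a CPU BLAS library was doing the matrix multiplications, so that spinning worker threads would not compete with the BLAS calls; with BLAS routed through its own backend the cap is no longer applied here. For reference, the dropped logic restated as a standalone function (name hypothetical):

#include <algorithm>
#include "ggml.h"

// Old behaviour only: limit CPU threads to 4 for prompts of 32+ tokens on non-MoE
// models when a CPU BLAS library is present and no GPU BLAS is in use.
static int cap_threads_for_blas(int n_threads, int n_tokens, int n_expert) {
    if (n_tokens >= 32 && n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
        return std::min(4, n_threads);
    }
    return n_threads;
}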
@@ -16562,6 +16565,16 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(backend);
         }
 #endif
+
+#ifdef GGML_USE_BLAS
+        ctx->backend_blas = ggml_backend_blas_init();
+        if (ctx->backend_blas == nullptr) {
+            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
+        } else {
+            ctx->backends.push_back(ctx->backend_blas);
+        }
+#endif
+
 #if defined(GGML_USE_RPC)
         if (model->n_gpu_layers > 0) {
             for (const auto & endpoint : model->rpc_servers) {
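The BLAS backend is optional at runtime as well as at compile time: a failed ggml_backend_blas_init() only logs a warning and the context continues with the remaining backends. A standalone sketch of that init-with-fallback pattern (helper name hypothetical; logging via stderr instead of LLAMA_LOG_WARN):

#include <cstdio>
#include <vector>
#include "ggml-backend.h"
#ifdef GGML_USE_BLAS
#  include "ggml-blas.h"
#endif

// Try to append the optional BLAS backend to the backend list; failure is not fatal.
// Backends collected here are expected to be released later with ggml_backend_free().
static void try_add_blas_backend(std::vector<ggml_backend_t> & backends) {
#ifdef GGML_USE_BLAS
    ggml_backend_t backend_blas = ggml_backend_blas_init();
    if (backend_blas == nullptr) {
        std::fprintf(stderr, "%s: failed to initialize BLAS backend\n", __func__);
    } else {
        backends.push_back(backend_blas);
    }
#else
    (void) backends; // nothing to add in builds without the BLAS backend
#endif
}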