mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-12 09:59:41 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/full-cuda.Dockerfile # .devops/llama-cli-cuda.Dockerfile # .devops/llama-server-cuda.Dockerfile # .devops/llama-server-intel.Dockerfile # .devops/llama-server-rocm.Dockerfile # .devops/llama-server-vulkan.Dockerfile # .devops/llama-server.Dockerfile # .github/workflows/docker.yml # docs/docker.md # examples/llama-bench/llama-bench.cpp # flake.lock # ggml/include/ggml.h # ggml/src/CMakeLists.txt # scripts/sync-ggml.last # src/llama.cpp # tests/test-backend-ops.cpp # tests/test-grad0.cpp # tests/test-rope.cpp
This commit is contained in:
commit
d220495dd4
42 changed files with 100585 additions and 99448 deletions
|
@ -2386,8 +2386,8 @@ struct llama_cparams {
|
|||
uint32_t n_batch;
|
||||
uint32_t n_ubatch;
|
||||
uint32_t n_seq_max;
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
int n_threads; // number of threads to use for generation
|
||||
int n_threads_batch; // number of threads to use for batch processing
|
||||
|
||||
float rope_freq_base;
|
||||
float rope_freq_scale;
|
||||
|
@ -3104,6 +3104,9 @@ struct llama_context {
|
|||
#endif
|
||||
ggml_backend_t backend_cpu = nullptr;
|
||||
|
||||
ggml_threadpool_t threadpool = nullptr;
|
||||
ggml_threadpool_t threadpool_batch = nullptr;
|
||||
|
||||
bool has_evaluated_once = false;
|
||||
|
||||
int64_t t_start_us;
|
||||
|
@ -15570,9 +15573,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
|
|||
}
|
||||
|
||||
static void llama_graph_compute(
|
||||
llama_context & lctx,
|
||||
ggml_cgraph * gf,
|
||||
int n_threads) {
|
||||
llama_context & lctx,
|
||||
ggml_cgraph * gf,
|
||||
int n_threads,
|
||||
ggml_threadpool * threadpool) {
|
||||
#ifdef GGML_USE_METAL
|
||||
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
||||
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
||||
|
@ -15581,6 +15585,7 @@ static void llama_graph_compute(
|
|||
|
||||
if (lctx.backend_cpu != nullptr) {
|
||||
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
||||
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
||||
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
||||
}
|
||||
#ifdef GGML_USE_BLAS
|
||||
|
@ -15701,6 +15706,8 @@ static int llama_decode_internal(
|
|||
}
|
||||
|
||||
int n_threads = (n_tokens < 32) ? cparams.n_threads : cparams.n_threads_batch;
|
||||
ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
|
||||
|
||||
GGML_ASSERT(n_threads > 0);
|
||||
|
||||
// non-causal masks do not use the KV cache
|
||||
|
@ -15762,7 +15769,7 @@ static int llama_decode_internal(
|
|||
|
||||
llama_set_inputs(lctx, ubatch);
|
||||
|
||||
llama_graph_compute(lctx, gf, n_threads);
|
||||
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
||||
|
||||
// update the kv ring buffer
|
||||
{
|
||||
|
@ -15939,7 +15946,9 @@ static int llama_encode_internal(
|
|||
lctx.inp_embd_enc = NULL;
|
||||
lctx.n_outputs = n_tokens;
|
||||
|
||||
const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
||||
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
||||
ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
|
||||
|
||||
GGML_ASSERT(n_threads > 0);
|
||||
|
||||
ggml_backend_sched_reset(lctx.sched);
|
||||
|
@ -15971,7 +15980,7 @@ static int llama_encode_internal(
|
|||
|
||||
llama_set_inputs(lctx, ubatch);
|
||||
|
||||
llama_graph_compute(lctx, gf, n_threads);
|
||||
llama_graph_compute(lctx, gf, n_threads, threadpool);
|
||||
|
||||
// extract embeddings
|
||||
if (embd) {
|
||||
|
@ -16253,7 +16262,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|||
|
||||
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
|
||||
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
|
||||
#endif
|
||||
|
||||
//const int64_t t_end = ggml_time_us();
|
||||
|
@ -16279,7 +16288,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
|||
|
||||
llama_set_k_shift(lctx);
|
||||
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
|
||||
|
||||
need_reserve = true;
|
||||
}
|
||||
|
@ -16898,7 +16907,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
|
||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||
if (name.find("attn_v.weight") != std::string::npos ||
|
||||
name.find("attn_qkv.weight") != std::string::npos) {
|
||||
name.find("attn_qkv.weight") != std::string::npos ||
|
||||
name.find("attn_kv_b.weight")!= std::string::npos) {
|
||||
++qs.n_attention_wv;
|
||||
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
|
||||
qs.has_output = true;
|
||||
|
@ -17512,6 +17522,19 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
|
|||
}
|
||||
}
|
||||
|
||||
void llama_attach_threadpool(
|
||||
struct llama_context * ctx,
|
||||
ggml_threadpool_t threadpool,
|
||||
ggml_threadpool_t threadpool_batch) {
|
||||
ctx->threadpool = threadpool;
|
||||
ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
|
||||
}
|
||||
|
||||
void llama_detach_threadpool(struct llama_context * ctx) {
|
||||
ctx->threadpool = nullptr;
|
||||
ctx->threadpool_batch = nullptr;
|
||||
}
|
||||
|
||||
void llama_backend_free(void) {
|
||||
ggml_quantize_free();
|
||||
}
|
||||
|
@ -19428,7 +19451,6 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
void printcache(struct llama_context * ctx)
|
||||
{
|
||||
struct llama_kv_cache & cache = ctx->kv_self;
|
||||
|
@ -19439,16 +19461,16 @@ void printcache(struct llama_context * ctx)
|
|||
printf("%s",vals.c_str());
|
||||
}
|
||||
|
||||
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
||||
void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
|
||||
ctx->cparams.n_threads = n_threads;
|
||||
ctx->cparams.n_threads_batch = n_threads_batch;
|
||||
}
|
||||
|
||||
uint32_t llama_n_threads(struct llama_context * ctx) {
|
||||
int32_t llama_n_threads(struct llama_context * ctx) {
|
||||
return ctx->cparams.n_threads;
|
||||
}
|
||||
|
||||
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
|
||||
int32_t llama_n_threads_batch(struct llama_context * ctx) {
|
||||
return ctx->cparams.n_threads_batch;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue