Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/llama-cli-cuda.Dockerfile
#	.devops/llama-server-cuda.Dockerfile
#	.devops/llama-server-intel.Dockerfile
#	.devops/llama-server-rocm.Dockerfile
#	.devops/llama-server-vulkan.Dockerfile
#	.devops/llama-server.Dockerfile
#	.github/workflows/docker.yml
#	docs/docker.md
#	examples/llama-bench/llama-bench.cpp
#	flake.lock
#	ggml/include/ggml.h
#	ggml/src/CMakeLists.txt
#	scripts/sync-ggml.last
#	src/llama.cpp
#	tests/test-backend-ops.cpp
#	tests/test-grad0.cpp
#	tests/test-rope.cpp
commit d220495dd4
Concedo 2024-08-30 10:37:39 +08:00
42 changed files with 100585 additions and 99448 deletions

src/llama.cpp

@@ -2386,8 +2386,8 @@ struct llama_cparams {
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
-    uint32_t n_threads;       // number of threads to use for generation
-    uint32_t n_threads_batch; // number of threads to use for batch processing
+    int      n_threads;       // number of threads to use for generation
+    int      n_threads_batch; // number of threads to use for batch processing
     float rope_freq_base;
     float rope_freq_scale;
@@ -3104,6 +3104,9 @@ struct llama_context {
 #endif
     ggml_backend_t backend_cpu = nullptr;
+
+    ggml_threadpool_t threadpool       = nullptr;
+    ggml_threadpool_t threadpool_batch = nullptr;
     bool has_evaluated_once = false;
     int64_t t_start_us;
@@ -15570,9 +15573,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
 }
 static void llama_graph_compute(
-        llama_context & lctx,
-          ggml_cgraph * gf,
-                  int   n_threads) {
+          llama_context & lctx,
+            ggml_cgraph * gf,
+                    int   n_threads,
+        ggml_threadpool * threadpool) {
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -15581,6 +15585,7 @@ static void llama_graph_compute(
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 #ifdef GGML_USE_BLAS
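
Taken together, the two hunks above thread the pool through llama_graph_compute and into the CPU backend via ggml_backend_cpu_set_threadpool. A self-contained sketch of that same backend-level wiring, assuming the ggml threadpool helpers that arrive with this sync (ggml_threadpool_params_default, ggml_threadpool_new, ggml_threadpool_free) and reducing the graph to a single f32 add:

    #include "ggml.h"
    #include "ggml-backend.h"

    int main(void) {
        // build a trivial graph: c = a + b
        struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_f32(a, 1.0f);
        ggml_set_f32(b, 2.0f);
        struct ggml_tensor * c = ggml_add(ctx, a, b);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);

        // create a pool and hand it to the CPU backend, mirroring what the patched
        // llama_graph_compute does before the graph is computed
        ggml_backend_t cpu = ggml_backend_cpu_init();
        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(4);
        ggml_threadpool_t tp = ggml_threadpool_new(&tpp);

        ggml_backend_cpu_set_n_threads (cpu, 4);
        ggml_backend_cpu_set_threadpool(cpu, tp);

        ggml_backend_graph_compute(cpu, gf);

        ggml_backend_free(cpu);
        ggml_threadpool_free(tp);
        ggml_free(ctx);
        return 0;
    }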
@@ -15701,6 +15706,8 @@ static int llama_decode_internal(
     }
     int n_threads = (n_tokens < 32) ? cparams.n_threads : cparams.n_threads_batch;
+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+    GGML_ASSERT(n_threads > 0);
     // non-causal masks do not use the KV cache
@@ -15762,7 +15769,7 @@ static int llama_decode_internal(
         llama_set_inputs(lctx, ubatch);
-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);
         // update the kv ring buffer
         {
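
The decode-side hunks pick which pool a call runs on: a single-token ubatch uses the generation pool, anything larger the batch pool, and the chosen pool now rides along with n_threads into llama_graph_compute. The same rule as a standalone helper, with the name pick_threadpool being illustrative rather than taken from the patch:

    #include "ggml.h"
    #include <stdint.h>

    // mirrors the selection the patched llama_decode_internal performs
    static ggml_threadpool_t pick_threadpool(uint32_t n_tokens,
                                             ggml_threadpool_t generation_pool,
                                             ggml_threadpool_t batch_pool) {
        return n_tokens == 1 ? generation_pool : batch_pool;
    }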
@@ -15939,7 +15946,9 @@ static int llama_encode_internal(
     lctx.inp_embd_enc = NULL;
     lctx.n_outputs = n_tokens;
-    const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+    GGML_ASSERT(n_threads > 0);
     ggml_backend_sched_reset(lctx.sched);
@@ -15971,7 +15980,7 @@ static int llama_encode_internal(
         llama_set_inputs(lctx, ubatch);
-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);
         // extract embeddings
         if (embd) {
@@ -16253,7 +16262,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 #endif
     //const int64_t t_end = ggml_time_us();
@@ -16279,7 +16288,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         llama_set_k_shift(lctx);
-        llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
         need_reserve = true;
     }
@@ -16898,7 +16907,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight") != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos) {
+            name.find("attn_qkv.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight")!= std::string::npos) {
             ++qs.n_attention_wv;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
@@ -17512,6 +17522,19 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
     }
 }
+void llama_attach_threadpool(
+        struct llama_context * ctx,
+        ggml_threadpool_t      threadpool,
+        ggml_threadpool_t      threadpool_batch) {
+    ctx->threadpool       = threadpool;
+    ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+    ctx->threadpool       = nullptr;
+    ctx->threadpool_batch = nullptr;
+}
+
 void llama_backend_free(void) {
     ggml_quantize_free();
 }
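
llama_attach_threadpool and llama_detach_threadpool are the new public entry points for the fields added to llama_context earlier in the diff. A minimal usage sketch, assuming the ggml threadpool helpers that ship with this sync (ggml_threadpool_params_default, ggml_threadpool_new, ggml_threadpool_free) and an already-created context; error handling omitted:

    #include "llama.h"
    #include "ggml.h"

    void attach_pools(struct llama_context * ctx) {
        // a small pool for token-by-token generation, a bigger one for prompt/batch processing
        struct ggml_threadpool_params tpp       = ggml_threadpool_params_default(8);
        struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_default(16);

        ggml_threadpool_t tp       = ggml_threadpool_new(&tpp);
        ggml_threadpool_t tp_batch = ggml_threadpool_new(&tpp_batch);

        // passing nullptr for the batch pool would make the context fall back to tp for batches
        llama_attach_threadpool(ctx, tp, tp_batch);

        // ... llama_decode() calls happen here ...

        // detach before freeing so the context is not left holding dangling pointers
        llama_detach_threadpool(ctx);
        ggml_threadpool_free(tp_batch);
        ggml_threadpool_free(tp);
    }

The fallback to the generation pool when threadpool_batch is null mirrors the ternary in the hunk above, so callers that only care about one pool can pass a single handle.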
@@ -19428,7 +19451,6 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     }
 }
 void printcache(struct llama_context * ctx)
 {
     struct llama_kv_cache & cache = ctx->kv_self;
@@ -19439,16 +19461,16 @@ void printcache(struct llama_context * ctx)
     printf("%s",vals.c_str());
 }
-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
     ctx->cparams.n_threads       = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
-uint32_t llama_n_threads(struct llama_context * ctx) {
+int32_t llama_n_threads(struct llama_context * ctx) {
     return ctx->cparams.n_threads;
 }
-uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+int32_t llama_n_threads_batch(struct llama_context * ctx) {
     return ctx->cparams.n_threads_batch;
 }
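
With n_threads now signed, the thread-count setter and getters switch to int32_t as well. A short sketch of adjusting them at runtime, assuming an existing context:

    #include "llama.h"
    #include <cstdio>

    void tune_threads(struct llama_context * ctx) {
        // 6 threads for generation, 12 for batch/prompt processing
        llama_set_n_threads(ctx, 6, 12);

        printf("gen threads: %d, batch threads: %d\n",
               llama_n_threads(ctx), llama_n_threads_batch(ctx));
    }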