mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-10 06:54:33 +00:00)
add gpu support in llama_model_kvcache_size and llama_model_compute_buf_size
This commit is contained in:
parent f8e9dc2713
commit 6f54a12c7d
3 changed files with 58 additions and 28 deletions
@@ -20810,7 +20810,12 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
-uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory) {
+void llama_model_compute_buf_size(
+        uint64_t * cpu_buf,
+        uint64_t * gpu_buf,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
     const llama_hparams hparams = model->hparams;
 
     // input tensors
@@ -20831,30 +20836,42 @@ uint64_t llama_model_compute_buf_size(const struct llama_model * model, const st
     const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
 
     // compute buffer size for input, each layer, and output
-    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
     const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
                                 n_inp_out_ids + n_norm + n_qcur + n_kq
                                ) * ggml_type_size(GGML_TYPE_F32);
-    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
 
-    uint64_t n_buf_total = 0;
-    if (cparams.rank == 0) {
-        if (compress_memory) {
-            n_buf_total = n_buf_inp / 2 + n_buf_act + n_buf_out / 2; // consider compressed memory with ratio 2:1
+    if (use_gpu) {
+        *gpu_buf = n_buf_act;
+        if (llama_model_n_layers(model) > cparams.n_gpu_layers) {
+            *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
         } else {
-            n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+            *cpu_buf = n_buf_inp + n_buf_out;
         }
     } else {
-        n_buf_total = n_buf_act;
+        *gpu_buf = 0;
+        *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
     }
-    return n_buf_total;
 }
 
-uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
+void llama_model_kvcache_size(
+        uint64_t * cpu_cache,
+        uint64_t * gpu_cache,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
     const llama_hparams hparams = model->hparams;
     uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
     uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
-    return (ne_k + ne_v) * llama_model_n_layers(model);
+    if (use_gpu) {
+        int n_gpu_layers = cparams.n_gpu_layers;
+        *gpu_cache = (ne_k + ne_v) * n_gpu_layers;
+        *cpu_cache = (ne_k + ne_v) * (llama_model_n_layers(model) - n_gpu_layers);
+    } else {
+        *gpu_cache = 0;
+        *cpu_cache = (ne_k + ne_v) * llama_model_n_layers(model);
+    }
 }
 
 void llama_model_n_flops(
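
For context, a caller-side sketch of how the new out-parameter signatures might be used (not part of this commit): it assumes the two helpers are declared in a header visible to the caller, that "llama.h" is that header, and that a loaded model and a populated llama_context_params are already available.

// Hypothetical usage sketch: query the CPU/GPU split for the compute buffers
// and the KV cache via the signatures introduced in this commit.
#include "llama.h"   // assumed to declare llama_model_compute_buf_size / llama_model_kvcache_size
#include <cstdint>
#include <cstdio>

static void print_memory_plan(const struct llama_model * model,
                              struct llama_context_params cparams,
                              bool use_gpu) {
    uint64_t cpu_buf   = 0, gpu_buf   = 0;
    uint64_t cpu_cache = 0, gpu_cache = 0;

    // Per the diff: with use_gpu, the per-layer activation buffer is charged to
    // the GPU; the input/output buffers stay on the CPU, and the CPU also keeps
    // a full activation buffer when some layers are not offloaded
    // (llama_model_n_layers(model) > cparams.n_gpu_layers).
    llama_model_compute_buf_size(&cpu_buf, &gpu_buf, model, cparams, use_gpu);

    // Per the diff: the KV cache is split by layer count, n_gpu_layers layers'
    // worth of K/V on the GPU and the remaining layers on the CPU.
    llama_model_kvcache_size(&cpu_cache, &gpu_cache, model, cparams, use_gpu);

    fprintf(stderr, "compute buf: cpu=%llu B, gpu=%llu B\n",
            (unsigned long long) cpu_buf, (unsigned long long) gpu_buf);
    fprintf(stderr, "kv cache:    cpu=%llu B, gpu=%llu B\n",
            (unsigned long long) cpu_cache, (unsigned long long) gpu_cache);
}

Compared with the old single-return versions, reporting the CPU and GPU shares separately lets a caller budget host and device memory independently when only part of the model is offloaded.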