add gpu support in llama_model_kvcache_size and llama_model_compute_buf_size

Author: Lizonghang
Date:   2024-11-29 21:06:32 +04:00
Parent: f8e9dc2713
Commit: 6f54a12c7d
3 changed files with 58 additions and 28 deletions


@@ -880,7 +880,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
     total_latency += gpu_latency_per_layer * n_gpu_layers;
     total_latency += cpu_latency_per_layer * (n_layers - n_gpu_layers);
 #else
-    total_latency *= cpu_latency_per_layer * n_layers;
+    total_latency += cpu_latency_per_layer * n_layers;
 #endif
     total_latency += (double)n_flops.output_f32_f32 / (double)cpu.flops_f32_f32 / 1e9;
@@ -900,8 +900,8 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 
 static float device_memory_access_delay(struct device_info & dev_info, int n_layers) {
     struct model_params n_params = dev_info.model_params;
-    int64_t total_bytes = 0;
-    total_bytes += n_params.layer_f32 * 4 +
+    int64_t total_bytes =
+        n_params.layer_f32 * 4 +
         n_params.layer_f16 * 2 +
         n_params.layer_q4k / 2 +
         n_params.layer_q6k * 3 / 8 +
@@ -929,14 +929,22 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     int n_layers = llama_model_n_layers(model);
     int n_gpu_layers = cparams.n_gpu_layers;
+    uint64_t cpu_kv_size;
+    uint64_t gpu_kv_size;
+    uint64_t cpu_compute_buf;
+    uint64_t gpu_compute_buf;
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
-    double cpu_kv_size_gb = 0.0f;
-    double cpu_compute_buf_gb = 0.0f;
+    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true);
 #else
-    double cpu_kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
-    double cpu_compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
+    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false);
 #endif
+    double cpu_kv_size_gb = static_cast<double>(cpu_kv_size) / 1e9; // convert to GB
+    double cpu_compute_buf_gb = static_cast<double>(cpu_compute_buf) / 1e9; // convert to GB
     int64_t cpu_total_bytes =
         n_params.layer_f32 * 4 +
         n_params.layer_f16 * 2 +
@@ -947,6 +955,7 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     cpu_total_bytes *= (n_layers - n_gpu_layers);
 #else
+    (void)n_gpu_layers;
     cpu_total_bytes *= n_layers;
 #endif
@@ -960,13 +969,7 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     float cpu_total_gbytes = (double)cpu_total_bytes / 1e9; // convert to GB
     float cpu_mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
     cpu_mem_avail -= static_cast<float>(cpu_kv_size_gb);
-    if (cpu_mem_avail - static_cast<float>(cpu_compute_buf_gb) < cpu_total_gbytes) {
-        double compressed_compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, true)) / 1e9; // convert to GB
-        cpu_mem_avail -= static_cast<float>(compressed_compute_buf_gb);
-    } else {
-        cpu_mem_avail -= static_cast<float>(cpu_compute_buf_gb);
-    }
+    cpu_mem_avail -= static_cast<float>(cpu_compute_buf_gb);
 
 #ifdef __linux__
     float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
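
Note on the last hunk: the KV cache and the compute buffer are now always subtracted from the available RAM, instead of conditionally falling back to a compressed compute buffer. A rough, self-contained sketch of that bookkeeping with made-up sizes (the 8 GB of RAM, 1.6 GB KV cache, 0.4 GB compute buffer, 7.2 GB of CPU-side weights and 2 GB/s disk bandwidth are illustrative only, and the final division by disk bandwidth is an assumption about the code that follows this hunk, which is not shown here):

    #include <algorithm>
    #include <cstdio>

    int main() {
        // hypothetical sizes in GB (illustration only, not measured values)
        double cpu_mem_avail      = 8.0; // available physical RAM
        double cpu_kv_size_gb     = 1.6; // CPU-side KV cache
        double cpu_compute_buf_gb = 0.4; // CPU-side compute buffer
        double cpu_total_gbytes   = 7.2; // CPU-side model weights
        double disk_read_bw       = 2.0; // sequential disk read bandwidth, GB/s

        // as in the hunk above: both buffers always reduce the usable RAM
        cpu_mem_avail -= cpu_kv_size_gb;
        cpu_mem_avail -= cpu_compute_buf_gb;

        // assumption: weights that no longer fit in RAM have to be read from disk
        double spill_gb   = std::max(0.0, cpu_total_gbytes - cpu_mem_avail);
        double disk_delay = spill_gb / disk_read_bw; // seconds

        std::printf("usable RAM: %.2f GB, spilled to disk: %.2f GB, disk delay: %.2f s\n",
                    cpu_mem_avail, spill_gb, disk_delay);
        return 0;
    }

With these numbers the sketch prints a usable 6.00 GB, 1.20 GB spilled to disk, and a 0.60 s delay.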


@@ -525,10 +525,20 @@
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
     // Return the size of compute buffer size, including input tensors and activations
-    LLAMA_API uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory);
+    LLAMA_API void llama_model_compute_buf_size(
+            uint64_t * cpu_buf,
+            uint64_t * gpu_buf,
+            const struct llama_model * model,
+            const struct llama_context_params cparams,
+            bool use_gpu);
 
     // Return the size of KV cache in the model
-    LLAMA_API uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams);
+    LLAMA_API void llama_model_kvcache_size(
+            uint64_t * cpu_cache,
+            uint64_t * gpu_cache,
+            const struct llama_model * model,
+            const struct llama_context_params cparams,
+            bool use_gpu);
 
     // Return the total number of float operations in the model
     LLAMA_API void llama_model_n_flops(
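
For reference, a minimal caller-side sketch of the new out-parameter API, modeled on the updated call sites in the profiler hunks above. The helper report_memory_split is hypothetical (not part of the API), and it assumes the same GGML_USE_METAL / GGML_USE_CUDA build flags that the profiler code checks:

    #include <cstdint>
    #include <cstdio>

    #include "llama.h"

    // hypothetical helper: report how the KV cache and compute buffer sizes
    // split between CPU and GPU for a given model/context configuration
    static void report_memory_split(const struct llama_model * model,
                                    const struct llama_context_params cparams) {
        uint64_t cpu_kv_size, gpu_kv_size;
        uint64_t cpu_compute_buf, gpu_compute_buf;

    #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
        const bool use_gpu = true;  // split sizes across cparams.n_gpu_layers
    #else
        const bool use_gpu = false; // attribute everything to the CPU
    #endif

        llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, use_gpu);
        llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, use_gpu);

        std::printf("KV cache:    CPU %.2f GB, GPU %.2f GB\n", cpu_kv_size / 1e9, gpu_kv_size / 1e9);
        std::printf("compute buf: CPU %.2f GB, GPU %.2f GB\n", cpu_compute_buf / 1e9, gpu_compute_buf / 1e9);
    }

Passing use_gpu = false reproduces the old behaviour: the GPU outputs are set to zero and the full size is attributed to the CPU.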


@@ -20810,7 +20810,12 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
         }
     }
 
-uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory) {
+void llama_model_compute_buf_size(
+        uint64_t * cpu_buf,
+        uint64_t * gpu_buf,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
     const llama_hparams hparams = model->hparams;
 
     // input tensors
@@ -20831,30 +20836,42 @@ uint64_t llama_model_compute_buf_size(const struct llama_model * model, const st
     const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
 
     // compute buffer size for input, each layer, and output
-    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
     const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
                                 n_inp_out_ids + n_norm + n_qcur + n_kq
                                ) * ggml_type_size(GGML_TYPE_F32);
-    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
 
-    uint64_t n_buf_total = 0;
-    if (cparams.rank == 0) {
-        if (compress_memory) {
-            n_buf_total = n_buf_inp / 2 + n_buf_act + n_buf_out / 2; // consider compressed memory with ratio 2:1
+    if (use_gpu) {
+        *gpu_buf = n_buf_act;
+        if (llama_model_n_layers(model) > cparams.n_gpu_layers) {
+            *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
         } else {
-            n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+            *cpu_buf = n_buf_inp + n_buf_out;
         }
     } else {
-        n_buf_total = n_buf_act;
+        *gpu_buf = 0;
+        *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
     }
-    return n_buf_total;
 }
 
-uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
+void llama_model_kvcache_size(
+        uint64_t * cpu_cache,
+        uint64_t * gpu_cache,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
     const llama_hparams hparams = model->hparams;
     uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
     uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
-    return (ne_k + ne_v) * llama_model_n_layers(model);
+    if (use_gpu) {
+        int n_gpu_layers = cparams.n_gpu_layers;
+        *gpu_cache = (ne_k + ne_v) * n_gpu_layers;
+        *cpu_cache = (ne_k + ne_v) * (llama_model_n_layers(model) - n_gpu_layers);
+    } else {
+        *gpu_cache = 0;
+        *cpu_cache = (ne_k + ne_v) * llama_model_n_layers(model);
+    }
 }
 
 void llama_model_n_flops(
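
As a sanity check on the KV-cache split, the following self-contained example reproduces the arithmetic of llama_model_kvcache_size with made-up hyperparameters (the values for n_embd_k_gqa, n_embd_v_gqa, n_ctx, the 2-byte K/V element size, 32 layers and 20 GPU layers are purely illustrative, not taken from any particular model):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // hypothetical hyperparameters (illustration only)
        const uint64_t n_embd_k_gqa = 1024; // per-layer K embedding width
        const uint64_t n_embd_v_gqa = 1024; // per-layer V embedding width
        const uint64_t n_ctx        = 4096; // context length
        const uint64_t type_size    = 2;    // bytes per element, e.g. an F16 K/V cache
        const int      n_layers     = 32;
        const int      n_gpu_layers = 20;

        // same per-layer sizes as in llama_model_kvcache_size
        const uint64_t ne_k = n_embd_k_gqa * n_ctx * type_size;
        const uint64_t ne_v = n_embd_v_gqa * n_ctx * type_size;

        // use_gpu == true: the cache is split between GPU and CPU layers
        const uint64_t gpu_cache = (ne_k + ne_v) * n_gpu_layers;
        const uint64_t cpu_cache = (ne_k + ne_v) * (n_layers - n_gpu_layers);

        std::printf("per-layer KV: %.1f MB, GPU: %.2f GB, CPU: %.2f GB\n",
                    (ne_k + ne_v) / 1e6, gpu_cache / 1e9, cpu_cache / 1e9);
        return 0;
    }

With these numbers each layer holds about 16.8 MB of K/V data, so the 20 GPU layers account for roughly 0.34 GB and the remaining 12 CPU layers for roughly 0.20 GB; with use_gpu == false the whole 0.54 GB would be attributed to the CPU.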