mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-06 06:09:04 +00:00)
decrease compute buf from available memory
This commit is contained in:
parent 329d084061
commit 0f73d12247

3 changed files with 53 additions and 4 deletions

@@ -877,9 +877,10 @@ static float device_memory_access_delay(struct device_info & dev_info, int n_lay
 }
 
 static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
     auto n_params = dev_info.model_params;
     int n_layers = llama_model_n_layers(model);
     double kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
+    double compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
 
     int64_t total_bytes = 0;
     total_bytes += n_params.layer_f32 * 4 +

@@ -899,7 +900,14 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     float total_gbytes = (double)total_bytes / 1e9; // convert to GB
     float mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
     mem_avail -= static_cast<float>(kv_size_gb);
+    // todo: consider activations which also consumes the available memory
+    if (mem_avail - static_cast<float>(compute_buf_gb) < total_gbytes) {
+        double compressed_compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, true)) / 1e9; // convert to GB
+        mem_avail -= static_cast<float>(compressed_compute_buf_gb);
+    } else {
+        mem_avail -= static_cast<float>(compute_buf_gb);
+    }
 
 #ifdef __linux__
     float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
 #else
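
The hunk above changes how device_disk_access_delay budgets memory: after the KV cache is subtracted, it reserves either the full or the compressed compute buffer, depending on whether the model weights would still fit. Below is a minimal standalone sketch of that decision; the GB figures are illustrative placeholders and the helper name pick_compute_buf_gb is hypothetical, not code from the repository.

#include <cstdio>

// Hypothetical standalone sketch of the branch added to device_disk_access_delay:
// after reserving the KV cache, reserve either the full or the "compressed"
// compute buffer, depending on whether the model weights still fit in RAM.
static double pick_compute_buf_gb(double mem_avail_gb, double weights_gb,
                                  double full_buf_gb, double compressed_buf_gb) {
    // If reserving the full compute buffer would leave too little room for the
    // weights, fall back to the smaller (compressed) buffer estimate.
    if (mem_avail_gb - full_buf_gb < weights_gb) {
        return compressed_buf_gb;
    }
    return full_buf_gb;
}

int main() {
    // Illustrative numbers only (GB), not measured values.
    double mem_avail_gb = 8.0;   // available physical memory after the KV cache
    double weights_gb   = 7.0;   // model weights that must be read from disk
    double full_buf_gb  = 1.5;   // llama_model_compute_buf_size(..., false) / 1e9
    double compr_buf_gb = 0.8;   // llama_model_compute_buf_size(..., true)  / 1e9

    mem_avail_gb -= pick_compute_buf_gb(mem_avail_gb, weights_gb, full_buf_gb, compr_buf_gb);
    std::printf("memory left for weights: %.2f GB\n", mem_avail_gb);
    return 0;
}

This mirrors the condition mem_avail - compute_buf_gb < total_gbytes in the hunk: only when the full buffer would crowd out the weights does the code fall back to the smaller estimate.
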

@@ -523,6 +523,9 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Return the size of compute buffer size, including input tensors and activations
+    LLAMA_API uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory);
+
     // Return the size of KV cache in the model
     LLAMA_API uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams);
 
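
A caller-side view of the two accessors declared above might look like the following sketch. It assumes prima.cpp's llama.h with this commit applied and an already loaded model; the helper estimate_runtime_bytes is illustrative and not part of the API.

#include <cstdint>
#include "llama.h"

// Hypothetical helper: rough non-weight memory demand of a context, built only
// from the two accessors declared in this commit (KV cache + compute buffer).
static uint64_t estimate_runtime_bytes(const struct llama_model * model,
                                       struct llama_context_params cparams,
                                       bool compress_memory) {
    const uint64_t kv_bytes  = llama_model_kvcache_size(model, cparams);
    const uint64_t buf_bytes = llama_model_compute_buf_size(model, cparams, compress_memory);
    return kv_bytes + buf_bytes;
}
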

@@ -20808,6 +20808,44 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
+uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory) {
+    const llama_hparams hparams = model->hparams;
+
+    // input tensors
+    const uint64_t n_inp_toks = cparams.n_ubatch;
+    const uint64_t n_inp_embd = hparams.n_embd * cparams.n_ubatch;
+
+    // activations (see figures/memory-allocation-map-for-activations.png for detailed allocation)
+    const uint64_t n_bak_embd = hparams.n_embd * cparams.n_ubatch;
+    const uint64_t n_inp_pos = cparams.n_ubatch;
+    const uint64_t n_kq_mask = cparams.n_ctx * cparams.n_ubatch;
+    const uint64_t n_inp_out_ids = cparams.n_ubatch;
+    const uint64_t n_norm = hparams.n_embd * cparams.n_ubatch;
+    const uint64_t n_qcur = hparams.n_embd * cparams.n_ubatch * 2;
+    const uint64_t n_kq = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
+
+    // outputs
+    const uint64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
+    const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
+
+    // compute buffer size for input, each layer, and output
+    // const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32) / 2; // consider compressed memory with ratio 2:1
+    const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
+                                n_inp_out_ids + n_norm + n_qcur + n_kq
+                               ) * ggml_type_size(GGML_TYPE_F32);
+    // const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32) / 2; // consider compressed memory with ratio 2:1
+
+    uint64_t n_buf_total = 0;
+    if (cparams.rank == 0) {
+        n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+    } else {
+        n_buf_total = n_buf_act;
+    }
+    return n_buf_total;
+}
+
 uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
     const llama_hparams hparams = model->hparams;
     uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
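
As a sanity check, the accounting in llama_model_compute_buf_size can be reproduced outside the library. The sketch below mirrors the element counts and the 2:1 input/output compression assumption from the hunk above, using made-up hyperparameters (n_embd, n_vocab, n_head, n_ctx, n_ubatch are illustrative, not taken from any particular model).

#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative hyperparameters and context settings (assumptions, not from the commit).
    const uint64_t n_embd   = 4096;
    const uint64_t n_vocab  = 32000;
    const uint64_t n_head   = 32;
    const uint64_t n_ctx    = 1024;
    const uint64_t n_ubatch = 512;
    const uint64_t f32_size = 4;    // ggml_type_size(GGML_TYPE_F32)

    // Same element counts as the new function.
    const uint64_t n_inp_toks    = n_ubatch;
    const uint64_t n_inp_embd    = n_embd * n_ubatch;
    const uint64_t n_bak_embd    = n_embd * n_ubatch;
    const uint64_t n_inp_pos     = n_ubatch;
    const uint64_t n_kq_mask     = n_ctx * n_ubatch;
    const uint64_t n_inp_out_ids = n_ubatch;
    const uint64_t n_norm        = n_embd * n_ubatch;
    const uint64_t n_qcur        = n_embd * n_ubatch * 2;
    const uint64_t n_kq          = n_ctx * n_ubatch * n_head;
    const uint64_t n_out_embd    = n_embd * n_ubatch;
    const uint64_t n_output      = n_vocab * n_ubatch;

    // Input/output buffers use the 2:1 compression ratio assumed in the commit.
    const uint64_t buf_inp = (n_inp_toks + n_inp_embd) * f32_size / 2;
    const uint64_t buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
                              n_inp_out_ids + n_norm + n_qcur + n_kq) * f32_size;
    const uint64_t buf_out = (n_out_embd + n_output) * f32_size / 2;

    // rank 0 holds input + activations + output; other ranks only activations.
    std::printf("rank 0 : %.3f GB\n", (buf_inp + buf_act + buf_out) / 1e9);
    std::printf("rank >0: %.3f GB\n", (double)buf_act / 1e9);
    return 0;
}

On rank 0 the head device keeps the input and output buffers in addition to the per-layer activations, which is why its estimate is larger than that of the other ranks.
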