mirror of https://github.com/Lizonghang/prima.cpp.git

commit 45a1e55eec (parent 9858d90ce4): reduce kv cache from available memory
5 changed files with 25 additions and 7 deletions
@@ -914,7 +914,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
         llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world, model);
+        device_print_props(dev_info_set, n_world, model, cparams);
     } else {
         llama_send_device_info(lctx, &dev_info);
     }
@@ -876,8 +876,10 @@ static float device_memory_access_delay(struct device_info & dev_info, int n_layers) {
 #endif
 }
 
-static float device_disk_access_delay(struct device_info & dev_info, int n_layers) {
-    struct model_params n_params = dev_info.model_params;
+static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
+    auto n_params = dev_info.model_params;
+    int n_layers = llama_model_n_layers(model);
+    double kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
 
     int64_t total_bytes = 0;
     total_bytes += n_params.layer_f32 * 4 +
@@ -895,13 +897,18 @@ static float device_disk_access_delay(struct device_info & dev_info, int n_layers) {
                    n_params.output_q80;
 
     float total_gbytes = (double)total_bytes / 1e9; // convert to GB
     float mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
+    mem_avail -= static_cast<float>(kv_size_gb);
     // todo: consider activations which also consumes the available memory
+#ifdef __linux__
+    float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
+#else
     float disk_read_bw = dev_info.disk.read_rnd_bw; // GB/s
+#endif
     return std::max(0.0, static_cast<double>(total_gbytes - mem_avail) / disk_read_bw * 1000); // convert to ms
 }
 
-void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
+void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
     LOG_INF("| Property              ");
     for (int i = 0; i < n; ++i) {
@@ -1255,7 +1262,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
     int n_layers = llama_model_n_layers(model);
     latency += device_compute_delay(dev_info_set[0], n_layers);
     latency += device_memory_access_delay(dev_info_set[0], n_layers);
-    latency += device_disk_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
+    latency += device_disk_access_delay(dev_info_set[0], model, cparams); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
 
     LOG_INF("| Token latency (ms)   ");
     LOG_INF("| %-10.2f   ", latency);
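The estimate above says: whatever portion of the weights no longer fits in RAM, once the KV cache has been reserved, must be re-read from disk through mmap, and the extra latency is that spill divided by the disk read bandwidth. Below is a minimal standalone sketch of the same arithmetic, with the prima.cpp structs replaced by plain inputs; all numbers are assumed example values, not measurements from the commit.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Toy re-derivation of the device_disk_access_delay estimate:
// bytes that spill out of RAM (after reserving the KV cache) are
// served from disk, and the cost is spilled bytes / read bandwidth.
static double disk_access_delay_ms(int64_t weight_bytes, double mem_avail_gb,
                                   double kv_size_gb, double disk_read_bw_gbps) {
    double total_gb  = (double)weight_bytes / 1e9;           // model weights, GB
    double usable_gb = mem_avail_gb - kv_size_gb;            // RAM left after the KV cache
    double spill_gb  = std::max(0.0, total_gb - usable_gb);  // part reloaded via mmap
    return spill_gb / disk_read_bw_gbps * 1000.0;            // seconds -> ms
}

int main() {
    // Example inputs: 8 GB of weights, 6 GB free RAM, 1 GB KV cache,
    // 2 GB/s sequential read -> 3 GB spill -> 1500 ms per reload pass.
    std::printf("delay = %.1f ms\n",
                disk_access_delay_ms(8000000000LL, 6.0, 1.0, 2.0));
    return 0;
}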
@@ -8,6 +8,7 @@
 #define DISK_TEST_SEQ_BLOCK 100L * 1024 * 1024
 #define DISK_TEST_RND_BLOCK 4096
 
+
 struct cpu_props {
     const char * name;
     const char * description;
@@ -222,7 +223,7 @@ void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw, int
 float device_memory_bw (int n_thread);
 float device_cuda_memory_bw (struct llama_model * model);
 void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
-void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model);
+void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);
 
 int device_has_metal (void);
 int device_has_cuda (void);
@@ -523,6 +523,9 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Return the size of KV cache in the model
+    LLAMA_API uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams);
+
     // Return the total number of float operations in the model
     LLAMA_API void llama_model_n_flops(
             struct llama_model * model,
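A minimal usage sketch for the new declaration, assuming a llama_model loaded elsewhere; the 4096-token context and f16 cache types are example values, not defaults required by the commit.

#include "llama.h"
#include <cstdint>

// Sketch: bytes the KV cache would occupy for a 4096-token context with
// f16 K/V entries, queried before deciding how much RAM is left for weights.
uint64_t kv_bytes_for_example_ctx(const struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx  = 4096;            // context window (example value)
    cparams.type_k = GGML_TYPE_F16;   // K cache element type
    cparams.type_v = GGML_TYPE_F16;   // V cache element type
    return llama_model_kvcache_size(model, cparams);
}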
@@ -20808,6 +20808,13 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
+uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
+    const llama_hparams hparams = model->hparams;
+    uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
+    uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
+    return (ne_k + ne_v) * llama_model_n_layers(model);
+}
+
 void llama_model_n_flops(
         struct llama_model * model,
         struct llama_model_loader * ml,
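When K and V share one cache type, the returned value reduces to (n_embd_k_gqa + n_embd_v_gqa) * n_ctx * type_size * n_layers. A quick worked check with assumed hyperparameters (1024-wide K/V projections after GQA, 4096-token context, f16 cache, 32 layers); these numbers are illustrative, not tied to a specific model:

#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t n_embd_kv = 1024;  // n_embd_k_gqa == n_embd_v_gqa (assumed)
    const uint64_t n_ctx     = 4096;  // context length
    const uint64_t type_size = 2;     // f16 bytes per element
    const uint64_t n_layers  = 32;    // transformer layers
    const uint64_t kv_bytes  = (n_embd_kv + n_embd_kv) * n_ctx * type_size * n_layers;
    // 2048 * 4096 * 2 * 32 = 536,870,912 bytes, roughly 0.54 GB reserved from RAM.
    std::printf("KV cache: %llu bytes (~%.2f GB)\n",
                (unsigned long long) kv_bytes, kv_bytes / 1e9);
    return 0;
}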