mirror of https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-06 04:59:05 +00:00

fix flops count and ram/vram speed test

This commit is contained in:
parent 26c2ffb5b7
commit df813675d0

5 changed files with 136 additions and 83 deletions
@@ -896,7 +896,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

    device_info dev_info;
    dev_info.rank = params.rank;
-   llama_profile_device(&dev_info, model, ml, params.n_predict, params.cpuparams.n_threads);
+   llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);

    // create llama context
    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);

@@ -1133,6 +1133,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    std::strcpy(cparams.next_node_ip, params.next_node_ip.c_str());

    cparams.n_ctx = params.n_ctx;
+   cparams.n_predict = params.n_predict;
    cparams.n_seq_max = params.n_parallel;
    cparams.n_batch = params.n_batch;
    cparams.n_ubatch = params.n_ubatch;

@@ -97,8 +97,9 @@ uint32_t device_cpu_cores() {
 }

 static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
-    const int n_repeat = 1;
-    const int n_embd = std::min(llama_n_embd(model), 4096);
+    int n_repeat = 1;
+    int n_embd = std::min(llama_n_embd(model), 4096);
+    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
     std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);


@@ -142,12 +143,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     struct ggml_cgraph * gf = NULL;
     struct ggml_context * ctx_cgraph = NULL;
     struct ggml_tensor * cur = NULL;
-    struct ggml_tensor * cur1 = NULL;
-    struct ggml_tensor * cur2 = NULL;
-    struct ggml_tensor * cur3 = NULL;
     {
         struct ggml_init_params params0 = {
-            /*.mem_size =*/ ggml_tensor_overhead() * (5 * n_repeat + 1) + ggml_graph_overhead(),
+            /*.mem_size =*/ ggml_tensor_overhead() * (n_repeat + 2) + ggml_graph_overhead(),
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
         };

@@ -155,12 +153,8 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum

         gf = ggml_new_graph(ctx_cgraph);
         cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
-        for (int i = 0; i < n_repeat; i++) {
-            cur1 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur2 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur = ggml_add(ctx_cgraph, cur1, cur2);
-            cur3 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur = ggml_add(ctx_cgraph, cur, cur3);
+        for (int i = 0; i < n_repeat - 1; i++) {
+            cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
         }
         ggml_build_forward_expand(gf, cur);
     }

@@ -204,15 +198,14 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     ggml_backend_sched_alloc_graph(sched, gf);

     // warm-up
-    // ggml_backend_graph_compute(backend, gf);
+    ggml_backend_graph_compute(backend, gf);

     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
     const int64_t t_end = ggml_time_us();

     double elapsed_seconds = ((double)t_end - (double)t_start) / 1e6; // convert to seconds
-    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd +
-                    n_repeat * 4 * 2.0 * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS
+    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd * n_repeat) / elapsed_seconds / 1e9; // convert to GFLOPS

     ggml_free(ctx_cgraph);
     ggml_gallocr_free(allocr);
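Note (not part of the commit): after this change the benchmark graph is just n_repeat chained square mat-muls of size n_embd, so the estimate reduces to 2 * n_embd^3 * n_repeat FLOPs divided by the measured time. A minimal standalone sketch of that arithmetic, with made-up numbers:

// Standalone sketch (not from this commit): estimate GFLOPS the same way the
// updated device_flops() does, assuming n_repeat chained (n_embd x n_embd) mat-muls.
#include <cstdint>
#include <cstdio>

// hypothetical helper: total FLOPs of n_repeat chained (n_embd x n_embd) mat-muls
static double matmul_chain_flops(int n_embd, int n_repeat) {
    return 2.0 * (double)n_embd * n_embd * n_embd * n_repeat;
}

int main() {
    const int     n_embd     = 4096;   // assumed benchmark size (GPU path)
    const int     n_repeat   = 1;
    const int64_t elapsed_us = 25000;  // made-up timing, replace with a real measurement

    double elapsed_s = elapsed_us / 1e6;
    double gflops    = matmul_chain_flops(n_embd, n_repeat) / elapsed_s / 1e9;
    printf("estimated throughput: %.2f GFLOPS\n", gflops);
    return 0;
}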
@@ -933,8 +926,8 @@ float device_memory_bw(int n_thread) {
     return static_cast<float>(bandwidth);
 }

-static float device_read_vram_bw(struct llama_model * model, enum profiler_backend_type btype) {
-    const int n_embd = std::min(llama_n_embd(model) * 2, 4096 * 2);
+static float device_read_vram_bw(enum profiler_backend_type btype) {
+    const int n_embd = 8192;
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);

     ggml_backend_t backend = NULL;

@@ -1006,21 +999,19 @@ static float device_read_vram_bw(struct llama_model * model, enum profiler_backe
     return bandwidth;
 }

-float device_metal_read_vram_bw(struct llama_model * model) {
+float device_metal_read_vram_bw() {
 #ifdef GGML_USE_METAL
-    return device_read_vram_bw(model, PROFILER_BACKEND_TYPE_METAL);
+    return device_read_vram_bw(PROFILER_BACKEND_TYPE_METAL);
 #endif

-    (void)model;
     return 0.0f;
 }

-float device_cuda_read_vram_bw(struct llama_model * model) {
+float device_cuda_read_vram_bw() {
 #ifdef GGML_USE_CUDA
-    return device_read_vram_bw(model, PROFILER_BACKEND_TYPE_CUDA);
+    return device_read_vram_bw(PROFILER_BACKEND_TYPE_CUDA);
 #endif

-    (void)model;
     return 0.0f;
 }

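Note (not part of the commit): the VRAM read test now always streams a fixed 8192 x 8192 f32 matrix (8192^2 * 4 bytes, about 256 MiB) instead of a model-dependent size, so the bandwidth estimate is simply bytes read divided by elapsed time. A hedged standalone sketch with a made-up timing:

// Standalone sketch (not from this commit): the bandwidth arithmetic implied by the
// fixed 8192 x 8192 f32 test matrix. The timing value here is made up.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd     = 8192;
    const int64_t bytes_read = n_embd * n_embd * (int64_t)sizeof(float); // ~256 MiB per pass
    const double  elapsed_s  = 0.0008; // hypothetical measured read time

    double gb_per_s = (double)bytes_read / elapsed_s / 1e9;
    printf("read %lld bytes -> %.1f GB/s\n", (long long)bytes_read, gb_per_s);
    return 0;
}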
@@ -1124,7 +1115,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 }

 // estimate the memory access delay, except for the input embedding because it has been considered in n_flops.inp_embd_ms
-static float device_memory_access_delay(struct device_info & dev_info, const struct llama_context_params cparams, int n_layers) {
+static float device_memory_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams, int n_layers) {
     struct model_params n_params = dev_info.model_params;
     int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);


@@ -1144,9 +1135,14 @@ static float device_memory_access_delay(struct device_info & dev_info, const str
                            n_params.output_q6k * 6 / 8 +
                            n_params.output_q80;

-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL)
-    int64_t vram_bytes = layer_bytes * n_gpu_layers;
-    int64_t ram_bytes = layer_bytes * (n_layers - n_gpu_layers) + output_bytes;
+    uint64_t cpu_kv_size;
+    uint64_t gpu_kv_size;
+
+#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
+    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);

+    int64_t vram_bytes = layer_bytes * n_gpu_layers + gpu_kv_size;
+    int64_t ram_bytes = layer_bytes * (n_layers - n_gpu_layers) + output_bytes + cpu_kv_size;
+
 #ifdef GGML_USE_CUDA
     double vram_access_delay = (double)(vram_bytes) / 1e6 / dev_info.gpu_props.cuda_read_vram_bw;

@@ -1158,8 +1154,11 @@ static float device_memory_access_delay(struct device_info & dev_info, const str
     return static_cast<float>(vram_access_delay + ram_access_delay); // ms

 #else
+    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+
     (void)n_gpu_layers;
-    int64_t ram_bytes = layer_bytes * n_layers + output_bytes;
+    (void)gpu_kv_size;
+    int64_t ram_bytes = layer_bytes * n_layers + output_bytes + cpu_kv_size;
     double ram_access_delay = (double)(ram_bytes) / 1e6 / dev_info.memory.cpu_read_ram_bw;
     return static_cast<float>(ram_access_delay); // ms
 #endif
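Note (not part of the commit): the delays above come out in milliseconds because bytes / 1e9 divided by a bandwidth in GB/s gives seconds, so bytes / 1e6 gives ms; the new code simply adds the KV-cache bytes from llama_kv_size() to the weight bytes before applying that formula. A small standalone sketch with placeholder values:

// Standalone sketch (not from this commit): why bytes / 1e6 / (GB/s) yields milliseconds,
// including a hypothetical KV-cache contribution on the CPU side.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t layer_bytes = 200 * 1000 * 1000; // hypothetical bytes per layer
    const int64_t kv_bytes    = 50 * 1000 * 1000;  // hypothetical KV-cache bytes in RAM
    const int     n_layers    = 32;
    const double  ram_bw_gbps = 40.0;              // assumed CPU read bandwidth (GB/s)

    int64_t ram_bytes = layer_bytes * (int64_t)n_layers + kv_bytes;
    // bytes / 1e9 / (GB/s) = seconds, so bytes / 1e6 / (GB/s) = milliseconds
    double ram_access_delay_ms = (double)ram_bytes / 1e6 / ram_bw_gbps;
    printf("estimated RAM access delay: %.2f ms\n", ram_access_delay_ms);
    return 0;
}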
@@ -1191,7 +1190,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam

 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     cpu_total_bytes += layer_bytes * (n_layers - n_gpu_layers);
+#if defined(GGML_USE_METAL)
     int64_t gpu_total_bytes = layer_bytes * n_gpu_layers;
+#endif
 #else
     (void)n_gpu_layers;
     cpu_total_bytes += layer_bytes * n_layers;

@@ -1211,10 +1212,10 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     uint64_t gpu_compute_buf;

 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
-    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
+    llama_total_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
     llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true);
 #else
-    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+    llama_total_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
     llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false);
 #endif


@@ -1652,7 +1653,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     float latency = 0.0f;
     int n_layers = llama_model_n_layers (model);
     latency += device_compute_delay (dev_info_set[0], n_layers,cparams);
-    latency += device_memory_access_delay(dev_info_set[0], cparams, n_layers);
+    latency += device_memory_access_delay(dev_info_set[0], model, cparams, n_layers);
     latency += device_disk_access_delay (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later

     LOG_INF("| Token latency (ms) ");

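Note (not part of the commit): device_print_props() reports token latency as the sum of the three per-token estimates above, all in milliseconds. A trivial standalone sketch of that accumulation with placeholder values:

// Standalone sketch (not from this commit): the printed token latency is the sum of the
// compute, memory-access, and disk-access estimates. All numbers below are placeholders.
#include <cstdio>

int main() {
    float compute_delay_ms = 35.0f; // stand-in for device_compute_delay()
    float memory_delay_ms  = 12.0f; // stand-in for device_memory_access_delay()
    float disk_delay_ms    =  0.0f; // stand-in for device_disk_access_delay()

    float latency = 0.0f;
    latency += compute_delay_ms;
    latency += memory_delay_ms;
    latency += disk_delay_ms;
    printf("| Token latency (ms) | %.2f |\n", latency);
    return 0;
}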
@@ -241,8 +241,8 @@ uint64_t device_swap_memory (bool available);
 void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw, int n_threads);
 void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
 float device_memory_bw (int n_thread);
-float device_metal_read_vram_bw(struct llama_model * model);
-float device_cuda_read_vram_bw (struct llama_model * model);
+float device_metal_read_vram_bw();
+float device_cuda_read_vram_bw ();
 void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
 void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);

@@ -325,6 +325,7 @@ extern "C" {
        char * master_ip; // ip address of the master node
        char * next_node_ip; // ip address of the next node
        uint32_t n_ctx; // text context, 0 = from model
+       uint32_t n_predict; // number of tokens to predict
        uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
        uint32_t n_ubatch; // physical maximum batch size
        uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)

@@ -416,7 +417,9 @@ extern "C" {
            struct llama_model * model,
            struct llama_model_loader * ml,
            int n_predict,
-           int n_threads);
+           int n_ctx,
+           int n_threads,
+           bool flash_attn);

    LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);


@@ -534,7 +537,14 @@ extern "C" {
            bool use_gpu);

    // Return the size of KV cache in the model
-   LLAMA_API void llama_model_kvcache_size(
+   LLAMA_API void llama_total_kv_size(
+           uint64_t * cpu_cache,
+           uint64_t * gpu_cache,
+           const struct llama_model * model,
+           const struct llama_context_params cparams,
+           bool use_gpu);
+
+   LLAMA_API void llama_kv_size(
            uint64_t * cpu_cache,
            uint64_t * gpu_cache,
            const struct llama_model * model,

@@ -547,9 +557,10 @@ extern "C" {
            struct llama_model_loader * ml,
            struct model_flops * n_flops,
            struct model_params * n_params,
-           const int64_t n_input,
            const int64_t n_history,
-           enum ggml_type * inp_embd_dtype);
+           const int64_t n_ctx,
+           enum ggml_type * inp_embd_dtype,
+           bool flash_attn);

    // Get a llama model tensor
    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);

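Note (not part of the commit): a hedged usage sketch of the two declarations above. Per the implementation later in this diff, llama_total_kv_size() sizes the KV cache at the full configured n_ctx, while llama_kv_size() sizes it from the padded n_predict history clamped to n_ctx. The helper name report_kv_budget and the surrounding setup are hypothetical; the sketch assumes a model loaded elsewhere and a filled llama_context_params.

// Hedged usage sketch (not from this commit); assumes `model` was loaded elsewhere and
// `cparams` already carries n_ctx, n_predict, type_k/type_v, n_gpu_layers, flash_attn, ...
#include "llama.h"
#include <cstdio>

static void report_kv_budget(struct llama_model * model, struct llama_context_params cparams, bool use_gpu) {
    uint64_t cpu_total = 0, gpu_total = 0;
    uint64_t cpu_used  = 0, gpu_used  = 0;

    // full-capacity KV cache at cparams.n_ctx
    llama_total_kv_size(&cpu_total, &gpu_total, model, cparams, use_gpu);
    // KV cache sized from the padded n_predict history (clamped to n_ctx)
    llama_kv_size(&cpu_used, &gpu_used, model, cparams, use_gpu);

    printf("KV cache bytes: total cpu=%llu gpu=%llu, used cpu=%llu gpu=%llu\n",
           (unsigned long long)cpu_total, (unsigned long long)gpu_total,
           (unsigned long long)cpu_used,  (unsigned long long)gpu_used);
}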
116 src/llama.cpp

@@ -3570,7 +3570,14 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
     }
 }

-void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_predict, int n_threads) {
+void llama_profile_device(
+        device_info * dev_info,
+        struct llama_model * model,
+        llama_model_loader * ml,
+        int n_predict,
+        int n_ctx,
+        int n_threads,
+        bool flash_attn) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();


@@ -3584,7 +3591,7 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     struct model_params * n_params = &dev_info->model_params;
     if (dev_info->rank == 0) {
         enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
-        llama_model_n_flops(model, ml, n_flops, n_params, 1, n_predict, &inp_embd_dtype);
+        llama_model_n_flops(model, ml, n_flops, n_params, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
         n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
     }


@@ -3611,8 +3618,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.description = gpu_props.description;
     dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw(model);
-    dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw(model);
+    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
+    dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();

     if (is_dtype_exist(n_params, GGML_TYPE_F32)) {
         dev_info->cpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);

@@ -19669,6 +19676,7 @@ struct llama_context_params llama_context_default_params() {
         /*.master_ip =*/ nullptr,
         /*.next_node_ip =*/ nullptr,
         /*.n_ctx =*/ 512,
+        /*.n_predict =*/ 512,
         /*.n_batch =*/ 2048,
         /*.n_ubatch =*/ 512,
         /*.n_seq_max =*/ 1,

@@ -20910,22 +20918,49 @@ void llama_model_compute_buf_size(
     }
 }

-void llama_model_kvcache_size(
+void llama_total_kv_size(
         uint64_t * cpu_cache,
         uint64_t * gpu_cache,
         const struct llama_model * model,
         const struct llama_context_params cparams,
         bool use_gpu) {
     const llama_hparams hparams = model->hparams;
-    uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
-    uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
+    uint64_t nb_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
+    uint64_t nb_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
     if (use_gpu) {
         int n_gpu_layers = std::min(cparams.n_gpu_layers, hparams.n_layer);
-        *gpu_cache = (ne_k + ne_v) * n_gpu_layers;
-        *cpu_cache = (ne_k + ne_v) * (llama_model_n_layers(model) - n_gpu_layers);
+        *gpu_cache = (nb_k + nb_v) * n_gpu_layers;
+        *cpu_cache = (nb_k + nb_v) * (llama_model_n_layers(model) - n_gpu_layers);
     } else {
         *gpu_cache = 0;
-        *cpu_cache = (ne_k + ne_v) * llama_model_n_layers(model);
+        *cpu_cache = (nb_k + nb_v) * llama_model_n_layers(model);
+    }
+}
+
+void llama_kv_size(
+        uint64_t * cpu_cache,
+        uint64_t * gpu_cache,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
+    const llama_hparams hparams = model->hparams;
+    const int64_t n_layer = llama_model_n_layers(model);
+    const int64_t n_ctx = cparams.n_ctx;
+    const int64_t n_history = cparams.n_predict;
+    const int64_t n_pad = cparams.flash_attn ? 256u : 32u;
+    const int64_t n_kv = std::min(n_ctx, std::max(n_pad, GGML_PAD(n_history, n_pad)));
+    const int64_t n_embd_k_gqa = static_cast<int64_t>(hparams.n_embd_k_gqa());
+    const int64_t n_embd_v_gqa = static_cast<int64_t>(hparams.n_embd_v_gqa());
+
+    const int64_t nb_k = n_embd_k_gqa * n_kv * ggml_type_size(cparams.type_k);
+    const int64_t nb_v = n_embd_v_gqa * n_kv * ggml_type_size(cparams.type_v);
+    if (use_gpu) {
+        const int64_t n_gpu_layers = std::min(n_layer, static_cast<int64_t>(cparams.n_gpu_layers));
+        *gpu_cache = (nb_k + nb_v) * n_gpu_layers;
+        *cpu_cache = (nb_k + nb_v) * (n_layer - n_gpu_layers);
+    } else {
+        *gpu_cache = 0;
+        *cpu_cache = (nb_k + nb_v) * n_layer;
     }
 }

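Note (not part of the commit): a standalone sketch of the llama_kv_size() arithmetic above with made-up hyperparameters; pad_to() stands in for GGML_PAD.

// Standalone sketch (not from this commit): per-layer KV-cache sizing as used above,
// with made-up hyperparameters. GGML_PAD is reimplemented locally for the example.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static int64_t pad_to(int64_t x, int64_t n) { return (x + n - 1) / n * n; } // same idea as GGML_PAD

int main() {
    // hypothetical model / context settings
    const int64_t n_layer      = 32;
    const int64_t n_embd_k_gqa = 1024;  // per-layer K width after GQA
    const int64_t n_embd_v_gqa = 1024;  // per-layer V width after GQA
    const int64_t type_size    = 2;     // e.g. f16 K/V cache
    const int64_t n_ctx        = 4096;
    const int64_t n_history    = 512;   // n_predict used as the history length
    const bool    flash_attn   = false;

    const int64_t n_pad = flash_attn ? 256 : 32;
    const int64_t n_kv  = std::min(n_ctx, std::max(n_pad, pad_to(n_history, n_pad)));

    const int64_t nb_k  = n_embd_k_gqa * n_kv * type_size;
    const int64_t nb_v  = n_embd_v_gqa * n_kv * type_size;
    const int64_t total = (nb_k + nb_v) * n_layer;

    printf("n_kv = %lld, KV cache ~ %.1f MiB across %lld layers\n",
           (long long)n_kv, total / (1024.0 * 1024.0), (long long)n_layer);
    return 0;
}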
@@ -20934,19 +20969,22 @@ void llama_model_n_flops(
         struct llama_model_loader * ml,
         struct model_flops * n_flops,
         struct model_params * n_params,
-        const int64_t n_input,
         const int64_t n_history,
-        enum ggml_type * inp_embd_dtype) {
+        const int64_t n_ctx,
+        enum ggml_type * inp_embd_dtype,
+        bool flash_attn) {
     const llama_hparams hparams = model->hparams;
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_vocab = hparams.n_vocab;
     const int64_t n_embd = hparams.n_embd;
     const int64_t n_head = hparams.n_head();
+    const int64_t n_head_kv = hparams.n_head_kv();
     const int64_t n_ff = hparams.n_ff();
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_expert = hparams.n_expert;
+    const int64_t n_pad = flash_attn ? 256u : 32u;
+    const int64_t n_kv = std::min(n_ctx, std::max(n_pad, GGML_PAD(n_history, n_pad)));

     // assign all the tensors on CPU by default
     model->buft_input = llama_default_buffer_type_cpu(*model, true);

@@ -21045,64 +21083,66 @@ void llama_model_n_flops(
                 break;
             }
             case 2: { // "output_norm.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1); // rms_norm
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd); // norm weights
                 count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                 break;
             }
             case 3: { // "output.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
                 count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                 break;
             }
             case 4: // "blk.0.attn_norm.weight"
             case 12: // "blk.0.ffn_norm.weight"
             {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 5: { // "blk.0.attn_q.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 6: { // "blk.0.attn_k.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
-                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_k);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd_head_k * n_head_kv); // rope
+                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_k * n_head * n_kv * n_head_kv); // compute kq with kvcache
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_head * n_kv); // scale, mask, and softmax
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 7: { // "blk.0.attn_v.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
-                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_v);
+                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_v * n_head * n_kv * n_head_kv); // compute kqv with kvcache
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 8: { // "blk.0.attn_output.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 9: { // "blk.0.ffn_gate.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 10: { // "blk.0.ffn_down.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 11: { // "blk.0.ffn_up.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
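Note (not part of the commit): a standalone sketch that evaluates the per-token attention FLOP terms counted in cases 5-8 above for hypothetical GQA dimensions; it mirrors the count_n_flops expressions in the diff.

// Standalone sketch (not from this commit): per-token, per-layer attention FLOP terms as
// counted in cases 5-8 above, evaluated for hypothetical GQA dimensions.
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical hyperparameters
    const int64_t n_embd        = 4096;
    const int64_t n_head        = 32;
    const int64_t n_head_kv     = 8;
    const int64_t n_embd_head_k = 128;
    const int64_t n_embd_head_v = 128;
    const int64_t n_kv          = 512; // padded history length

    const double q_proj   = 2.0 * n_embd * n_embd;                           // case 5
    const double k_proj   = 2.0 * n_embd * n_head_kv * n_embd_head_k;        // case 6
    const double v_proj   = 2.0 * n_embd * n_head_kv * n_embd_head_v;        // case 7
    const double kq       = 2.0 * n_embd_head_k * n_head * n_kv * n_head_kv; // case 6, vs. KV cache
    const double softmax  = 7.0 * n_head * n_kv;                             // case 6, scale + mask + softmax
    const double kqv      = 2.0 * n_embd_head_v * n_head * n_kv * n_head_kv; // case 7, vs. KV cache
    const double out_proj = 2.0 * n_embd * n_embd;                           // case 8

    const double total = q_proj + k_proj + v_proj + kq + softmax + kqv + out_proj;
    printf("approx. attention FLOPs per token per layer: %.3f GFLOP\n", total / 1e9);
    return 0;
}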
@@ -21117,20 +21157,20 @@ void llama_model_n_flops(
             case 17: // "blk.0.attn_output.bias"
             case 19: // "blk.0.ffn_down.bias"
             {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 18: // "blk.0.ffn_gate.bias"
             case 20: // "blk.0.ffn_up.bias"
             {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_ff);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             // optional: expert tensors
             case 21: { // "blk.0.ffn_gate_inp.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_expert);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }

@@ -21138,7 +21178,7 @@ void llama_model_n_flops(
             case 23: // "blk.0.ffn_down_exps.weight"
             case 24: // "blk.0.ffn_up_exps.weight"
             {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff * n_expert);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }