diff --git a/common/common.cpp b/common/common.cpp
index 9c373a9a..7e09fad2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -896,7 +896,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, ml, params.n_predict, params.cpuparams.n_threads);
+    llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
@@ -1133,6 +1133,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     std::strcpy(cparams.next_node_ip, params.next_node_ip.c_str());
 
     cparams.n_ctx             = params.n_ctx;
+    cparams.n_predict         = params.n_predict;
     cparams.n_seq_max         = params.n_parallel;
     cparams.n_batch           = params.n_batch;
     cparams.n_ubatch          = params.n_ubatch;
diff --git a/common/profiler.cpp b/common/profiler.cpp
index c75e0249..0e041ad3 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -97,8 +97,9 @@ uint32_t device_cpu_cores() {
 }
 
 static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
-    const int n_repeat = 1;
-    const int n_embd   = std::min(llama_n_embd(model), 4096);
+    int n_repeat = 1;
+    int n_embd   = std::min(llama_n_embd(model), 4096);
+    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu
 
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
     std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);
@@ -142,12 +143,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     struct ggml_cgraph  * gf         = NULL;
     struct ggml_context * ctx_cgraph = NULL;
     struct ggml_tensor  * cur        = NULL;
-    struct ggml_tensor  * cur1       = NULL;
-    struct ggml_tensor  * cur2       = NULL;
-    struct ggml_tensor  * cur3       = NULL;
     {
         struct ggml_init_params params0 = {
-            /*.mem_size   =*/ ggml_tensor_overhead() * (5 * n_repeat + 1) + ggml_graph_overhead(),
+            /*.mem_size   =*/ ggml_tensor_overhead() * (n_repeat + 2) + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
        };
@@ -155,12 +153,8 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
 
         gf  = ggml_new_graph(ctx_cgraph);
         cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
-        for (int i = 0; i < n_repeat; i++) {
-            cur1 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur2 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur  = ggml_add(ctx_cgraph, cur1, cur2);
-            cur3 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur  = ggml_add(ctx_cgraph, cur, cur3);
+        for (int i = 0; i < n_repeat - 1; i++) {
+            cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
         }
         ggml_build_forward_expand(gf, cur);
     }
@@ -204,15 +198,14 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     ggml_backend_sched_alloc_graph(sched, gf);
 
     // warm-up
-    // ggml_backend_graph_compute(backend, gf);
+    ggml_backend_graph_compute(backend, gf);
 
     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
     const int64_t t_end = ggml_time_us();
 
     double elapsed_seconds = ((double)t_end - (double)t_start) / 1e6; // convert to seconds
-    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd +
-                    n_repeat * 4 * 2.0 * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS
+    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd * n_repeat) / elapsed_seconds / 1e9; // convert to GFLOPS
 
     ggml_free(ctx_cgraph);
     ggml_gallocr_free(allocr);
@@ -933,8 +926,8 @@ float device_memory_bw(int n_thread) {
     return static_cast<float>(bandwidth);
 }
 
-static float device_read_vram_bw(struct llama_model * model, enum profiler_backend_type btype) {
-    const int n_embd = std::min(llama_n_embd(model) * 2, 4096 * 2);
+static float device_read_vram_bw(enum profiler_backend_type btype) {
+    const int n_embd = 8192;
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
 
     ggml_backend_t backend = NULL;
@@ -1006,21 +999,19 @@ static float device_read_vram_bw(struct llama_model * model, enum profiler_backe
     return bandwidth;
 }
 
-float device_metal_read_vram_bw(struct llama_model * model) {
+float device_metal_read_vram_bw() {
 #ifdef GGML_USE_METAL
-    return device_read_vram_bw(model, PROFILER_BACKEND_TYPE_METAL);
+    return device_read_vram_bw(PROFILER_BACKEND_TYPE_METAL);
 #endif
 
-    (void)model;
     return 0.0f;
 }
 
-float device_cuda_read_vram_bw(struct llama_model * model) {
+float device_cuda_read_vram_bw() {
 #ifdef GGML_USE_CUDA
-    return device_read_vram_bw(model, PROFILER_BACKEND_TYPE_CUDA);
+    return device_read_vram_bw(PROFILER_BACKEND_TYPE_CUDA);
 #endif
 
-    (void)model;
     return 0.0f;
 }
 
@@ -1124,7 +1115,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 }
 
 // estimate the memory access delay, except for the input embedding because it has been considered in n_flops.inp_embd_ms
-static float device_memory_access_delay(struct device_info & dev_info, const struct llama_context_params cparams, int n_layers) {
+static float device_memory_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams, int n_layers) {
     struct model_params n_params = dev_info.model_params;
     int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
 
@@ -1143,10 +1134,15 @@ static float device_memory_access_delay(struct device_info & dev_info, const str
                            n_params.output_q5k * 5 / 8 +
                            n_params.output_q6k * 6 / 8 +
                            n_params.output_q80;
+
+    uint64_t cpu_kv_size;
+    uint64_t gpu_kv_size;
 
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL)
-    int64_t vram_bytes = layer_bytes * n_gpu_layers;
-    int64_t ram_bytes  = layer_bytes * (n_layers - n_gpu_layers) + output_bytes;
+#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
+    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
+
+    int64_t vram_bytes = layer_bytes * n_gpu_layers + gpu_kv_size;
+    int64_t ram_bytes  = layer_bytes * (n_layers - n_gpu_layers) + output_bytes + cpu_kv_size;
 
 #ifdef GGML_USE_CUDA
     double vram_access_delay = (double)(vram_bytes) / 1e6 / dev_info.gpu_props.cuda_read_vram_bw;
@@ -1158,8 +1154,11 @@ static float device_memory_access_delay(struct device_info & dev_info, const str
 
     return static_cast<float>(vram_access_delay + ram_access_delay); // ms
 #else
+    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+
     (void)n_gpu_layers;
-    int64_t ram_bytes = layer_bytes * n_layers + output_bytes;
+    (void)gpu_kv_size;
+    int64_t ram_bytes = layer_bytes * n_layers + output_bytes + cpu_kv_size;
     double ram_access_delay = (double)(ram_bytes) / 1e6 / dev_info.memory.cpu_read_ram_bw;
     return static_cast<float>(ram_access_delay); // ms
 #endif
@@ -1191,7 +1190,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     cpu_total_bytes += layer_bytes * (n_layers - n_gpu_layers);
+#if defined(GGML_USE_METAL)
     int64_t gpu_total_bytes = layer_bytes * n_gpu_layers;
+#endif
 #else
     (void)n_gpu_layers;
     cpu_total_bytes += layer_bytes * n_layers;
@@ -1211,10 +1212,10 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     uint64_t gpu_compute_buf;
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
-    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
+    llama_total_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
     llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true);
 #else
-    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+    llama_total_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
     llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false);
 #endif
 
@@ -1651,9 +1652,9 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     // todo: calculate for each device, not only master
     float latency = 0.0f;
    int n_layers = llama_model_n_layers (model);
-    latency += device_compute_delay      (dev_info_set[0], n_layers, cparams);
-    latency += device_memory_access_delay(dev_info_set[0], cparams, n_layers);
-    latency += device_disk_access_delay  (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later
+    latency += device_compute_delay      (dev_info_set[0], n_layers, cparams);
+    latency += device_memory_access_delay(dev_info_set[0], model, cparams, n_layers);
+    latency += device_disk_access_delay  (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later
 
     LOG_INF("| Token latency (ms)         ");
     LOG_INF("| %-10.2f   ", latency);
diff --git a/common/profiler.h b/common/profiler.h
index 18176cc4..39d37ce8 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -241,8 +241,8 @@ uint64_t device_swap_memory     (bool available);
 void     device_disk_seq_bw     (float * read_seq_bw, float * write_seq_bw, int n_threads);
 void     device_disk_rnd_bw     (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
 float    device_memory_bw       (int n_thread);
-float    device_metal_read_vram_bw(struct llama_model * model);
-float    device_cuda_read_vram_bw (struct llama_model * model);
+float    device_metal_read_vram_bw();
+float    device_cuda_read_vram_bw ();
 void     device_get_props       (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
 void     device_print_props     (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);
 
diff --git a/include/llama.h b/include/llama.h
index 886f696d..b68cc269 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -325,6 +325,7 @@ extern "C" {
         char *   master_ip;    // ip address of the master node
         char *   next_node_ip; // ip address of the next node
         uint32_t n_ctx;        // text context, 0 = from model
+        uint32_t n_predict;    // number of tokens to predict
         uint32_t n_batch;      // logical maximum batch size that can be submitted to llama_decode
         uint32_t n_ubatch;     // physical maximum batch size
         uint32_t n_seq_max;    // max number of sequences (i.e. distinct states for recurrent models)
@@ -412,11 +413,13 @@ extern "C" {
     LLAMA_API void llama_backend_init(void);
 
     LLAMA_API void llama_profile_device(
-            struct device_info * dev_info,
-            struct llama_model * model,
+            struct device_info        * dev_info,
+            struct llama_model        * model,
             struct llama_model_loader * ml,
-            int n_predict,
-            int n_threads);
+            int    n_predict,
+            int    n_ctx,
+            int    n_threads,
+            bool   flash_attn);
 
     LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
 
@@ -534,12 +537,19 @@ extern "C" {
             bool use_gpu);
 
     // Return the size of KV cache in the model
-    LLAMA_API void llama_model_kvcache_size(
+    LLAMA_API void llama_total_kv_size(
             uint64_t * cpu_cache,
             uint64_t * gpu_cache,
             const struct llama_model * model,
             const struct llama_context_params cparams,
             bool use_gpu);
+
+    LLAMA_API void llama_kv_size(
+            uint64_t * cpu_cache,
+            uint64_t * gpu_cache,
+            const struct llama_model * model,
+            const struct llama_context_params cparams,
+            bool use_gpu);
 
     // Return the total number of float operations in the model
     LLAMA_API void llama_model_n_flops(
@@ -547,9 +557,10 @@ extern "C" {
             struct llama_model_loader * ml,
             struct model_flops * n_flops,
             struct model_params * n_params,
-            const int64_t n_input,
             const int64_t n_history,
-            enum ggml_type * inp_embd_dtype);
+            const int64_t n_ctx,
+            enum ggml_type * inp_embd_dtype,
+            bool flash_attn);
 
     // Get a llama model tensor
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
diff --git a/src/llama.cpp b/src/llama.cpp
index f5f9889f..1696cd07 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3570,7 +3570,14 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype) {
     }
 }
 
-void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_predict, int n_threads) {
+void llama_profile_device(
+        device_info * dev_info,
+        struct llama_model * model,
+        llama_model_loader * ml,
+        int n_predict,
+        int n_ctx,
+        int n_threads,
+        bool flash_attn) {
     dev_info->device_name     = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
 
@@ -3584,7 +3591,7 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     struct model_params * n_params = &dev_info->model_params;
     if (dev_info->rank == 0) {
         enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
-        llama_model_n_flops(model, ml, n_flops, n_params, 1, n_predict, &inp_embd_dtype);
+        llama_model_n_flops(model, ml, n_flops, n_params, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
         n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
     }
 
@@ -3611,8 +3618,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.description        = gpu_props.description;
     dev_info->gpu_props.memory_free        = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total       = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw(model);
-    dev_info->gpu_props.cuda_read_vram_bw  = device_cuda_read_vram_bw(model);
+    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
+    dev_info->gpu_props.cuda_read_vram_bw  = device_cuda_read_vram_bw();
 
     if (is_dtype_exist(n_params, GGML_TYPE_F32)) {
         dev_info->cpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
@@ -19669,6 +19676,7 @@ struct llama_context_params llama_context_default_params() {
         /*.master_ip                   =*/ nullptr,
         /*.next_node_ip                =*/ nullptr,
         /*.n_ctx                       =*/ 512,
+        /*.n_predict                   =*/ 512,
         /*.n_batch                     =*/ 2048,
         /*.n_ubatch                    =*/ 512,
         /*.n_seq_max                   =*/ 1,
@@ -20910,22 +20918,49 @@ void llama_model_compute_buf_size(
     }
 }
 
-void llama_model_kvcache_size(
+void llama_total_kv_size(
         uint64_t * cpu_cache,
         uint64_t * gpu_cache,
         const struct llama_model * model,
         const struct llama_context_params cparams,
         bool use_gpu) {
     const llama_hparams hparams = model->hparams;
-    uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
-    uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
+    uint64_t nb_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
+    uint64_t nb_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
 
     if (use_gpu) {
         int n_gpu_layers = std::min(cparams.n_gpu_layers, hparams.n_layer);
-        *gpu_cache = (ne_k + ne_v) * n_gpu_layers;
-        *cpu_cache = (ne_k + ne_v) * (llama_model_n_layers(model) - n_gpu_layers);
+        *gpu_cache = (nb_k + nb_v) * n_gpu_layers;
+        *cpu_cache = (nb_k + nb_v) * (llama_model_n_layers(model) - n_gpu_layers);
     } else {
         *gpu_cache = 0;
-        *cpu_cache = (ne_k + ne_v) * llama_model_n_layers(model);
+        *cpu_cache = (nb_k + nb_v) * llama_model_n_layers(model);
+    }
+}
+
+void llama_kv_size(
+        uint64_t * cpu_cache,
+        uint64_t * gpu_cache,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
+    const llama_hparams hparams = model->hparams;
+    const int64_t n_layer      = llama_model_n_layers(model);
+    const int64_t n_ctx        = cparams.n_ctx;
+    const int64_t n_history    = cparams.n_predict;
+    const int64_t n_pad        = cparams.flash_attn ? 256u : 32u;
+    const int64_t n_kv         = std::min(n_ctx, std::max(n_pad, GGML_PAD(n_history, n_pad)));
+    const int64_t n_embd_k_gqa = static_cast<int64_t>(hparams.n_embd_k_gqa());
+    const int64_t n_embd_v_gqa = static_cast<int64_t>(hparams.n_embd_v_gqa());
+
+    const int64_t nb_k = n_embd_k_gqa * n_kv * ggml_type_size(cparams.type_k);
+    const int64_t nb_v = n_embd_v_gqa * n_kv * ggml_type_size(cparams.type_v);
+    if (use_gpu) {
+        const int64_t n_gpu_layers = std::min(n_layer, static_cast<int64_t>(cparams.n_gpu_layers));
+        *gpu_cache = (nb_k + nb_v) * n_gpu_layers;
+        *cpu_cache = (nb_k + nb_v) * (n_layer - n_gpu_layers);
+    } else {
+        *gpu_cache = 0;
+        *cpu_cache = (nb_k + nb_v) * n_layer;
     }
 }
 
@@ -20934,20 +20969,23 @@ void llama_model_n_flops(
         struct llama_model_loader * ml,
         struct model_flops * n_flops,
         struct model_params * n_params,
-        const int64_t n_input,
-        const int64_t n_history,
-        enum ggml_type * inp_embd_dtype) {
+        const int64_t n_history,
+        const int64_t n_ctx,
+        enum ggml_type * inp_embd_dtype,
+        bool flash_attn) {
     const llama_hparams hparams = model->hparams;
     const int64_t n_layer       = hparams.n_layer;
     const int64_t n_vocab       = hparams.n_vocab;
     const int64_t n_embd        = hparams.n_embd;
     const int64_t n_head        = hparams.n_head();
+    const int64_t n_head_kv     = hparams.n_head_kv();
     const int64_t n_ff          = hparams.n_ff();
-    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
-    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_expert      = hparams.n_expert;
-
+    const int64_t n_pad         = flash_attn ? 256u : 32u;
+    const int64_t n_kv          = std::min(n_ctx, std::max(n_pad, GGML_PAD(n_history, n_pad)));
+
     // assign all the tensors on CPU by default
     model->buft_input  = llama_default_buffer_type_cpu(*model, true);
     model->buft_output = llama_default_buffer_type_cpu(*model, true);
@@ -21045,64 +21083,66 @@ void llama_model_n_flops(
                 break;
             }
             case 2: { // "output_norm.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1); // rms_norm
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd); // norm weights
                 count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                 break;
             }
             case 3: { // "output.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
                 count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                 break;
            }
            case 4:  // "blk.0.attn_norm.weight"
            case 12: // "blk.0.ffn_norm.weight"
            {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 5: { // "blk.0.attn_q.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 6: { // "blk.0.attn_k.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
-                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_k);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd_head_k * n_head_kv); // rope
+                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_k * n_head * n_kv * n_head_kv); // compute kq with kvcache
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_head * n_kv); // scale, mask, and softmax
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 7: { // "blk.0.attn_v.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
-                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_v);
+                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_v * n_head * n_kv * n_head_kv); // compute kqv with kvcache
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 8: { // "blk.0.attn_output.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 9: { // "blk.0.ffn_gate.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 10: { // "blk.0.ffn_down.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 11: { // "blk.0.ffn_up.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
@@ -21117,20 +21157,20 @@ void llama_model_n_flops(
            case 17: // "blk.0.attn_output.bias"
            case 19: // "blk.0.ffn_down.bias"
            {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 18: // "blk.0.ffn_gate.bias"
            case 20: // "blk.0.ffn_up.bias"
            {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_ff);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            // optional: expert tensors
            case 21: { // "blk.0.ffn_gate_inp.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_expert);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
@@ -21138,7 +21178,7 @@ void llama_model_n_flops(
            case 23: // "blk.0.ffn_down_exps.weight"
            case 24: // "blk.0.ffn_up_exps.weight"
            {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff * n_expert);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
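For reviewers, here is a standalone sketch (not part of the patch) of how the new llama_kv_size() derives the padded KV length n_kv and the resulting per-layer cache size. The helper pad_up mirrors what GGML_PAD does (round up to a multiple of the pad size); every numeric value below is a hypothetical example, not a model constant.

    // sketch.cpp -- illustrative only, mirrors the n_kv / KV-size arithmetic in llama_kv_size()
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Round x up to a multiple of n, same idea as GGML_PAD in ggml.h.
    static int64_t pad_up(int64_t x, int64_t n) { return (x + n - 1) / n * n; }

    int main() {
        const bool    flash_attn = false;                    // hypothetical setting
        const int64_t n_ctx      = 8192;                     // context window (hypothetical)
        const int64_t n_history  = 1000;                     // plays the role of cparams.n_predict (hypothetical)
        const int64_t n_pad      = flash_attn ? 256 : 32;    // pad size chosen as in the patch
        const int64_t n_kv       = std::min(n_ctx, std::max(n_pad, pad_up(n_history, n_pad))); // -> 1024

        const int64_t n_embd_k_gqa = 1024;                   // hypothetical per-layer K width (GQA)
        const int64_t n_embd_v_gqa = 1024;                   // hypothetical per-layer V width (GQA)
        const int64_t type_size    = 2;                      // bytes per element for f16 K/V

        const int64_t kv_bytes_per_layer = (n_embd_k_gqa + n_embd_v_gqa) * n_kv * type_size;
        std::printf("n_kv = %lld, KV bytes per layer = %lld\n",
                    (long long) n_kv, (long long) kv_bytes_per_layer);
        return 0;
    }

With these example numbers the cache is sized for 1024 positions (1000 rounded up to the 32-token pad), i.e. about 4 MiB of K/V per layer; this is the quantity that the revised memory-access and disk-access delay estimates add to their RAM/VRAM byte counts.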