diff --git a/common/common.cpp b/common/common.cpp index ae7dd883..33523467 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -901,13 +901,17 @@ static bool assign_layers_to_device( float t_read_ram_cpu = 0.0f; float t_calc_cpu = ( - master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS)+ + master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms + float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms @@ -921,24 +925,32 @@ static bool assign_layers_to_device( if (dev.gpu_support.metal) { t_calc_gpu = ( - master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) + - 
master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms + t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms } else { t_calc_gpu = ( - master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) + - master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) + - master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) + - master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) + - master.model_flops.layer_q5k_f32 / 
(dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) + - master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) + - master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) + + master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) + + master.model_flops.layer_q2k_f32 / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) + + master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) + + master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) + + master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) + + master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) + + master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) + + master.model_flops.layer_iq1s_f32 / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) + + master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms + t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms } @@ -1113,13 +1125,16 @@ static bool assign_layers_to_device( if (m == 0) { kappa = ( - dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + - dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + - dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + - dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + - dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms + dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) + + 
dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) + + dev.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) + + dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) + + dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) + + dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms @@ -1766,33 +1781,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & return mparams; } -static ggml_type kv_cache_type_from_str(const std::string & s) { - if (s == "f32") { - return GGML_TYPE_F32; - } - if (s == "f16") { - return GGML_TYPE_F16; - } - if (s == "q8_0") { - return GGML_TYPE_Q8_0; - } - if (s == "q4_0") { - return GGML_TYPE_Q4_0; - } - if (s == "q4_1") { - return GGML_TYPE_Q4_1; - } - if (s == "iq4_nl") { - return GGML_TYPE_IQ4_NL; - } - if (s == "q5_0") { - return GGML_TYPE_Q5_0; - } - if (s == "q5_1") { - return GGML_TYPE_Q5_1; - } +const std::vector kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, // Added BF16 data type support + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; - throw std::runtime_error("Invalid cache type: " + s); +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); } struct llama_context_params llama_context_params_from_gpt_params(const 
gpt_params & params) { diff --git a/common/profiler.cpp b/common/profiler.cpp index 48af0950..adc9a9e7 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -364,14 +364,16 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in ggml_fp32_to_fp16_row(temp_f32.data(), static_cast(matrix_B), embd_size); break; } + case GGML_TYPE_Q2_K: case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_0: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_Q8_K: + case GGML_TYPE_Q5_0: case GGML_TYPE_Q8_0: - QK_K = 256; - matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t)); + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: + matrix_B = malloc((embd_size / ggml_blck_size(src0t) * ggml_type_size(src0t))); // The quantization block sizes are inconsistent for different quantization methods break; default: LOG_INF("Unsupported type: %d\n", src0t); @@ -1349,31 +1351,39 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.cuda_flops_q2k_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / 
((double)gpu.cuda_flops_iq1s_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.cuda_flops_iq4nl_f32 + EPS) / 1e9; #elif GGML_USE_METAL struct gpu_props gpu = dev_info.gpu_props; gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.metal_flops_q2k_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9; - gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.metal_flops_iq1s_f32 + EPS) / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.metal_flops_iq4nl_f32 + EPS) / 1e9; #endif cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; - cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; 
cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; - + cpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; double total_latency = 0.0f; #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA) @@ -1387,11 +1397,14 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9; - total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9; total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9; + total_latency += (double)n_flops.output_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9; total_latency *= 1000; // convert to ms @@ -1696,18 +1709,17 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CPU flops (Q2K x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", 
dev_info_set[i].cpu_props.flops_q2k_f32); + } + LOG_INF("| CPU flops (Q4K x F32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (Q50 x F32, GFLOPS)"); - for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32); - } - LOG_INF("\n"); - LOG_INF("| CPU flops (Q5K x F32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32); @@ -1720,12 +1732,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CPU flops (Q50 x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32); + } + LOG_INF("\n"); + LOG_INF("| CPU flops (Q80 x F32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32); } LOG_INF("\n"); + LOG_INF("| CPU flops (IQ1S x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1s_f32); + } + LOG_INF("\n"); + + LOG_INF("| CPU flops (IQ4NL x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq4nl_f32); + } + LOG_INF("\n"); + LOG_INF("| Physical Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical); @@ -1882,15 +1912,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| Metal flops (Q4KxF32, GFLOPS)"); + LOG_INF("| Metal flops (Q2KxF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (Q50xF32, GFLOPS)"); + LOG_INF("| Metal flops (Q4KxF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", 
dev_info_set[i].gpu_props.metal_flops_q50_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32); } LOG_INF("\n"); @@ -1906,12 +1936,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| Metal flops (Q50xF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32); + } + LOG_INF("\n"); + LOG_INF("| Metal flops (Q80xF32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32); } LOG_INF("\n"); + LOG_INF("| Metal flops (IQ1SxF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1s_f32); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (IQ4NLxF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq4nl_f32); + } + LOG_INF("\n"); + LOG_INF("| CUDA VRAM Read BW (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw); @@ -1936,15 +1984,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q2KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q2k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); + LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); } LOG_INF("\n"); @@ -1960,12 +2008,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CUDA flops (Q50xF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", 
dev_info_set[i].gpu_props.cuda_flops_q50_f32); + } + LOG_INF("\n"); + LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32); } LOG_INF("\n"); + LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1s_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (IQ4NLxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq4nl_f32); + } + LOG_INF("\n"); + LOG_INF("| Model flops (output F32xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32); LOG_INF("\n"); @@ -1974,12 +2040,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q4KxF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32); + LOG_INF("| Model flops (output Q2KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q2k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output Q50xF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32); + LOG_INF("| Model flops (output Q4KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32); LOG_INF("\n"); LOG_INF("| Model flops (output Q5KxF32) "); @@ -1989,11 +2055,23 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| Model flops (output Q6KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32); LOG_INF("\n"); + + LOG_INF("| Model flops (output Q50xF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32); + LOG_INF("\n"); LOG_INF("| Model flops (output Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32); LOG_INF("\n"); + LOG_INF("| Model 
flops (output IQ1SxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1s_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (output IQ4NLxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq4nl_f32); + LOG_INF("\n"); + LOG_INF("| Model flops (layer F32xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32); LOG_INF("\n"); @@ -2002,12 +2080,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q4KxF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32); + LOG_INF("| Model flops (layer Q2KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q2k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (layer Q50xF32) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32); + LOG_INF("| Model flops (layer Q4KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32); LOG_INF("\n"); LOG_INF("| Model flops (layer Q5KxF32) "); @@ -2018,10 +2096,22 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32); LOG_INF("\n"); + LOG_INF("| Model flops (layer Q50xF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32); + LOG_INF("\n"); + LOG_INF("| Model flops (layer Q80xF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32); LOG_INF("\n"); + LOG_INF("| Model flops (layer IQ1SxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1s_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer IQ4NLxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq4nl_f32); + LOG_INF("\n"); + LOG_INF("| Model params (input F32) "); LOG_INF("| %-10" PRId64 " ", 
dev_info_set[0].model_params.input_f32); LOG_INF("\n"); @@ -2030,12 +2120,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16); LOG_INF("\n"); - LOG_INF("| Model params (input Q4K) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); + LOG_INF("| Model params (input Q2K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q2k); LOG_INF("\n"); - LOG_INF("| Model params (input Q50) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50); + LOG_INF("| Model params (input Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); LOG_INF("\n"); LOG_INF("| Model params (input Q5K) "); @@ -2046,10 +2136,22 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k); LOG_INF("\n"); + LOG_INF("| Model params (input Q50) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50); + LOG_INF("\n"); + LOG_INF("| Model params (input Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80); LOG_INF("\n"); + LOG_INF("| Model params (input IQ1S) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1s); + LOG_INF("\n"); + + LOG_INF("| Model params (input IQ4NL) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq4nl); + LOG_INF("\n"); + LOG_INF("| Model params (layer F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32); LOG_INF("\n"); @@ -2058,12 +2160,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16); LOG_INF("\n"); - LOG_INF("| Model params (layer Q4K) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); + LOG_INF("| Model params (layer Q2K) "); + LOG_INF("| %-10" PRId64 " ", 
dev_info_set[0].model_params.layer_q2k); LOG_INF("\n"); - LOG_INF("| Model params (layer Q50) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50); + LOG_INF("| Model params (layer Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); LOG_INF("\n"); LOG_INF("| Model params (layer Q5K) "); @@ -2074,10 +2176,22 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k); LOG_INF("\n"); + LOG_INF("| Model params (layer Q50) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50); + LOG_INF("\n"); + LOG_INF("| Model params (layer Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80); LOG_INF("\n"); + LOG_INF("| Model params (layer IQ1S) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1s); + LOG_INF("\n"); + + LOG_INF("| Model params (layer IQ4NL) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq4nl); + LOG_INF("\n"); + LOG_INF("| Model params (output F32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32); LOG_INF("\n"); @@ -2086,12 +2200,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16); LOG_INF("\n"); - LOG_INF("| Model params (output Q4K) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); + LOG_INF("| Model params (output Q2K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k); LOG_INF("\n"); - LOG_INF("| Model params (output Q50) "); - LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50); + LOG_INF("| Model params (output Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); LOG_INF("\n"); LOG_INF("| Model params (output Q5K) "); @@ -2102,10 +2216,22 @@ void device_print_props(struct device_info * dev_info_set, int 
n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k); LOG_INF("\n"); + LOG_INF("| Model params (output Q50) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50); + LOG_INF("\n"); + LOG_INF("| Model params (output Q80) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("\n"); + LOG_INF("| Model params (output IQ1S) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1s); + LOG_INF("\n"); + + LOG_INF("| Model params (output IQ4NL) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq4nl); + LOG_INF("\n"); + LOG_INF("| Model bytes (input) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input); LOG_INF("\n"); @@ -2155,17 +2281,38 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + gpu_description_len + sizeof(struct disk_props) + sizeof(uint32_t) // cpu_props.cores - + sizeof(float) * 7 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q50_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + + sizeof(float) * 10 // - cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, + // - cpu_props.flops_q2k_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32 + // - cpu_props.flops_q50_f32, cpu_props.flops_q80_f32 + // - cpu_props.flops_iq1s_f32, cpu_props.flops_iq4nl_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 20; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw, - // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q50_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32, - // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q50_f32, 
gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32, - // gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay + + sizeof(float) * 26; // GPU attributes + // memory: + // - memory_free, memory_total + // - metal_read_vram_bw, cuda_read_vram_bw + // Metal floating-point performance: + // - metal_flops_f32_f32, metal_flops_f16_f32 + // - metal_flops_q2k_f32, metal_flops_q4k_f32, metal_flops_q5k_f32, metal_flops_q6k_f32 + // - metal_flops_q50_f32, metal_flops_q80_f32 + // - metal_flops_iq1s_f32, metal_flops_iq4nl_f32 + // CUDA floating-point performance: + // - cuda_flops_f32_f32, cuda_flops_f16_f32 + // - cuda_flops_q2k_f32, cuda_flops_q4k_f32, cuda_flops_q5k_f32, cuda_flops_q6k_f32 + // - cuda_flops_q50_f32, cuda_flops_q80_f32 + // - cuda_flops_iq1s_f32, cuda_flops_iq4nl_f32 + // delay: + // - metal_mem_cpy_delay, cuda_mem_cpy_delay *buffer = (char *)malloc(total_size); char * ptr = *buffer; + if (*buffer == NULL) { + LOG_ERR("%s: failed to allocate %zu bytes for device info serialization\n", + __func__, total_size); + return 0; + } + // rank memcpy(ptr, &dev_info->rank, sizeof(uint32_t)); ptr += sizeof(uint32_t); @@ -2214,10 +2361,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float)); + memcpy(ptr, &dev_info->cpu_props.flops_q2k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float)); + memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float)); @@ -2226,9 +2373,18 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float)); + ptr += sizeof(float); 
+ memcpy(ptr, &dev_info->cpu_props.flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_iq1s_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->cpu_props.flops_iq4nl_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->memory, sizeof(struct memory_info)); ptr += sizeof(struct memory_info); @@ -2250,10 +2406,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q2k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float)); @@ -2262,9 +2418,18 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1s_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.metal_flops_iq4nl_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float)); ptr += sizeof(float); @@ -2277,10 +2442,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q2k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, 
&dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float)); @@ -2289,9 +2454,18 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1s_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq4nl_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float)); // no need to synchronize model flops and model params @@ -2366,10 +2540,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float)); + memcpy(&dev_info->cpu_props.flops_q2k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float)); + memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float)); @@ -2378,9 +2552,18 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_iq1s_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->cpu_props.flops_iq4nl_f32, ptr, sizeof(float)); + ptr += 
sizeof(float); + memcpy(&dev_info->memory, ptr, sizeof(struct memory_info)); ptr += sizeof(struct memory_info); @@ -2402,10 +2585,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_flops_q2k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float)); @@ -2414,9 +2597,18 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_iq1s_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.metal_flops_iq4nl_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float)); ptr += sizeof(float); @@ -2429,10 +2621,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_q2k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float)); @@ -2441,9 +2633,18 @@ void deserialize(const 
char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_iq1s_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.cuda_flops_iq4nl_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float)); // no need to synchronize model flops and model params diff --git a/common/profiler.h b/common/profiler.h index fb9a4ddb..0681a711 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -17,23 +17,30 @@ struct cpu_props { uint32_t cores; float flops_f32_f32; // in GFLOPS float flops_f16_f32; // in GFLOPS + float flops_q2k_f32; // in GFLOPS float flops_q4k_f32; // in GFLOPS - float flops_q50_f32; // in GFLOPS float flops_q5k_f32; // in GFLOPS float flops_q6k_f32; // in GFLOPS + float flops_q50_f32; // in GFLOPS float flops_q80_f32; // in GFLOPS + float flops_iq1s_f32; // in GFLOPS + float flops_iq4nl_f32; // in GFLOPS - cpu_props() : - name(""), - description(""), - cores(0), - flops_f32_f32(0.0f), - flops_f16_f32(0.0f), - flops_q4k_f32(0.0f), - flops_q50_f32(0.0f), - flops_q5k_f32(0.0f), - flops_q6k_f32(0.0f), - flops_q80_f32(0.0f) {} + cpu_props() + : name (""), + description (""), + cores (0), + flops_f32_f32 (0.0f), + flops_f16_f32 (0.0f), + flops_q2k_f32 (0.0f), + flops_q4k_f32 (0.0f), + flops_q5k_f32 (0.0f), + flops_q6k_f32 (0.0f), + flops_q50_f32 (0.0f), + flops_q80_f32 (0.0f), + flops_iq1s_f32 (0.0f), + flops_iq4nl_f32 (0.0f) + {} }; struct memory_info { @@ -82,127 +89,169 @@ struct gpu_props { float metal_read_vram_bw; // in GB/s float metal_flops_f32_f32; // in GFLOPS float metal_flops_f16_f32; // in GFLOPS + float metal_flops_q2k_f32; // in GFLOPS float 
metal_flops_q4k_f32; // in GFLOPS - float metal_flops_q50_f32; // in GFLOPS float metal_flops_q5k_f32; // in GFLOPS float metal_flops_q6k_f32; // in GFLOPS + float metal_flops_q50_f32; // in GFLOPS float metal_flops_q80_f32; // in GFLOPS + float metal_flops_iq1s_f32; // in GFLOPS + float metal_flops_iq4nl_f32; // in GFLOPS float metal_mem_cpy_delay; // in ms float cuda_read_vram_bw; // in GB/s float cuda_flops_f32_f32; // in GFLOPS float cuda_flops_f16_f32; // in GFLOPS + float cuda_flops_q2k_f32; // in GFLOPS float cuda_flops_q4k_f32; // in GFLOPS - float cuda_flops_q50_f32; // in GFLOPS float cuda_flops_q5k_f32; // in GFLOPS float cuda_flops_q6k_f32; // in GFLOPS + float cuda_flops_q50_f32; // in GFLOPS float cuda_flops_q80_f32; // in GFLOPS + float cuda_flops_iq1s_f32; // in GFLOPS + float cuda_flops_iq4nl_f32; // in GFLOPS float cuda_mem_cpy_delay; // in ms gpu_props() : - name(""), - description(""), - memory_free (0.0f), - memory_total (0.0f), - metal_read_vram_bw (0.0f), - metal_flops_f32_f32(0.0f), - metal_flops_f16_f32(0.0f), - metal_flops_q4k_f32(0.0f), - metal_flops_q50_f32(0.0f), - metal_flops_q5k_f32(0.0f), - metal_flops_q6k_f32(0.0f), - metal_flops_q80_f32(0.0f), - metal_mem_cpy_delay(0.0f), - cuda_read_vram_bw (0.0f), - cuda_flops_f32_f32 (0.0f), - cuda_flops_f16_f32 (0.0f), - cuda_flops_q4k_f32 (0.0f), - cuda_flops_q50_f32 (0.0f), - cuda_flops_q5k_f32 (0.0f), - cuda_flops_q6k_f32 (0.0f), - cuda_flops_q80_f32 (0.0f), - cuda_mem_cpy_delay (0.0f) {} + name (""), + description (""), + memory_free (0.0f), + memory_total (0.0f), + metal_read_vram_bw (0.0f), + metal_flops_f32_f32 (0.0f), + metal_flops_f16_f32 (0.0f), + metal_flops_q2k_f32 (0.0f), + metal_flops_q4k_f32 (0.0f), + metal_flops_q5k_f32 (0.0f), + metal_flops_q6k_f32 (0.0f), + metal_flops_q50_f32 (0.0f), + metal_flops_q80_f32 (0.0f), + metal_flops_iq1s_f32 (0.0f), + metal_flops_iq4nl_f32 (0.0f), + metal_mem_cpy_delay (0.0f), + cuda_read_vram_bw (0.0f), + cuda_flops_f32_f32 (0.0f), + 
cuda_flops_f16_f32 (0.0f), + cuda_flops_q2k_f32 (0.0f), + cuda_flops_q4k_f32 (0.0f), + cuda_flops_q5k_f32 (0.0f), + cuda_flops_q6k_f32 (0.0f), + cuda_flops_q50_f32 (0.0f), + cuda_flops_q80_f32 (0.0f), + cuda_flops_iq1s_f32 (0.0f), + cuda_flops_iq4nl_f32 (0.0f), + cuda_mem_cpy_delay (0.0f) {} }; struct model_flops { float inp_embd_ms; int64_t output_f32_f32; int64_t output_f16_f32; + int64_t output_q2k_f32; int64_t output_q4k_f32; - int64_t output_q50_f32; int64_t output_q5k_f32; int64_t output_q6k_f32; + int64_t output_q50_f32; int64_t output_q80_f32; + int64_t output_iq1s_f32; + int64_t output_iq4nl_f32; int64_t layer_f32_f32; int64_t layer_f16_f32; + int64_t layer_q2k_f32; int64_t layer_q4k_f32; - int64_t layer_q50_f32; int64_t layer_q5k_f32; int64_t layer_q6k_f32; + int64_t layer_q50_f32; int64_t layer_q80_f32; + int64_t layer_iq1s_f32; + int64_t layer_iq4nl_f32; model_flops() : inp_embd_ms(0.0f), output_f32_f32(0), output_f16_f32(0), + output_q2k_f32(0), output_q4k_f32(0), - output_q50_f32(0), output_q5k_f32(0), output_q6k_f32(0), + output_q50_f32(0), output_q80_f32(0), + output_iq1s_f32(0), + output_iq4nl_f32(0), layer_f32_f32 (0), layer_f16_f32 (0), + layer_q2k_f32 (0), layer_q4k_f32 (0), - layer_q50_f32 (0), layer_q5k_f32 (0), layer_q6k_f32 (0), - layer_q80_f32 (0) {} + layer_q50_f32 (0), + layer_q80_f32 (0), + layer_iq1s_f32 (0), + layer_iq4nl_f32 (0) {} }; struct model_params { int64_t input_f32; int64_t input_f16; + int64_t input_q2k; int64_t input_q4k; - int64_t input_q50; int64_t input_q5k; int64_t input_q6k; + int64_t input_q50; int64_t input_q80; + int64_t input_iq1s; + int64_t input_iq4nl; int64_t output_f32; int64_t output_f16; + int64_t output_q2k; int64_t output_q4k; - int64_t output_q50; int64_t output_q5k; int64_t output_q6k; + int64_t output_q50; int64_t output_q80; + int64_t output_iq1s; + int64_t output_iq4nl; int64_t layer_f32; int64_t layer_f16; + int64_t layer_q2k; int64_t layer_q4k; - int64_t layer_q50; int64_t layer_q5k; int64_t 
layer_q6k; + int64_t layer_q50; int64_t layer_q80; + int64_t layer_iq1s; + int64_t layer_iq4nl; model_params() : input_f32 (0), input_f16 (0), + input_q2k (0), input_q4k (0), - input_q50 (0), input_q5k (0), input_q6k (0), + input_q50 (0), input_q80 (0), + input_iq1s(0), + input_iq4nl(0), output_f32(0), output_f16(0), + output_q2k(0), output_q4k(0), - output_q50(0), output_q5k(0), output_q6k(0), + output_q50(0), output_q80(0), + output_iq1s(0), + output_iq4nl(0), layer_f32 (0), layer_f16 (0), + layer_q2k (0), layer_q4k (0), - layer_q50 (0), layer_q5k (0), layer_q6k (0), - layer_q80 (0) {} + layer_q50 (0), + layer_q80 (0), + layer_iq1s (0), + layer_iq4nl (0) {} }; struct model_bytes { diff --git a/src/llama.cpp b/src/llama.cpp index f782616f..51ef97c8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3559,16 +3559,22 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype) case GGML_TYPE_F32: case GGML_TYPE_F16: return true; + case GGML_TYPE_Q2_K: + return n_params->layer_q2k > 0 || n_params->output_q2k > 0; case GGML_TYPE_Q4_K: - return n_params->layer_q4k > 0 || n_params->output_q4k > 0; - case GGML_TYPE_Q5_0: - return n_params->layer_q50 > 0 || n_params->output_q50 > 0; + return n_params->layer_q4k > 0 || n_params->output_q4k > 0; case GGML_TYPE_Q5_K: - return n_params->layer_q5k > 0 || n_params->output_q5k > 0; + return n_params->layer_q5k > 0 || n_params->output_q5k > 0; case GGML_TYPE_Q6_K: - return n_params->layer_q6k > 0 || n_params->output_q6k > 0; + return n_params->layer_q6k > 0 || n_params->output_q6k > 0; + case GGML_TYPE_Q5_0: + return n_params->layer_q50 > 0 || n_params->output_q50 > 0; case GGML_TYPE_Q8_0: - return n_params->layer_q80 > 0 || n_params->output_q80 > 0; + return n_params->layer_q80 > 0 || n_params->output_q80 > 0; + case GGML_TYPE_IQ1_S: + return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0; + case GGML_TYPE_IQ4_NL: + return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0; default: throw 
std::runtime_error("Unrecognized data type\n"); } @@ -3649,18 +3655,18 @@ void llama_profile_device( dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32); } + if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) { + dev_info->cpu_props.flops_q2k_f32 = device_cpu_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q2k_f32 = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32); + } + if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) { dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32); } - if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) { - dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads); - dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32); - } - if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) { dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32); @@ -3673,11 +3679,30 @@ void llama_profile_device( dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); } + if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) { + dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q50_f32 = 
device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) { dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads); dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32); } + + if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) { + dev_info->cpu_props.flops_iq1s_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq1s_f32= device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq1s_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) { + dev_info->cpu_props.flops_iq4nl_f32 = device_cpu_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_iq4nl_f32= device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32); + } } ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) { @@ -21029,49 +21054,67 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en case GGML_TYPE_F16: n_flops->output_f16_f32 += n; break; + case GGML_TYPE_Q2_K: + n_flops->output_q2k_f32 += n; + break; case GGML_TYPE_Q4_K: n_flops->output_q4k_f32 += n; break; - case GGML_TYPE_Q5_0: - n_flops->output_q50_f32 += n; - break; case GGML_TYPE_Q5_K: n_flops->output_q5k_f32 += n; break; case GGML_TYPE_Q6_K: n_flops->output_q6k_f32 += n; break; + case GGML_TYPE_Q5_0: + n_flops->output_q50_f32 += n; + break; case GGML_TYPE_Q8_0: n_flops->output_q80_f32 += n; break; + case GGML_TYPE_IQ1_S: + n_flops->output_iq1s_f32 += n; + break; + case GGML_TYPE_IQ4_NL: + n_flops->output_iq4nl_f32 += n; + break; default: throw 
std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); } break; case PROFILER_LAYER_BACKEND: - switch (dtype) { + switch (dtype) { case GGML_TYPE_F32: n_flops->layer_f32_f32 += n; break; case GGML_TYPE_F16: n_flops->layer_f16_f32 += n; break; + case GGML_TYPE_Q2_K: + n_flops->layer_q2k_f32 += n; + break; case GGML_TYPE_Q4_K: n_flops->layer_q4k_f32 += n; break; - case GGML_TYPE_Q5_0: - n_flops->layer_q50_f32 += n; - break; case GGML_TYPE_Q5_K: n_flops->layer_q5k_f32 += n; break; case GGML_TYPE_Q6_K: n_flops->layer_q6k_f32 += n; break; + case GGML_TYPE_Q5_0: + n_flops->layer_q50_f32 += n; + break; case GGML_TYPE_Q8_0: n_flops->layer_q80_f32 += n; break; + case GGML_TYPE_IQ1_S: + n_flops->layer_iq1s_f32 += n; + break; + case GGML_TYPE_IQ4_NL: + n_flops->layer_iq4nl_f32 += n; + break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n"); } @@ -21093,21 +21136,30 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case GGML_TYPE_F16: n_params->input_f16 += n_i64t; break; + case GGML_TYPE_Q2_K: + n_params->input_q2k += n_i64t; + break; case GGML_TYPE_Q4_K: n_params->input_q4k += n_i64t; break; - case GGML_TYPE_Q5_0: - n_params->input_q50 += n_i64t; - break; case GGML_TYPE_Q5_K: n_params->input_q5k += n_i64t; break; case GGML_TYPE_Q6_K: n_params->input_q6k += n_i64t; break; + case GGML_TYPE_Q5_0: + n_params->input_q50 += n_i64t; + break; case GGML_TYPE_Q8_0: n_params->input_q80 += n_i64t; break; + case GGML_TYPE_IQ1_S: + n_params->input_iq1s += n_i64t; + break; + case GGML_TYPE_IQ4_NL: + n_params->input_iq4nl += n_i64t; + break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); } @@ -21116,25 +21168,34 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case PROFILER_LAYER_OUTPUT: switch (dtype) { case GGML_TYPE_F32: - n_params->output_f32 += n_i64t; + n_params->output_f32 += n_i64t; break; case GGML_TYPE_F16: - 
n_params->output_f16 += n_i64t; + n_params->output_f16 += n_i64t; + break; + case GGML_TYPE_Q2_K: + n_params->output_q2k += n_i64t; break; case GGML_TYPE_Q4_K: - n_params->output_q4k += n_i64t; - break; - case GGML_TYPE_Q5_0: - n_params->output_q50 += n_i64t; + n_params->output_q4k += n_i64t; break; case GGML_TYPE_Q5_K: - n_params->output_q5k += n_i64t; + n_params->output_q5k += n_i64t; break; case GGML_TYPE_Q6_K: - n_params->output_q6k += n_i64t; + n_params->output_q6k += n_i64t; + break; + case GGML_TYPE_Q5_0: + n_params->output_q50 += n_i64t; break; case GGML_TYPE_Q8_0: - n_params->output_q80 += n_i64t; + n_params->output_q80 += n_i64t; + break; + case GGML_TYPE_IQ1_S: + n_params->output_iq1s += n_i64t; + break; + case GGML_TYPE_IQ4_NL: + n_params->output_iq4nl += n_i64t; break; default: throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); @@ -21144,25 +21205,34 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case PROFILER_LAYER_BACKEND: switch (dtype) { case GGML_TYPE_F32: - n_params->layer_f32 += n_i64t; + n_params->layer_f32 += n_i64t; break; case GGML_TYPE_F16: - n_params->layer_f16 += n_i64t; + n_params->layer_f16 += n_i64t; + break; + case GGML_TYPE_Q2_K: + n_params->layer_q2k += n_i64t; break; case GGML_TYPE_Q4_K: - n_params->layer_q4k += n_i64t; - break; - case GGML_TYPE_Q5_0: - n_params->layer_q50 += n_i64t; + n_params->layer_q4k += n_i64t; break; case GGML_TYPE_Q5_K: - n_params->layer_q5k += n_i64t; + n_params->layer_q5k += n_i64t; break; case GGML_TYPE_Q6_K: - n_params->layer_q6k += n_i64t; + n_params->layer_q6k += n_i64t; + break; + case GGML_TYPE_Q5_0: + n_params->layer_q50 += n_i64t; break; case GGML_TYPE_Q8_0: - n_params->layer_q80 += n_i64t; + n_params->layer_q80 += n_i64t; + break; + case GGML_TYPE_IQ1_S: + n_params->layer_iq1s += n_i64t; + break; + case GGML_TYPE_IQ4_NL: + n_params->layer_iq4nl += n_i64t; break; default: throw std::runtime_error("Unrecognized weight type in 
PROFILER_LAYER_BACKEND\n"); @@ -21452,23 +21522,29 @@ void llama_model_n_flops( } // use average values instead of total values - n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer); - n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer); - n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer); - n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer); - n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer); - n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer); - n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer); - - n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer); - n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer); - n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer); - n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer); - n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer); - n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer); - n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer); - - n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer); + n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer); + n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer); + n_flops->layer_q2k_f32 = static_cast<int64_t>((double)n_flops->layer_q2k_f32 / (double)n_layer); + n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer); + n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer); + n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer); + n_flops->layer_q50_f32 = 
static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer); + n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer); + n_flops->layer_iq1s_f32 = static_cast<int64_t>((double)n_flops->layer_iq1s_f32 / (double)n_layer); + n_flops->layer_iq4nl_f32 = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32 / (double)n_layer); + + n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer); + n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer); + n_params->layer_q2k = static_cast<int64_t>((double)n_params->layer_q2k / (double)n_layer); + n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer); + n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer); + n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer); + n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer); + n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer); + n_params->layer_iq1s = static_cast<int64_t>((double)n_params->layer_iq1s / (double)n_layer); + n_params->layer_iq4nl = static_cast<int64_t>((double)n_params->layer_iq4nl / (double)n_layer); + + n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer); // reset ml, model, and clear contexts ml->n_created = 0;