diff --git a/common/common.cpp b/common/common.cpp
index 704c7335..5c972c90 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -901,13 +901,19 @@ static bool assign_layers_to_device(
     float t_read_ram_cpu = 0.0f;
     float t_calc_cpu = (
-        master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
-        master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+        master.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
+        master.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
+        master.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
+        master.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
+        master.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
+
     float t_kv_cpy_cpu  = dev.memory.mem_cpy_delay; // in ms
     // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
@@ -921,24 +927,36 @@ static bool assign_layers_to_device(
         if (dev.gpu_support.metal) {
             t_calc_gpu = (
-                master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
-                master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+                master.model_flops.layer_f32_f32    / (dev.gpu_props.metal_flops_f32_f32    * 1e9 + EPS) +
+                master.model_flops.layer_f16_f32    / (dev.gpu_props.metal_flops_f16_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q2k_f32    / (dev.gpu_props.metal_flops_q2k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q4k_f32    / (dev.gpu_props.metal_flops_q4k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q5k_f32    / (dev.gpu_props.metal_flops_q5k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q6k_f32    / (dev.gpu_props.metal_flops_q6k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q50_f32    / (dev.gpu_props.metal_flops_q50_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q80_f32    / (dev.gpu_props.metal_flops_q80_f32    * 1e9 + EPS) +
+                master.model_flops.layer_iq1s_f32   / (dev.gpu_props.metal_flops_iq1s_f32   * 1e9 + EPS) +
+                master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.metal_flops_iq4nl_f32  * 1e9 + EPS) +
+                master.model_flops.layer_iq1m_f32   / (dev.gpu_props.metal_flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
+
             t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
             // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
         } else {
             t_calc_gpu = (
-                master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
-                master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+                master.model_flops.layer_f32_f32    / (dev.gpu_props.cuda_flops_f32_f32    * 1e9 + EPS) +
+                master.model_flops.layer_f16_f32    / (dev.gpu_props.cuda_flops_f16_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q2k_f32    / (dev.gpu_props.cuda_flops_q2k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q4k_f32    / (dev.gpu_props.cuda_flops_q4k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q5k_f32    / (dev.gpu_props.cuda_flops_q5k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q6k_f32    / (dev.gpu_props.cuda_flops_q6k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q50_f32    / (dev.gpu_props.cuda_flops_q50_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q80_f32    / (dev.gpu_props.cuda_flops_q80_f32    * 1e9 + EPS) +
+                master.model_flops.layer_iq1s_f32   / (dev.gpu_props.cuda_flops_iq1s_f32   * 1e9 + EPS) +
+                master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.cuda_flops_iq4nl_f32  * 1e9 + EPS) +
+                master.model_flops.layer_iq1m_f32   / (dev.gpu_props.cuda_flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
+
             t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
             // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
         }
@@ -1113,14 +1131,18 @@ static bool assign_layers_to_device(
         if (m == 0) {
             kappa = (
-                dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
-
+                dev.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
+                dev.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
+                dev.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
             // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
             kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms
@@ -1766,33 +1788,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     return mparams;
 }
 
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16, // added BF16 data type support
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
 
-    throw std::runtime_error("Invalid cache type: " + s);
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
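Note on the kv-cache refactor above: the if/else chain is replaced by a whitelist that is matched against ggml_type_name(), so the accepted strings are exactly ggml's canonical type names ("f32", "q8_0", "iq4_nl", ...) and a new cache type only needs one line in the table. A minimal standalone sketch of the same pattern, using local stand-ins (fake_type, fake_type_name are hypothetical, not part of this patch):

// sketch.cpp -- standalone analogue of the table-driven kv_cache_type_from_str()
#include <stdexcept>
#include <string>
#include <vector>

enum fake_type { TYPE_F32, TYPE_F16, TYPE_Q8_0 };

static const char * fake_type_name(fake_type t) {
    switch (t) {
        case TYPE_F32:  return "f32";
        case TYPE_F16:  return "f16";
        case TYPE_Q8_0: return "q8_0";
    }
    return "unknown";
}

static const std::vector<fake_type> supported_types = { TYPE_F32, TYPE_F16, TYPE_Q8_0 };

static fake_type type_from_str(const std::string & s) {
    for (const auto & t : supported_types) {
        if (fake_type_name(t) == s) { // const char * vs std::string compares contents
            return t;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

int main() {
    return type_from_str("q8_0") == TYPE_Q8_0 ? 0 : 1;
}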
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 69a20af0..18b345a9 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -188,6 +188,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     };
     struct ggml_context * ctx = ggml_init(params);
 
+    if (n_embd < ggml_blck_size(src0t)) {
+        n_embd = 2 * ggml_blck_size(src0t);
+    }
     struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd);
     struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd);
@@ -208,10 +211,12 @@
     ctx_cgraph = ggml_init(params0);
 
     gf = ggml_new_graph(ctx_cgraph);
+    cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
     for (int i = 0; i < n_repeat - 1; i++) {
         cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
     }
+
     ggml_build_forward_expand(gf, cur);
 }
@@ -364,14 +369,18 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
             ggml_fp32_to_fp16_row(temp_f32.data(), static_cast<ggml_fp16_t *>(matrix_B), embd_size);
             break;
         }
+        case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_Q8_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q8_0:
-            QK_K = ggml_blck_size(src0t);
-            matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t));
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ1_M:
+            matrix_B = malloc((embd_size / ggml_blck_size(src0t)) * ggml_type_size(src0t)); // block sizes differ across quantization types
             break;
         default:
            LOG_INF("Unsupported type: %d\n", src0t);
@@ -1347,33 +1356,47 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 #ifdef GGML_USE_CUDA
     struct gpu_props gpu = dev_info.gpu_props;
 
-    gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_f32_f32    / ((double)gpu.cuda_flops_f32_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_f16_f32    / ((double)gpu.cuda_flops_f16_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q2k_f32    / ((double)gpu.cuda_flops_q2k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q4k_f32    / ((double)gpu.cuda_flops_q4k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q5k_f32    / ((double)gpu.cuda_flops_q5k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q6k_f32    / ((double)gpu.cuda_flops_q6k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.cuda_flops_iq2xxs_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)gpu.cuda_flops_q50_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q80_f32    / ((double)gpu.cuda_flops_q80_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32   / ((double)gpu.cuda_flops_iq1s_f32   + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32  / ((double)gpu.cuda_flops_iq4nl_f32  + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32   / ((double)gpu.cuda_flops_iq1m_f32   + EPS) / 1e9;
 #elif GGML_USE_METAL
     struct gpu_props gpu = dev_info.gpu_props;
 
-    gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_f32_f32    / ((double)gpu.metal_flops_f32_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_f16_f32    / ((double)gpu.metal_flops_f16_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q2k_f32    / ((double)gpu.metal_flops_q2k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q4k_f32    / ((double)gpu.metal_flops_q4k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q5k_f32    / ((double)gpu.metal_flops_q5k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q6k_f32    / ((double)gpu.metal_flops_q6k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.metal_flops_iq2xxs_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)gpu.metal_flops_q50_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q80_f32    / ((double)gpu.metal_flops_q80_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32   / ((double)gpu.metal_flops_iq1s_f32   + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32  / ((double)gpu.metal_flops_iq4nl_f32  + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32   / ((double)gpu.metal_flops_iq1m_f32   + EPS) / 1e9;
 #endif
 
-    cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9;
-
+    cpu_latency_per_layer += (double)n_flops.layer_f32_f32    / ((double)cpu.flops_f32_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_f16_f32    / ((double)cpu.flops_f16_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q2k_f32    / ((double)cpu.flops_q2k_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q4k_f32    / ((double)cpu.flops_q4k_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q5k_f32    / ((double)cpu.flops_q5k_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q6k_f32    / ((double)cpu.flops_q6k_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)cpu.flops_q50_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q80_f32    / ((double)cpu.flops_q80_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq1s_f32   / ((double)cpu.flops_iq1s_f32   + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32  / ((double)cpu.flops_iq4nl_f32  + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq1m_f32   / ((double)cpu.flops_iq1m_f32   + EPS) / 1e9;
     double total_latency = 0.0f;
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
@@ -1385,13 +1408,18 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
     total_latency += cpu_latency_per_layer * n_layers;
 #endif
 
-    total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9;
+    total_latency += (double)n_flops.output_f32_f32    / ((double)cpu.flops_f32_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_f16_f32    / ((double)cpu.flops_f16_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q2k_f32    / ((double)cpu.flops_q2k_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q4k_f32    / ((double)cpu.flops_q4k_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q5k_f32    / ((double)cpu.flops_q5k_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q6k_f32    / ((double)cpu.flops_q6k_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q50_f32    / ((double)cpu.flops_q50_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q80_f32    / ((double)cpu.flops_q80_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq1s_f32   / ((double)cpu.flops_iq1s_f32   + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq4nl_f32  / ((double)cpu.flops_iq4nl_f32  + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq1m_f32   / ((double)cpu.flops_iq1m_f32   + EPS) / 1e9;
 
     total_latency *= 1000; // convert to ms
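Every term in these hunks is the same latency model: a per-dtype FLOP count divided by that device's measured throughput for the dtype, with EPS keeping the division finite when a dtype was never profiled (0 GFLOPS). A self-contained sketch of one term (the EPS value here is illustrative; the real constant is defined elsewhere in the codebase):

// latency_term.cpp -- the per-dtype latency term in isolation
#include <cstdio>

static double term_seconds(double n_flops, double gflops, double eps) {
    return n_flops / (gflops * 1e9 + eps); // GFLOPS -> FLOPS, then work / rate
}

int main() {
    // e.g. 3.5e9 Q4K x F32 FLOPs at a measured 180 GFLOPS -> ~19.4 ms
    printf("%.3f ms\n", term_seconds(3.5e9, 180.0, 1e-9) * 1e3);
    // an unprofiled dtype reports 0 GFLOPS; EPS keeps the term finite
    printf("%.3f ms\n", term_seconds(1.0e9, 0.0, 1e-9) * 1e3);
    return 0;
}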
@@ -1647,474 +1675,664 @@ static float device_mem_copy_delay(struct device_info & dev_info, struct llama_m
 void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
-    LOG_INF("| Property                     ");
+    LOG_INF("| Property                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| Rank %-8d", i);
         GGML_ASSERT((int)dev_info_set[i].rank == i);
     }
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
 
-    LOG_INF("| Device Name                  ");
+    LOG_INF("| Device Name                     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].device_name);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Device OS                    ");
+    LOG_INF("| Device OS                       ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].device_os);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU Name                     ");
+    LOG_INF("| CPU Name                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.name);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU Description              ");
+    LOG_INF("| CPU Description                 ");
    for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.description);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Number of CPU cores          ");
+    LOG_INF("| Number of CPU cores             ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10u ", dev_info_set[i].cpu_props.cores);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (F32xF32, GFLOPS)  ");
+    LOG_INF("| CPU flops (F32xF32, GFLOPS)     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (F16xF32, GFLOPS)  ");
+    LOG_INF("| CPU flops (F16xF32, GFLOPS)     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (Q4K x F32, GFLOPS)");
+    LOG_INF("| CPU flops (Q2K x F32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q2k_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (Q4K x F32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (Q50 x F32, GFLOPS)");
-    for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32);
-    }
-    LOG_INF("\n");
-
-    LOG_INF("| CPU flops (Q5K x F32, GFLOPS)");
+    LOG_INF("| CPU flops (Q5K x F32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (Q6K x F32, GFLOPS)");
+    LOG_INF("| CPU flops (Q6K x F32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (Q80 x F32, GFLOPS)");
+    LOG_INF("| CPU flops (IQ2XXS x F32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq2xxs_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (Q50 x F32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (Q80 x F32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Physical Mem Total (GiB)     ");
+    LOG_INF("| CPU flops (IQ1S x F32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1s_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (IQ4NL x F32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq4nl_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (IQ1M x F32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1m_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Physical Mem Total (GiB)        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Physical Mem Available (GiB) ");
+    LOG_INF("| Physical Mem Available (GiB)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_physical);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Used Mem Swappable (GiB)     ");
+    LOG_INF("| Used Mem Swappable (GiB)        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.used_can_swap);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Swap Mem Total (GiB)         ");
+    LOG_INF("| Swap Mem Total (GiB)            ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_swap);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Swap Mem Available (GiB)     ");
+    LOG_INF("| Swap Mem Available (GiB)        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_swap);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU RAM Read BW (GB/s)       ");
+    LOG_INF("| CPU RAM Read BW (GB/s)          ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.cpu_read_ram_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU KVCache Copy Time (ms/l) ");
+    LOG_INF("| CPU KVCache Copy Time (ms/l)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.mem_cpy_delay);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Disk Read Seq Speed (GB/s)   ");
+    LOG_INF("| Disk Read Seq Speed (GB/s)      ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Disk Write Seq Speed (GB/s)  ");
+    LOG_INF("| Disk Write Seq Speed (GB/s)     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Disk Read Rnd Speed (GB/s)   ");
+    LOG_INF("| Disk Read Rnd Speed (GB/s)      ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_rnd_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Disk Write Rnd Speed (GB/s)  ");
+    LOG_INF("| Disk Write Rnd Speed (GB/s)     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_rnd_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Metal                    ");
+    LOG_INF("| GPU Metal                       ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.metal);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU CUDA                     ");
+    LOG_INF("| GPU CUDA                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.cuda);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Vulkan                   ");
+    LOG_INF("| GPU Vulkan                      ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.vulkan);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Kompute                  ");
+    LOG_INF("| GPU Kompute                     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.kompute);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU BLAS                     ");
+    LOG_INF("| GPU BLAS                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.gpublas);
     }
     LOG_INF("\n");
 
-    LOG_INF("| BLAS                         ");
+    LOG_INF("| BLAS                            ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.blas);
     }
     LOG_INF("\n");
 
-    LOG_INF("| SYCL                         ");
+    LOG_INF("| SYCL                            ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.sycl);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Name                     ");
+    LOG_INF("| GPU Name                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.name);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Description              ");
+    LOG_INF("| GPU Description                 ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.description);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Mem Free (GiB)           ");
+    LOG_INF("| GPU Mem Free (GiB)              ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_free);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Mem Total (GiB)          ");
+    LOG_INF("| GPU Mem Total (GiB)             ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_total);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal VRAM Read BW (GB/s)    ");
+    LOG_INF("| Metal VRAM Read BW (GB/s)       ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_read_vram_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal KVCache Copy Time(ms/l)");
+    LOG_INF("| Metal KVCache Copy Time(ms/l)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_mem_cpy_delay);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (F32xF32, GFLOPS)");
+    LOG_INF("| Metal flops (F32xF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (F16xF32, GFLOPS)");
+    LOG_INF("| Metal flops (F16xF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (Q4KxF32, GFLOPS)");
+    LOG_INF("| Metal flops (Q2KxF32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q2k_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (Q4KxF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (Q50xF32, GFLOPS)");
-    for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32);
-    }
-    LOG_INF("\n");
-
-    LOG_INF("| Metal flops (Q5KxF32, GFLOPS)");
+    LOG_INF("| Metal flops (Q5KxF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q5k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (Q6KxF32, GFLOPS)");
+    LOG_INF("| Metal flops (Q6KxF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (Q80xF32, GFLOPS)");
+    LOG_INF("| Metal flops (IQ2XXSxF32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq2xxs_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (Q50xF32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (Q80xF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA VRAM Read BW (GB/s)     ");
+    LOG_INF("| Metal flops (IQ1SxF32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1s_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (IQ4NLxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq4nl_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (IQ1MxF32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1m_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA VRAM Read BW (GB/s)        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA KVCache Copy Time (ms/l)");
+    LOG_INF("| CUDA KVCache Copy Time (ms/l)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_mem_cpy_delay);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (F32xF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (F32xF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (F16xF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (F16xF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (Q2KxF32, GFLOPS)    ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q2k_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (Q4KxF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q50xF32, GFLOPS) ");
-    for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32);
-    }
-    LOG_INF("\n");
-
-    LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (Q5KxF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q5k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (Q6KxF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q80xF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (IQ2XXSxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq2xxs_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (Q50xF32, GFLOPS)    ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (Q80xF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output F32xF32) ");
+    LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1s_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (IQ4NLxF32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq4nl_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (IQ1MxF32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1m_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output F32xF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output F16xF32) ");
+    LOG_INF("| Model flops (output F16xF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output Q4KxF32) ");
+    LOG_INF("| Model flops (output Q2KxF32)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q2k_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q4KxF32)    ");
    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output Q50xF32) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32);
-    LOG_INF("\n");
-
-    LOG_INF("| Model flops (output Q5KxF32) ");
+    LOG_INF("| Model flops (output Q5KxF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q5k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output Q6KxF32) ");
+    LOG_INF("| Model flops (output Q6KxF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output Q80xF32) ");
+    LOG_INF("| Model flops (output IQ2XXSxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq2xxs_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q50xF32)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q80xF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer F32xF32)  ");
+    LOG_INF("| Model flops (output IQ1SxF32)   ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1s_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output IQ4NLxF32)  ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq4nl_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output IQ1MxF32)   ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1m_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer F32xF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer F16xF32)  ");
+    LOG_INF("| Model flops (layer F16xF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer Q4KxF32)  ");
+    LOG_INF("| Model flops (layer Q2KxF32)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q2k_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q4KxF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer Q50xF32)  ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32);
-    LOG_INF("\n");
-
-    LOG_INF("| Model flops (layer Q5KxF32)  ");
+    LOG_INF("| Model flops (layer Q5KxF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q5k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer Q6KxF32)  ");
+    LOG_INF("| Model flops (layer Q6KxF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer Q80xF32)  ");
+    LOG_INF("| Model flops (layer IQ2XXSxF32)  ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq2xxs_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q50xF32)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q80xF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input F32)     ");
+    LOG_INF("| Model flops (layer IQ1SxF32)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1s_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer IQ4NLxF32)   ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq4nl_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer IQ1MxF32)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1m_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input F32)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input F16)     ");
+    LOG_INF("| Model params (input F16)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input Q4K)     ");
+    LOG_INF("| Model params (input Q2K)        ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q2k);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input Q4K)        ");
    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input Q50)     ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50);
-    LOG_INF("\n");
-
-    LOG_INF("| Model params (input Q5K)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q5k);
+    LOG_INF("| Model params (input Q5K)        ");
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input Q6K)     ");
+    LOG_INF("| Model params (input Q6K)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input Q80)     ");
+    LOG_INF("| Model params (input IQ2XXS)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq2xxs);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input Q50)        ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input Q80)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer F32)     ");
+    LOG_INF("| Model params (input IQ1S)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1s);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input IQ4NL)      ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq4nl);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input IQ1M)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1m);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer F32)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer F16)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16);
+    LOG_INF("| Model params (layer F16)        ");
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer Q4K)     ");
+    LOG_INF("| Model params (layer Q2K)        ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q2k);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer Q4K)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer Q50)     ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50);
-    LOG_INF("\n");
-
-    LOG_INF("| Model params (layer Q5K)     ");
+    LOG_INF("| Model params (layer Q5K)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q5k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer Q6K)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k);
+    LOG_INF("| Model params (layer Q6K)        ");
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer Q80)     ");
+    LOG_INF("| Model params (layer IQ2XXS)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq2xxs);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer Q50)        ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer Q80)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output F32)    ");
+    LOG_INF("| Model params (layer IQ1S)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1s);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer IQ4NL)      ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq4nl);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer IQ1M)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1m);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output F32)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output F16)    ");
+    LOG_INF("| Model params (output F16)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output Q4K)    ");
+    LOG_INF("| Model params (output Q2K)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output Q4K)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output Q50)    ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50);
-    LOG_INF("\n");
-
-    LOG_INF("| Model params (output Q5K)    ");
+    LOG_INF("| Model params (output Q5K)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q5k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output Q6K)    ");
+    LOG_INF("| Model params (output Q6K)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output Q80)    ");
+    LOG_INF("| Model params (output IQ2XXS)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq2xxs);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output Q50)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output Q80)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
     LOG_INF("\n");
 
-    LOG_INF("| Model bytes (input)          ");
+    LOG_INF("| Model params (output IQ1S)      ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1s);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output IQ4NL)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq4nl);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output IQ1M)      ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1m);
+    LOG_INF("\n");
+
+    LOG_INF("| Model bytes (input)             ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input);
     LOG_INF("\n");
 
-    LOG_INF("| Model bytes (layer)          ");
+    LOG_INF("| Model bytes (layer)             ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_layer);
     LOG_INF("\n");
 
-    LOG_INF("| Model bytes (output)         ");
+    LOG_INF("| Model bytes (output)            ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output);
     LOG_INF("\n");
 
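The serialize() hunk below grows the fixed-size float sections from 7 to 12 CPU entries and from 20 to 30 GPU entries; those counts must track the memcpy sequence exactly. A compile-time sanity check for the arithmetic (counts only, derived from the comments in the hunk; not part of the patch):

// float_counts.cpp -- sanity check for serialize()'s buffer-size arithmetic
enum {
    N_DTYPES     = 12, // f32, f16, q2k, q4k, q5k, q6k, iq2xxs, q50, q80, iq1s, iq4nl, iq1m
    N_CPU_FLOATS = N_DTYPES,
    N_GPU_FLOATS = 2            // memory_free, memory_total
                 + 2            // metal_read_vram_bw, cuda_read_vram_bw
                 + 2 * N_DTYPES // metal_flops_*, cuda_flops_*
                 + 2            // metal_mem_cpy_delay, cuda_mem_cpy_delay
};
static_assert(N_CPU_FLOATS == 12, "must match sizeof(float) * 12 in serialize()");
static_assert(N_GPU_FLOATS == 30, "must match sizeof(float) * 30 in serialize()");

int main() { return 0; }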
@@ -2155,17 +2373,44 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
                        + gpu_description_len
                        + sizeof(struct disk_props)
                        + sizeof(uint32_t)       // cpu_props.cores
-                       + sizeof(float) * 7      // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q50_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
+                       + sizeof(float) * 12     // - cpu_props.flops_f32_f32, cpu_props.flops_f16_f32,
+                                                // - cpu_props.flops_q2k_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32
+                                                // - cpu_props.flops_iq2xxs_f32
+                                                // - cpu_props.flops_q50_f32, cpu_props.flops_q80_f32
+                                                // - cpu_props.flops_iq1s_f32, cpu_props.flops_iq4nl_f32
+                                                // - cpu_props.flops_iq1m_f32
                        + sizeof(struct memory_info)
                        + sizeof(struct gpu_support)
-                       + sizeof(float) * 20;    // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
-                                                // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q50_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
-                                                // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q50_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32,
-                                                // gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay
+                       + sizeof(float) * 30;    // GPU attributes
+                                                // memory:
+                                                // - memory_free, memory_total
+                                                // - metal_read_vram_bw, cuda_read_vram_bw
+                                                // Metal floating-point performance:
+                                                // - metal_flops_f32_f32, metal_flops_f16_f32
+                                                // - metal_flops_q2k_f32, metal_flops_q4k_f32, metal_flops_q5k_f32, metal_flops_q6k_f32
+                                                // - metal_flops_iq2xxs_f32
+                                                // - metal_flops_q50_f32, metal_flops_q80_f32
+                                                // - metal_flops_iq1s_f32, metal_flops_iq4nl_f32
+                                                // - metal_flops_iq1m_f32
+                                                // CUDA floating-point performance:
+                                                // - cuda_flops_f32_f32, cuda_flops_f16_f32
+                                                // - cuda_flops_q2k_f32, cuda_flops_q4k_f32, cuda_flops_q5k_f32, cuda_flops_q6k_f32
+                                                // - cuda_flops_iq2xxs_f32
+                                                // - cuda_flops_q50_f32, cuda_flops_q80_f32
+                                                // - cuda_flops_iq1s_f32, cuda_flops_iq4nl_f32
+                                                // - cuda_flops_iq1m_f32
+                                                // delay:
+                                                // - metal_mem_cpy_delay, cuda_mem_cpy_delay
 
     *buffer = (char *)malloc(total_size);
     char * ptr = *buffer;
 
+    if (*buffer == NULL) {
+        LOG_ERR("%s: failed to allocate %zu bytes for device info serialization\n",
+                __func__, total_size);
+        return 0;
+    }
+
     // rank
     memcpy(ptr, &dev_info->rank, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
@@ -2214,10 +2459,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_q2k_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float));
@@ -2226,9 +2471,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_iq2xxs_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->cpu_props.flops_q80_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_iq1s_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_iq4nl_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_iq1m_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -2250,10 +2510,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q2k_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float));
@@ -2262,9 +2522,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq2xxs_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1s_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq4nl_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1m_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float));
     ptr += sizeof(float);
@@ -2277,10 +2552,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q2k_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float));
@@ -2289,9 +2564,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq2xxs_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1s_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq4nl_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1m_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float));
 
     // no need to synchronize model flops and model params
@@ -2366,10 +2656,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_q2k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float));
@@ -2378,9 +2668,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->cpu_props.flops_iq2xxs_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->cpu_props.flops_q80_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->cpu_props.flops_iq1s_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_iq4nl_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_iq1m_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -2402,10 +2707,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_q2k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float));
@@ -2414,9 +2719,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.metal_flops_iq2xxs_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.metal_flops_iq1s_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_iq4nl_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_iq1m_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float));
     ptr += sizeof(float);
@@ -2429,10 +2749,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_q2k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float));
@@ -2441,9 +2761,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.cuda_flops_iq2xxs_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.cuda_flops_iq1s_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_iq4nl_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_iq1m_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float));
 
     // no need to synchronize model flops and model params
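Because the new fields slot into the middle of the wire format (q2k now occupies the position q4k used to), serialize() and deserialize() must move in lockstep or peers on mismatched builds will silently misread every following field. One way to make that invariant structural rather than manual (a sketch only, not part of this patch) is to generate both directions from a single X-macro field list:

// field_list.cpp -- generating both copy directions from one ordered list
#include <cstring>

struct cpu_props_s { float flops_f32_f32, flops_f16_f32, flops_q2k_f32; /* ... */ };

#define CPU_FLOPS_FIELDS(X) \
    X(flops_f32_f32)        \
    X(flops_f16_f32)        \
    X(flops_q2k_f32)
// ... one entry per serialized float, in wire order

static char * write_cpu_flops(char * ptr, const cpu_props_s & p) {
#define WRITE_FIELD(f) memcpy(ptr, &p.f, sizeof(float)); ptr += sizeof(float);
    CPU_FLOPS_FIELDS(WRITE_FIELD)
#undef WRITE_FIELD
    return ptr;
}

static const char * read_cpu_flops(const char * ptr, cpu_props_s & p) {
#define READ_FIELD(f) memcpy(&p.f, ptr, sizeof(float)); ptr += sizeof(float);
    CPU_FLOPS_FIELDS(READ_FIELD)
#undef READ_FIELD
    return ptr;
}

int main() {
    cpu_props_s a = { 1.0f, 2.0f, 3.0f }, b = {};
    char buf[sizeof(float) * 3];
    write_cpu_flops(buf, a);
    read_cpu_flops(buf, b);
    return b.flops_q2k_f32 == 3.0f ? 0 : 1; // round-trips in the same order
}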
diff --git a/common/profiler.h b/common/profiler.h
index fb9a4ddb..b8fff0d1 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -15,25 +15,36 @@ struct cpu_props {
     const char * name;
     const char * description;
     uint32_t cores;
-    float flops_f32_f32; // in GFLOPS
-    float flops_f16_f32; // in GFLOPS
-    float flops_q4k_f32; // in GFLOPS
-    float flops_q50_f32; // in GFLOPS
-    float flops_q5k_f32; // in GFLOPS
-    float flops_q6k_f32; // in GFLOPS
-    float flops_q80_f32; // in GFLOPS
+    float flops_f32_f32;    // in GFLOPS
+    float flops_f16_f32;    // in GFLOPS
+    float flops_q2k_f32;    // in GFLOPS
+    float flops_q4k_f32;    // in GFLOPS
+    float flops_q5k_f32;    // in GFLOPS
+    float flops_q6k_f32;    // in GFLOPS
+    float flops_iq2xxs_f32; // in GFLOPS
+    float flops_q50_f32;    // in GFLOPS
+    float flops_q80_f32;    // in GFLOPS
+    float flops_iq1s_f32;   // in GFLOPS
+    float flops_iq4nl_f32;  // in GFLOPS
+    float flops_iq1m_f32;   // in GFLOPS

-    cpu_props() :
-        name(""),
-        description(""),
-        cores(0),
-        flops_f32_f32(0.0f),
-        flops_f16_f32(0.0f),
-        flops_q4k_f32(0.0f),
-        flops_q50_f32(0.0f),
-        flops_q5k_f32(0.0f),
-        flops_q6k_f32(0.0f),
-        flops_q80_f32(0.0f) {}
+    cpu_props()
+        : name            (""),
+          description     (""),
+          cores           (0),
+          flops_f32_f32   (0.0f),
+          flops_f16_f32   (0.0f),
+          flops_q2k_f32   (0.0f),
+          flops_q4k_f32   (0.0f),
+          flops_q5k_f32   (0.0f),
+          flops_q6k_f32   (0.0f),
+          flops_iq2xxs_f32(0.0f),
+          flops_q50_f32   (0.0f),
+          flops_q80_f32   (0.0f),
+          flops_iq1s_f32  (0.0f),
+          flops_iq4nl_f32 (0.0f),
+          flops_iq1m_f32  (0.0f)
+    {}
 };

 struct memory_info {
@@ -77,132 +88,204 @@ struct gpu_support {
 struct gpu_props {
     const char * name;
     const char * description;
-    float memory_free;         // in GiB
-    float memory_total;        // in GiB
-    float metal_read_vram_bw;  // in GB/s
-    float metal_flops_f32_f32; // in GFLOPS
-    float metal_flops_f16_f32; // in GFLOPS
-    float metal_flops_q4k_f32; // in GFLOPS
-    float metal_flops_q50_f32; // in GFLOPS
-    float metal_flops_q5k_f32; // in GFLOPS
-    float metal_flops_q6k_f32; // in GFLOPS
-    float metal_flops_q80_f32; // in GFLOPS
-    float metal_mem_cpy_delay; // in ms
-    float cuda_read_vram_bw;   // in GB/s
-    float cuda_flops_f32_f32;  // in GFLOPS
-    float cuda_flops_f16_f32;  // in GFLOPS
-    float cuda_flops_q4k_f32;  // in GFLOPS
-    float cuda_flops_q50_f32;  // in GFLOPS
-    float cuda_flops_q5k_f32;  // in GFLOPS
-    float cuda_flops_q6k_f32;  // in GFLOPS
-    float cuda_flops_q80_f32;  // in GFLOPS
-    float cuda_mem_cpy_delay;  // in ms
+    float memory_free;            // in GiB
+    float memory_total;           // in GiB
+    float metal_read_vram_bw;     // in GB/s
+    float metal_flops_f32_f32;    // in GFLOPS
+    float metal_flops_f16_f32;    // in GFLOPS
+    float metal_flops_q2k_f32;    // in GFLOPS
+    float metal_flops_q4k_f32;    // in GFLOPS
+    float metal_flops_q5k_f32;    // in GFLOPS
+    float metal_flops_q6k_f32;    // in GFLOPS
+    float metal_flops_iq2xxs_f32; // in GFLOPS
+    float metal_flops_q50_f32;    // in GFLOPS
+    float metal_flops_q80_f32;    // in GFLOPS
+    float metal_flops_iq1s_f32;   // in GFLOPS
+    float metal_flops_iq4nl_f32;  // in GFLOPS
+    float metal_flops_iq1m_f32;   // in GFLOPS
+    float metal_mem_cpy_delay;    // in ms
+    float cuda_read_vram_bw;      // in GB/s
+    float cuda_flops_f32_f32;     // in GFLOPS
+    float cuda_flops_f16_f32;     // in GFLOPS
+    float cuda_flops_q2k_f32;     // in GFLOPS
+    float cuda_flops_q4k_f32;     // in GFLOPS
+    float cuda_flops_q5k_f32;     // in GFLOPS
+    float cuda_flops_q6k_f32;     // in GFLOPS
+    float cuda_flops_iq2xxs_f32;  // in GFLOPS
+    float cuda_flops_q50_f32;     // in GFLOPS
+    float cuda_flops_q80_f32;     // in GFLOPS
+    float cuda_flops_iq1s_f32;    // in GFLOPS
+    float cuda_flops_iq4nl_f32;   // in GFLOPS
+    float cuda_flops_iq1m_f32;    // in GFLOPS
+    float cuda_mem_cpy_delay;     // in ms

     gpu_props() :
-        name(""),
-        description(""),
-        memory_free        (0.0f),
-        memory_total       (0.0f),
-        metal_read_vram_bw (0.0f),
-        metal_flops_f32_f32(0.0f),
-        metal_flops_f16_f32(0.0f),
-        metal_flops_q4k_f32(0.0f),
-        metal_flops_q50_f32(0.0f),
-        metal_flops_q5k_f32(0.0f),
-        metal_flops_q6k_f32(0.0f),
-        metal_flops_q80_f32(0.0f),
-        metal_mem_cpy_delay(0.0f),
-        cuda_read_vram_bw  (0.0f),
-        cuda_flops_f32_f32 (0.0f),
-        cuda_flops_f16_f32 (0.0f),
-        cuda_flops_q4k_f32 (0.0f),
-        cuda_flops_q50_f32 (0.0f),
-        cuda_flops_q5k_f32 (0.0f),
-        cuda_flops_q6k_f32 (0.0f),
-        cuda_flops_q80_f32 (0.0f),
-        cuda_mem_cpy_delay (0.0f) {}
+        name                  (""),
+        description           (""),
+        memory_free           (0.0f),
+        memory_total          (0.0f),
+        metal_read_vram_bw    (0.0f),
+        metal_flops_f32_f32   (0.0f),
+        metal_flops_f16_f32   (0.0f),
+        metal_flops_q2k_f32   (0.0f),
+        metal_flops_q4k_f32   (0.0f),
+        metal_flops_q5k_f32   (0.0f),
+        metal_flops_q6k_f32   (0.0f),
+        metal_flops_iq2xxs_f32(0.0f),
+        metal_flops_q50_f32   (0.0f),
+        metal_flops_q80_f32   (0.0f),
+        metal_flops_iq1s_f32  (0.0f),
+        metal_flops_iq4nl_f32 (0.0f),
+        metal_flops_iq1m_f32  (0.0f),
+        metal_mem_cpy_delay   (0.0f),
+        cuda_read_vram_bw     (0.0f),
+        cuda_flops_f32_f32    (0.0f),
+        cuda_flops_f16_f32    (0.0f),
+        cuda_flops_q2k_f32    (0.0f),
+        cuda_flops_q4k_f32    (0.0f),
+        cuda_flops_q5k_f32    (0.0f),
+        cuda_flops_q6k_f32    (0.0f),
+        cuda_flops_iq2xxs_f32 (0.0f),
+        cuda_flops_q50_f32    (0.0f),
+        cuda_flops_q80_f32    (0.0f),
+        cuda_flops_iq1s_f32   (0.0f),
+        cuda_flops_iq4nl_f32  (0.0f),
+        cuda_flops_iq1m_f32   (0.0f),
+        cuda_mem_cpy_delay    (0.0f) {}
 };

 struct model_flops {
     float   inp_embd_ms;
     int64_t output_f32_f32;
     int64_t output_f16_f32;
+    int64_t output_q2k_f32;
     int64_t output_q4k_f32;
-    int64_t output_q50_f32;
     int64_t output_q5k_f32;
     int64_t output_q6k_f32;
+    int64_t output_iq2xxs_f32;
+    int64_t output_q50_f32;
     int64_t output_q80_f32;
+    int64_t output_iq1s_f32;
+    int64_t output_iq4nl_f32;
+    int64_t output_iq1m_f32;
     int64_t layer_f32_f32;
     int64_t layer_f16_f32;
+    int64_t layer_q2k_f32;
     int64_t layer_q4k_f32;
-    int64_t layer_q50_f32;
     int64_t layer_q5k_f32;
     int64_t layer_q6k_f32;
+    int64_t layer_iq2xxs_f32;
+    int64_t layer_q50_f32;
     int64_t layer_q80_f32;
+    int64_t layer_iq1s_f32;
+    int64_t layer_iq4nl_f32;
+    int64_t layer_iq1m_f32;

     model_flops() :
-        inp_embd_ms(0.0f),
-        output_f32_f32(0),
-        output_f16_f32(0),
-        output_q4k_f32(0),
-        output_q50_f32(0),
-        output_q5k_f32(0),
-        output_q6k_f32(0),
-        output_q80_f32(0),
-        layer_f32_f32 (0),
-        layer_f16_f32 (0),
-        layer_q4k_f32 (0),
-        layer_q50_f32 (0),
-        layer_q5k_f32 (0),
-        layer_q6k_f32 (0),
-        layer_q80_f32 (0) {}
+        inp_embd_ms       (0.0f),
+        output_f32_f32    (0),
+        output_f16_f32    (0),
+        output_q2k_f32    (0),
+        output_q4k_f32    (0),
+        output_q5k_f32    (0),
+        output_q6k_f32    (0),
+        output_iq2xxs_f32 (0),
+        output_q50_f32    (0),
+        output_q80_f32    (0),
+        output_iq1s_f32   (0),
+        output_iq4nl_f32  (0),
+        output_iq1m_f32   (0),
+        layer_f32_f32     (0),
+        layer_f16_f32     (0),
+        layer_q2k_f32     (0),
+        layer_q4k_f32     (0),
+        layer_q5k_f32     (0),
+        layer_q6k_f32     (0),
+        layer_iq2xxs_f32  (0),
+        layer_q50_f32     (0),
+        layer_q80_f32     (0),
+        layer_iq1s_f32    (0),
+        layer_iq4nl_f32   (0),
+        layer_iq1m_f32    (0)
+    {}
 };

 struct model_params {
     int64_t input_f32;
     int64_t input_f16;
+    int64_t input_q2k;
     int64_t input_q4k;
-    int64_t input_q50;
     int64_t input_q5k;
     int64_t input_q6k;
+    int64_t input_iq2xxs;
+    int64_t input_q50;
     int64_t input_q80;
+    int64_t input_iq1s;
+    int64_t input_iq4nl;
+    int64_t input_iq1m;
     int64_t output_f32;
     int64_t output_f16;
+    int64_t output_q2k;
     int64_t output_q4k;
-    int64_t output_q50;
     int64_t output_q5k;
     int64_t output_q6k;
+    int64_t output_iq2xxs;
+    int64_t output_q50;
     int64_t output_q80;
+    int64_t output_iq1s;
+    int64_t output_iq4nl;
+    int64_t output_iq1m;
     int64_t layer_f32;
     int64_t layer_f16;
+    int64_t layer_q2k;
     int64_t layer_q4k;
-    int64_t layer_q50;
     int64_t layer_q5k;
     int64_t layer_q6k;
+    int64_t layer_iq2xxs;
+    int64_t layer_q50;
     int64_t layer_q80;
+    int64_t layer_iq1s;
+    int64_t layer_iq4nl;
+    int64_t layer_iq1m;

     model_params() :
-        input_f32 (0),
-        input_f16 (0),
-        input_q4k (0),
-        input_q50 (0),
-        input_q5k (0),
-        input_q6k (0),
-        input_q80 (0),
-        output_f32(0),
-        output_f16(0),
-        output_q4k(0),
-        output_q50(0),
-        output_q5k(0),
-        output_q6k(0),
-        output_q80(0),
-        layer_f32 (0),
-        layer_f16 (0),
-        layer_q4k (0),
-        layer_q50 (0),
-        layer_q5k (0),
-        layer_q6k (0),
-        layer_q80 (0) {}
+        input_f32     (0),
+        input_f16     (0),
+        input_q2k     (0),
+        input_q4k     (0),
+        input_q5k     (0),
+        input_q6k     (0),
+        input_iq2xxs  (0),
+        input_q50     (0),
+        input_q80     (0),
+        input_iq1s    (0),
+        input_iq4nl   (0),
+        input_iq1m    (0),
+        output_f32    (0),
+        output_f16    (0),
+        output_q2k    (0),
+        output_q4k    (0),
+        output_q5k    (0),
+        output_q6k    (0),
+        output_iq2xxs (0),
+        output_q50    (0),
+        output_q80    (0),
+        output_iq1s   (0),
+        output_iq4nl  (0),
+        output_iq1m   (0),
+        layer_f32     (0),
+        layer_f16     (0),
+        layer_q2k     (0),
+        layer_q4k     (0),
+        layer_q5k     (0),
+        layer_q6k     (0),
+        layer_iq2xxs  (0),
+        layer_q50     (0),
+        layer_q80     (0),
+        layer_iq1s    (0),
+        layer_iq4nl   (0),
+        layer_iq1m    (0)
+    {}
 };

 struct model_bytes {
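Each new quant type now costs one field in cpu_props plus one per GPU backend, plus matching constructor entries and switch arms elsewhere. A hedged alternative, not part of this patch, would be to key measured throughput by the ggml_type id itself, so a future type is a one-line addition:

    #include <array>
    #include "ggml.h" // for ggml_type / GGML_TYPE_COUNT

    // Sketch only: per-type GFLOPS indexed by ggml_type, zero-initialized like
    // the constructors above; one instance per backend (cpu / metal / cuda).
    struct backend_flops {
        std::array<float, GGML_TYPE_COUNT> gflops{};
    };

    // usage (device_cpu_flops is the profiler helper used later in this patch):
    // cpu.gflops[GGML_TYPE_IQ1_S] = device_cpu_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads);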
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 962dc032..4af68abc 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -385,12 +385,12 @@ extern "C" {
         GGML_TYPE_F64     = 28,
         GGML_TYPE_IQ1_M   = 29,
         GGML_TYPE_BF16    = 30,
-        GGML_TYPE_Q4_0_4_4 = 31,
-        GGML_TYPE_Q4_0_4_8 = 32,
-        GGML_TYPE_Q4_0_8_8 = 33,
+        // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // GGML_TYPE_Q4_0_4_8 = 32,
+        // GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0   = 34,
         GGML_TYPE_TQ2_0   = 35,
-        GGML_TYPE_COUNT,
+        GGML_TYPE_COUNT = 39,
     };

     // precision
@@ -431,9 +431,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };

     // available tensor operations:
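Pinning GGML_TYPE_COUNT = 39 is load-bearing: with enumerators 31-33 commented out, a bare GGML_TYPE_COUNT would follow GGML_TYPE_TQ2_0 = 35 and silently collapse to 36, shrinking every array sized with it (such as type_traits[GGML_TYPE_COUNT] below) and destabilizing the numeric type ids stored in gguf files. A guard along these lines, not in the patch, would catch accidental renumbering:

    // Hypothetical compile-time check, placed in any translation unit:
    static_assert(GGML_TYPE_COUNT == 39,
                  "ggml_type ids are serialized in gguf files and must not be renumbered");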
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7aa6dce8..1c57cb95 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -15725,15 +15725,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
             {
                 VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
             } break;
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-            {
-                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
-            } break;
-        case GGML_TYPE_Q4_0_8_8:
-            {
-                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
-            } break;
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 73426a5d..ffae7f2e 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1076,54 +1076,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
     },
-    [GGML_TYPE_Q4_0_4_4] = {
-        .type_name            = "q4_0_4x4",
-        .blck_size            = QK4_0,
-        .blck_size_interleave = 4,
-        .type_size            = sizeof(block_q4_0),
-        .is_quantized         = true,
-        .to_float             = NULL,
-        .from_float           = NULL,
-        .from_float_ref       = NULL,
-        .vec_dot              = NULL,
-        .vec_dot_type         = GGML_TYPE_Q8_0,
-        .nrows                = 1,
-        .ncols                = 4,
-        .gemv                 = ggml_gemv_q4_0_4x4_q8_0,
-        .gemm                 = ggml_gemm_q4_0_4x4_q8_0,
-    },
-    [GGML_TYPE_Q4_0_4_8] = {
-        .type_name            = "q4_0_4x8",
-        .blck_size            = QK4_0,
-        .blck_size_interleave = 8,
-        .type_size            = sizeof(block_q4_0),
-        .is_quantized         = true,
-        .to_float             = NULL,
-        .from_float           = NULL,
-        .from_float_ref       = NULL,
-        .vec_dot              = NULL,
-        .vec_dot_type         = GGML_TYPE_Q8_0,
-        .nrows                = 1,
-        .ncols                = 4,
-        .gemv                 = ggml_gemv_q4_0_4x8_q8_0,
-        .gemm                 = ggml_gemm_q4_0_4x8_q8_0,
-    },
-    [GGML_TYPE_Q4_0_8_8] = {
-        .type_name            = "q4_0_8x8",
-        .blck_size            = QK4_0,
-        .blck_size_interleave = 8,
-        .type_size            = sizeof(block_q4_0),
-        .is_quantized         = true,
-        .to_float             = NULL,
-        .from_float           = NULL,
-        .from_float_ref       = NULL,
-        .vec_dot              = NULL,
-        .vec_dot_type         = GGML_TYPE_Q8_0,
-        .nrows                = 1,
-        .ncols                = 8,
-        .gemv                 = ggml_gemv_q4_0_8x8_q8_0,
-        .gemm                 = ggml_gemm_q4_0_8x8_q8_0,
-    },
     [GGML_TYPE_TQ1_0] = {
         .type_name = "tq1_0",
         .blck_size = QK_K,
@@ -3472,7 +3424,7 @@ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
 double ggml_type_sizef(enum ggml_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
- 
+
 const char * ggml_type_name(enum ggml_type type) {
     return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
 }
@@ -3578,9 +3530,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
         case GGML_FTYPE_MOSTLY_IQ3_S:  wtype = GGML_TYPE_IQ3_S;  break;
         case GGML_FTYPE_MOSTLY_IQ2_S:  wtype = GGML_TYPE_IQ2_S;  break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
         case GGML_FTYPE_UNKNOWN:       wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -4107,7 +4056,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
-        ///*.padding      =*/ { 0 },
+        // /*.padding      =*/ { 0 },
     };

 #ifdef __clang__
@@ -9517,9 +9466,6 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_add_q_f32(params, dst);
             } break;
@@ -9897,9 +9843,6 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_add1_q_f32(params, dst);
             } break;
@@ -10027,9 +9970,6 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         default:
             {
                 GGML_ABORT("fatal error");
@@ -13093,9 +13033,6 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_out_prod_q_f32(params, dst);
             } break;
@@ -13283,9 +13220,6 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         default:
             {
                 GGML_ABORT("fatal error");
@@ -13547,9 +13481,6 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_get_rows_q(params, dst);
             } break;
@@ -14139,9 +14070,6 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
         case GGML_TYPE_Q8_K:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -21941,9 +21869,6 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
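One side effect worth knowing: because type_traits[] uses designated initializers, deleting the three entries leaves slots 31-33 zero-filled, so ggml_type_name() now returns NULL rather than "NONE" for those ids (the type < GGML_TYPE_COUNT check still passes). A caller printing names of ids read from old files may want a guard; a minimal sketch, where stored_type_id stands in for a raw id from an old gguf file:

    #include <cstdio>
    #include "ggml.h"

    // Defensive lookup: NULL means the slot exists but its traits were removed.
    static void print_type_name(int stored_type_id) {
        const char * name = ggml_type_name((enum ggml_type) stored_type_id);
        printf("tensor type: %s\n", name ? name : "(removed type)");
    }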
diff --git a/include/llama.h b/include/llama.h
index 5c14d2a3..7d7392fe 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -165,18 +165,18 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q2_K_S   = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XS   = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS  = 23, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_S    = 24, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S    = 24, // except 1d tensors, 1-bit quantization
         LLAMA_FTYPE_MOSTLY_IQ4_NL   = 25, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_S    = 26, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_M    = 27, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_S    = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M    = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS   = 30, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_M    = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M    = 31, // except 1d tensors, 1-bit quantization
         LLAMA_FTYPE_MOSTLY_BF16     = 32, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
+        // LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
+        // LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0    = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0    = 37, // except 1d tensors
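The retired ids 33-35 can still appear in pre-existing gguf files. As changed below, the loader now routes such files to its unknown-type warning path; a loader that wanted to keep them working could instead coerce them to plain Q4_0 and rely on runtime repacking. A sketch of that alternative, not this patch's behavior:

    #include "llama.h"

    // Hypothetical normalization of a raw file-type value read from an old gguf.
    static enum llama_ftype normalize_ftype(int raw) {
        if (raw >= 33 && raw <= 35) {        // former Q4_0_4_4 / Q4_0_4_8 / Q4_0_8_8
            return LLAMA_FTYPE_MOSTLY_Q4_0;  // repacked at load time where supported
        }
        return (enum llama_ftype) raw;
    }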
diff --git a/src/llama.cpp b/src/llama.cpp
index 88e13e5a..1aedb6a4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3560,16 +3560,26 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
         case GGML_TYPE_F32:
         case GGML_TYPE_F16:
             return true;
+        case GGML_TYPE_Q2_K:
+            return n_params->layer_q2k    > 0 || n_params->output_q2k    > 0;
         case GGML_TYPE_Q4_K:
-            return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
-        case GGML_TYPE_Q5_0:
-            return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
+            return n_params->layer_q4k    > 0 || n_params->output_q4k    > 0;
         case GGML_TYPE_Q5_K:
-            return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
+            return n_params->layer_q5k    > 0 || n_params->output_q5k    > 0;
         case GGML_TYPE_Q6_K:
-            return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
+            return n_params->layer_q6k    > 0 || n_params->output_q6k    > 0;
+        case GGML_TYPE_IQ2_XXS:
+            return n_params->layer_iq2xxs > 0 || n_params->output_iq2xxs > 0;
+        case GGML_TYPE_Q5_0:
+            return n_params->layer_q50    > 0 || n_params->output_q50    > 0;
         case GGML_TYPE_Q8_0:
-            return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
+            return n_params->layer_q80    > 0 || n_params->output_q80    > 0;
+        case GGML_TYPE_IQ1_S:
+            return n_params->layer_iq1s   > 0 || n_params->output_iq1s   > 0;
+        case GGML_TYPE_IQ4_NL:
+            return n_params->layer_iq4nl  > 0 || n_params->output_iq4nl  > 0;
+        case GGML_TYPE_IQ1_M:
+            return n_params->layer_iq1m   > 0 || n_params->output_iq1m   > 0;
         default:
             throw std::runtime_error("Unrecognized data type\n");
     }
@@ -3650,18 +3660,18 @@ void llama_profile_device(
         dev_info->gpu_props.cuda_flops_f16_f32  = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
     }

+    if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) {
+        dev_info->cpu_props.flops_q2k_f32       = device_cpu_flops  (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_q2k_f32  = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
+    }
+
     if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) {
         dev_info->cpu_props.flops_q4k_f32       = device_cpu_flops  (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
         dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     }

-    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
-        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
-        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-        dev_info->gpu_props.cuda_flops_q50_f32  = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-    }
-
     if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) {
         dev_info->cpu_props.flops_q5k_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32);
@@ -3674,11 +3684,42 @@ void llama_profile_device(
         dev_info->gpu_props.cuda_flops_q6k_f32  = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
     }

+    if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) {
+        dev_info->cpu_props.flops_iq2xxs_f32       = device_cpu_flops  (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq2xxs_f32 = device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq2xxs_f32  = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
+        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_q50_f32  = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+    }
+
     if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) {
         dev_info->cpu_props.flops_q80_f32       = device_cpu_flops  (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
         dev_info->gpu_props.cuda_flops_q80_f32  = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
     }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) {
+        dev_info->cpu_props.flops_iq1s_f32       = device_cpu_flops  (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq1s_f32 = device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq1s_f32  = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) {
+        dev_info->cpu_props.flops_iq4nl_f32       = device_cpu_flops  (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq4nl_f32 = device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq4nl_f32  = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) {
+        dev_info->cpu_props.flops_iq1m_f32       = device_cpu_flops  (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq1m_f32 = device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq1m_f32  = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+    }
 }

 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@@ -4844,9 +4885,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
             case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
             case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
-            case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
-            case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
-            case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
+
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -5654,9 +5693,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";

         default: return "unknown, may not work";
     }
@@ -18997,10 +19033,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
         new_type = GGML_TYPE_IQ3_S;
     }
-    else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
-             new_type == GGML_TYPE_Q4_0_8_8) {
-        new_type = GGML_TYPE_Q4_0;
-    }
     else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
         new_type = GGML_TYPE_Q4_K;
     }
@@ -19323,10 +19355,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  default_type = GGML_TYPE_IQ3_S;  break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
-
+
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -19646,14 +19675,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 f32_data = (float *) f32_conv_buf.data();
             }

-            int chunk_size_multiplier = 1;
-            if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
-                if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
-                else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
-                if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-                else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-            }
-
             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
@@ -19666,8 +19687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const int64_t nrows = tensor->ne[1];

             static const int64_t min_chunk_size = 32 * 512;
-            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
-                                       chunk_size_multiplier;
+            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

             const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
             const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -21049,25 +21069,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
         case PROFILER_LAYER_OUTPUT:
             switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_flops->output_f32_f32 += n;
+                    n_flops->output_f32_f32    += n;
                     break;
                 case GGML_TYPE_F16:
-                    n_flops->output_f16_f32 += n;
+                    n_flops->output_f16_f32    += n;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_flops->output_q2k_f32    += n;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_flops->output_q4k_f32 += n;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->output_q50_f32 += n;
+                    n_flops->output_q4k_f32    += n;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_flops->output_q5k_f32 += n;
+                    n_flops->output_q5k_f32    += n;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_flops->output_q6k_f32 += n;
+                    n_flops->output_q6k_f32    += n;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->output_iq2xxs_f32 += n;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->output_q50_f32    += n;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_flops->output_q80_f32 += n;
+                    n_flops->output_q80_f32    += n;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_flops->output_iq1s_f32   += n;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_flops->output_iq4nl_f32  += n;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->output_iq1m_f32   += n;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@@ -21075,27 +21110,42 @@
             break;

         case PROFILER_LAYER_BACKEND:
-            switch (dtype) { 
+            switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_flops->layer_f32_f32 += n;
+                    n_flops->layer_f32_f32    += n;
                     break;
                 case GGML_TYPE_F16:
-                    n_flops->layer_f16_f32 += n;
+                    n_flops->layer_f16_f32    += n;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_flops->layer_q2k_f32    += n;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_flops->layer_q4k_f32 += n;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->layer_q50_f32 += n;
+                    n_flops->layer_q4k_f32    += n;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_flops->layer_q5k_f32 += n;
+                    n_flops->layer_q5k_f32    += n;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_flops->layer_q6k_f32 += n;
+                    n_flops->layer_q6k_f32    += n;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->layer_iq2xxs_f32 += n;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->layer_q50_f32    += n;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_flops->layer_q80_f32 += n;
+                    n_flops->layer_q80_f32    += n;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_flops->layer_iq1s_f32   += n;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_flops->layer_iq4nl_f32  += n;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->layer_iq1m_f32   += n;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@@ -21113,25 +21163,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
         case PROFILER_LAYER_INPUT:
             switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_params->input_f32 += n_i64t;
+                    n_params->input_f32    += n_i64t;
                     break;
                 case GGML_TYPE_F16:
-                    n_params->input_f16 += n_i64t;
+                    n_params->input_f16    += n_i64t;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->input_q2k    += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_params->input_q4k += n_i64t;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->input_q50 += n_i64t;
+                    n_params->input_q4k    += n_i64t;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_params->input_q5k += n_i64t;
+                    n_params->input_q5k    += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_params->input_q6k += n_i64t;
+                    n_params->input_q6k    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->input_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->input_q50    += n_i64t;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_params->input_q80 += n_i64t;
+                    n_params->input_q80    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->input_iq1s   += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->input_iq4nl  += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->input_iq1m   += n_i64t;
                     break;
                 default:
-                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_INPUT\n");
@@ -21141,25 +21206,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
         case PROFILER_LAYER_OUTPUT:
             switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_params->output_f32 += n_i64t;
+                    n_params->output_f32    += n_i64t;
                     break;
                 case GGML_TYPE_F16:
-                    n_params->output_f16 += n_i64t;
+                    n_params->output_f16    += n_i64t;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->output_q2k    += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_params->output_q4k += n_i64t;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->output_q50 += n_i64t;
+                    n_params->output_q4k    += n_i64t;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_params->output_q5k += n_i64t;
+                    n_params->output_q5k    += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_params->output_q6k += n_i64t;
+                    n_params->output_q6k    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->output_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->output_q50    += n_i64t;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_params->output_q80 += n_i64t;
+                    n_params->output_q80    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->output_iq1s   += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->output_iq4nl  += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->output_iq1m   += n_i64t;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@@ -21169,25 +21249,40 @@
         case PROFILER_LAYER_BACKEND:
             switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_params->layer_f32 += n_i64t;
+                    n_params->layer_f32    += n_i64t;
                     break;
                 case GGML_TYPE_F16:
-                    n_params->layer_f16 += n_i64t;
+                    n_params->layer_f16    += n_i64t;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->layer_q2k    += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_params->layer_q4k += n_i64t;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->layer_q50 += n_i64t;
+                    n_params->layer_q4k    += n_i64t;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_params->layer_q5k += n_i64t;
+                    n_params->layer_q5k    += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_params->layer_q6k += n_i64t;
+                    n_params->layer_q6k    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->layer_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->layer_q50    += n_i64t;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_params->layer_q80 += n_i64t;
+                    n_params->layer_q80    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->layer_iq1s   += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->layer_iq4nl  += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->layer_iq1m   += n_i64t;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@@ -21477,23 +21572,33 @@ void llama_model_n_flops(
     }

     // use average values instead of total values
-    n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
-    n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
-    n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
-    n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
-    n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
-    n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
-    n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
-
-    n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
-    n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
-    n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
-    n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer);
-    n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
-    n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
-    n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
-
-    n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
+    n_flops->layer_f32_f32    = static_cast<int64_t>((double)n_flops->layer_f32_f32    / (double)n_layer);
+    n_flops->layer_f16_f32    = static_cast<int64_t>((double)n_flops->layer_f16_f32    / (double)n_layer);
+    n_flops->layer_q2k_f32    = static_cast<int64_t>((double)n_flops->layer_q2k_f32    / (double)n_layer);
+    n_flops->layer_q4k_f32    = static_cast<int64_t>((double)n_flops->layer_q4k_f32    / (double)n_layer);
+    n_flops->layer_q5k_f32    = static_cast<int64_t>((double)n_flops->layer_q5k_f32    / (double)n_layer);
+    n_flops->layer_q6k_f32    = static_cast<int64_t>((double)n_flops->layer_q6k_f32    / (double)n_layer);
+    n_flops->layer_iq2xxs_f32 = static_cast<int64_t>((double)n_flops->layer_iq2xxs_f32 / (double)n_layer);
+    n_flops->layer_q50_f32    = static_cast<int64_t>((double)n_flops->layer_q50_f32    / (double)n_layer);
+    n_flops->layer_q80_f32    = static_cast<int64_t>((double)n_flops->layer_q80_f32    / (double)n_layer);
+    n_flops->layer_iq1s_f32   = static_cast<int64_t>((double)n_flops->layer_iq1s_f32   / (double)n_layer);
+    n_flops->layer_iq4nl_f32  = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32  / (double)n_layer);
+    n_flops->layer_iq1m_f32   = static_cast<int64_t>((double)n_flops->layer_iq1m_f32   / (double)n_layer);
+
+    n_params->layer_f32    = static_cast<int64_t>((double)n_params->layer_f32    / (double)n_layer);
+    n_params->layer_f16    = static_cast<int64_t>((double)n_params->layer_f16    / (double)n_layer);
+    n_params->layer_q2k    = static_cast<int64_t>((double)n_params->layer_q2k    / (double)n_layer);
+    n_params->layer_q4k    = static_cast<int64_t>((double)n_params->layer_q4k    / (double)n_layer);
+    n_params->layer_q5k    = static_cast<int64_t>((double)n_params->layer_q5k    / (double)n_layer);
+    n_params->layer_q6k    = static_cast<int64_t>((double)n_params->layer_q6k    / (double)n_layer);
+    n_params->layer_iq2xxs = static_cast<int64_t>((double)n_params->layer_iq2xxs / (double)n_layer);
+    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
+    n_params->layer_q80    = static_cast<int64_t>((double)n_params->layer_q80    / (double)n_layer);
+    n_params->layer_iq1s   = static_cast<int64_t>((double)n_params->layer_iq1s   / (double)n_layer);
+    n_params->layer_iq4nl  = static_cast<int64_t>((double)n_params->layer_iq4nl  / (double)n_layer);
+    n_params->layer_iq1m   = static_cast<int64_t>((double)n_params->layer_iq1m   / (double)n_layer);
+
+    n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);

     // reset ml, model, and clear contexts
     ml->n_created = 0;
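The per-layer averages above truncate toward zero, since static_cast<int64_t> drops the fractional part of the double division; with per-layer FLOP and parameter counts in the millions to billions, the bias is negligible. A round-to-nearest variant, shown as a sketch rather than the patch's behavior, would be:

    #include <cmath>
    #include <cstdint>

    // Round-to-nearest per-layer average (the patch truncates toward zero instead).
    static int64_t per_layer_avg(int64_t total, uint32_t n_layer) {
        return (int64_t) llround((double) total / (double) n_layer);
    }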