Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-11 04:44:35 +00:00
Added support for the IQ1_M and IQ2_XXS quantization types
parent 230c68b80c
commit 45ec52c2cb

4 changed files with 555 additions and 336 deletions
@@ -901,16 +901,18 @@ static bool assign_layers_to_device(
     float t_read_ram_cpu = 0.0f;

     float t_calc_cpu = (
-        master.model_flops.layer_f32_f32   / (dev.cpu_props.flops_f32_f32   * 1e9 + EPS) +
-        master.model_flops.layer_f16_f32   / (dev.cpu_props.flops_f16_f32   * 1e9 + EPS) +
-        master.model_flops.layer_q2k_f32   / (dev.cpu_props.flops_q2k_f32   * 1e9 + EPS) +
-        master.model_flops.layer_q4k_f32   / (dev.cpu_props.flops_q4k_f32   * 1e9 + EPS) +
-        master.model_flops.layer_q5k_f32   / (dev.cpu_props.flops_q5k_f32   * 1e9 + EPS) +
-        master.model_flops.layer_q6k_f32   / (dev.cpu_props.flops_q6k_f32   * 1e9 + EPS) +
-        master.model_flops.layer_q50_f32   / (dev.cpu_props.flops_q50_f32   * 1e9 + EPS) +
-        master.model_flops.layer_q80_f32   / (dev.cpu_props.flops_q80_f32   * 1e9 + EPS) +
-        master.model_flops.layer_iq1s_f32  / (dev.cpu_props.flops_iq1s_f32  * 1e9 + EPS) +
-        master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+        master.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
+        master.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
+        master.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
+        master.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
+        master.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms

     float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms
     // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
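
Note: the hunk above shows the pattern this commit follows everywhere. The per-layer compute latency is a sum over weight quantization types, each term being the layer's FLOPs in that type divided by the device's measured throughput for that type (profiled in GFLOPS, hence the * 1e9), with EPS guarding against division by zero, and the whole sum scaled to milliseconds. Below is a minimal, self-contained sketch of that model; the struct and field names are simplified stand-ins for the profiles used by assign_layers_to_device, not the project's actual types, and the numbers are made up.

// Minimal sketch of the per-type latency model extended by this commit.
// LayerFlops / DeviceGflops are simplified stand-ins, not prima.cpp's structs.
#include <cstdio>

static const double EPS = 1e-9; // assumed small guard value

struct LayerFlops   { double f32, f16, q4k, iq2xxs, iq1m; }; // FLOPs per layer, by weight type
struct DeviceGflops { double f32, f16, q4k, iq2xxs, iq1m; }; // measured throughput, in GFLOPS

// Estimated compute latency of one layer, in milliseconds.
static double layer_latency_ms(const LayerFlops &w, const DeviceGflops &d) {
    return (w.f32    / (d.f32    * 1e9 + EPS) +
            w.f16    / (d.f16    * 1e9 + EPS) +
            w.q4k    / (d.q4k    * 1e9 + EPS) +
            w.iq2xxs / (d.iq2xxs * 1e9 + EPS) +  // term added for IQ2_XXS
            w.iq1m   / (d.iq1m   * 1e9 + EPS))   // term added for IQ1_M
           * 1000.0;
}

int main() {
    LayerFlops   w{1e8, 5e8, 2e9, 1e9, 5e8}; // hypothetical per-layer FLOP counts
    DeviceGflops d{50, 120, 80, 40, 30};     // hypothetical GFLOPS measurements
    std::printf("estimated layer latency: %.3f ms\n", layer_latency_ms(w, d));
    return 0;
}

Adding a new quantization type therefore means adding one such term per backend (CPU, Metal, CUDA), which is why the same two lines reappear in each hunk of this commit.
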
@@ -925,31 +927,35 @@ static bool assign_layers_to_device(

     if (dev.gpu_support.metal) {
         t_calc_gpu = (
-            master.model_flops.layer_f32_f32   / (dev.gpu_props.metal_flops_f32_f32   * 1e9 + EPS) +
-            master.model_flops.layer_f16_f32   / (dev.gpu_props.metal_flops_f16_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q2k_f32   / (dev.gpu_props.metal_flops_q2k_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q4k_f32   / (dev.gpu_props.metal_flops_q4k_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q5k_f32   / (dev.gpu_props.metal_flops_q5k_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q6k_f32   / (dev.gpu_props.metal_flops_q6k_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q50_f32   / (dev.gpu_props.metal_flops_q50_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q80_f32   / (dev.gpu_props.metal_flops_q80_f32   * 1e9 + EPS) +
-            master.model_flops.layer_iq1s_f32  / (dev.gpu_props.metal_flops_iq1s_f32  * 1e9 + EPS) +
-            master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+            master.model_flops.layer_f32_f32    / (dev.gpu_props.metal_flops_f32_f32    * 1e9 + EPS) +
+            master.model_flops.layer_f16_f32    / (dev.gpu_props.metal_flops_f16_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q2k_f32    / (dev.gpu_props.metal_flops_q2k_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q4k_f32    / (dev.gpu_props.metal_flops_q4k_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q5k_f32    / (dev.gpu_props.metal_flops_q5k_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q6k_f32    / (dev.gpu_props.metal_flops_q6k_f32    * 1e9 + EPS) +
+            master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) +
+            master.model_flops.layer_q50_f32    / (dev.gpu_props.metal_flops_q50_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q80_f32    / (dev.gpu_props.metal_flops_q80_f32    * 1e9 + EPS) +
+            master.model_flops.layer_iq1s_f32   / (dev.gpu_props.metal_flops_iq1s_f32   * 1e9 + EPS) +
+            master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.metal_flops_iq4nl_f32  * 1e9 + EPS) +
+            master.model_flops.layer_iq1m_f32   / (dev.gpu_props.metal_flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms

         t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
         // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
     } else {
         t_calc_gpu = (
-            master.model_flops.layer_f32_f32   / (dev.gpu_props.cuda_flops_f32_f32   * 1e9 + EPS) +
-            master.model_flops.layer_f16_f32   / (dev.gpu_props.cuda_flops_f16_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q2k_f32   / (dev.gpu_props.cuda_flops_q2k_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q4k_f32   / (dev.gpu_props.cuda_flops_q4k_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q5k_f32   / (dev.gpu_props.cuda_flops_q5k_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q6k_f32   / (dev.gpu_props.cuda_flops_q6k_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q50_f32   / (dev.gpu_props.cuda_flops_q50_f32   * 1e9 + EPS) +
-            master.model_flops.layer_q80_f32   / (dev.gpu_props.cuda_flops_q80_f32   * 1e9 + EPS) +
-            master.model_flops.layer_iq1s_f32  / (dev.gpu_props.cuda_flops_iq1s_f32  * 1e9 + EPS) +
-            master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+            master.model_flops.layer_f32_f32    / (dev.gpu_props.cuda_flops_f32_f32    * 1e9 + EPS) +
+            master.model_flops.layer_f16_f32    / (dev.gpu_props.cuda_flops_f16_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q2k_f32    / (dev.gpu_props.cuda_flops_q2k_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q4k_f32    / (dev.gpu_props.cuda_flops_q4k_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q5k_f32    / (dev.gpu_props.cuda_flops_q5k_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q6k_f32    / (dev.gpu_props.cuda_flops_q6k_f32    * 1e9 + EPS) +
+            master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) +
+            master.model_flops.layer_q50_f32    / (dev.gpu_props.cuda_flops_q50_f32    * 1e9 + EPS) +
+            master.model_flops.layer_q80_f32    / (dev.gpu_props.cuda_flops_q80_f32    * 1e9 + EPS) +
+            master.model_flops.layer_iq1s_f32   / (dev.gpu_props.cuda_flops_iq1s_f32   * 1e9 + EPS) +
+            master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.cuda_flops_iq4nl_f32  * 1e9 + EPS) +
+            master.model_flops.layer_iq1m_f32   / (dev.gpu_props.cuda_flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms

         t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
         // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
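
Note: the second hunk makes the same addition in both GPU branches, using the metal_ or cuda_ throughput fields depending on dev.gpu_support.metal. One detail that applies to every term: the + EPS keeps each division finite even when a device reports zero measured throughput for a type, for example a backend that has not been profiled for IQ1_M yet; the estimate then becomes very large rather than undefined, which presumably just makes that device look slow to the layer-assignment heuristic. A quick numerical check with made-up values (EPS is assumed here to be on the order of 1e-9):

// Numerical check of the EPS guard used in every latency term.
// All values here are made up for illustration.
#include <cstdio>

int main() {
    const double EPS   = 1e-9;  // assumed guard value
    const double flops = 5e8;   // FLOPs spent in IQ1_M weights for one layer

    double profiled   = 30.0;   // device measured at 30 GFLOPS for IQ1_M
    double unprofiled = 0.0;    // device with no IQ1_M measurement

    double t_ok  = flops / (profiled   * 1e9 + EPS) * 1000.0; // ~16.7 ms
    double t_bad = flops / (unprofiled * 1e9 + EPS) * 1000.0; // huge but finite

    std::printf("profiled: %.2f ms, unprofiled: %.3g ms\n", t_ok, t_bad);
    return 0;
}
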
@@ -1125,17 +1131,18 @@ static bool assign_layers_to_device(

         if (m == 0) {
             kappa = (
-                dev.model_flops.layer_f32_f32   / (dev.cpu_props.flops_f32_f32   * 1e9 + EPS) +
-                dev.model_flops.layer_f16_f32   / (dev.cpu_props.flops_f16_f32   * 1e9 + EPS) +
-                dev.model_flops.layer_q2k_f32   / (dev.cpu_props.flops_q2k_f32   * 1e9 + EPS) +
-                dev.model_flops.layer_q4k_f32   / (dev.cpu_props.flops_q4k_f32   * 1e9 + EPS) +
-                dev.model_flops.layer_q5k_f32   / (dev.cpu_props.flops_q5k_f32   * 1e9 + EPS) +
-                dev.model_flops.layer_q6k_f32   / (dev.cpu_props.flops_q6k_f32   * 1e9 + EPS) +
-                dev.model_flops.layer_q50_f32   / (dev.cpu_props.flops_q50_f32   * 1e9 + EPS) +
-                dev.model_flops.layer_q80_f32   / (dev.cpu_props.flops_q80_f32   * 1e9 + EPS) +
-                dev.model_flops.layer_iq1s_f32  / (dev.cpu_props.flops_iq1s_f32  * 1e9 + EPS) +
-                dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+                dev.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
+                dev.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
+                dev.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
             // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms

             kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms
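
Note: the third hunk extends the kappa estimate computed when m == 0 in the same way. The trailing context line also shows the unit convention used throughout this function: a byte count divided by a bandwidth given in GB/s (hence the * 1e9), then multiplied by 1000 to express the result in milliseconds. A small worked example with made-up numbers:

// Worked example of the unit convention in the kappa disk-read term:
// bytes / (GB/s * 1e9) gives seconds, * 1000 gives milliseconds.
// The byte count and disk speed are made up for illustration.
#include <cstdio>

int main() {
    double bytes      = 256.0 * 1024 * 1024; // 256 MiB to read from disk
    double disk_speed = 2.5;                 // assumed NVMe-class throughput, in GB/s

    double t_ms = bytes / (disk_speed * 1e9) * 1000.0;
    std::printf("estimated disk read time: %.1f ms\n", t_ms); // about 107 ms
    return 0;
}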