Added support for IQ1_M and IQ2_XXS quantization types

leeetao  2025-03-07 16:56:16 +00:00
parent 230c68b80c
commit 45ec52c2cb
4 changed files with 555 additions and 336 deletions
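
In short, this change threads two more GGML quantization types (GGML_TYPE_IQ2_XXS and GGML_TYPE_IQ1_M) through the device profiler: new FLOPS fields in cpu_props/gpu_props, new counters in model_flops/model_params, the dtype-existence check, per-device FLOPS probing, FLOP/parameter counting, and the per-layer averaging. Each new term uses the same scaling as the existing types. As an illustrative sketch only (hypothetical numbers, not measured values from this repo), the snippet below spells out the arithmetic behind one term of t_calc_cpu in the first hunk: absolute FLOPs divided by throughput in GFLOPS times 1e9 gives seconds, and multiplying by 1000 gives milliseconds.

    #include <cstdio>

    int main() {
        const double EPS = 1e-9;            // same role as EPS in the profiler: avoid division by zero
        double layer_iq2xxs_f32 = 2e9;      // hypothetical FLOPs of IQ2_XXS x F32 mat-muls in one layer
        double flops_iq2xxs_f32 = 50.0;     // hypothetical CPU throughput for that kernel, in GFLOPS
        double t_ms = layer_iq2xxs_f32 / (flops_iq2xxs_f32 * 1e9 + EPS) * 1000;
        std::printf("estimated per-layer IQ2_XXS time: %.2f ms\n", t_ms);  // prints 40.00 ms
        return 0;
    }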


@@ -901,16 +901,18 @@ static bool assign_layers_to_device(
         float t_read_ram_cpu = 0.0f;
         float t_calc_cpu = (
             master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
             master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
             master.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
             master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
             master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
             master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
+            master.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
             master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
             master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
             master.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) +
-            master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+            master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) +
+            master.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS)) * 1000; // in ms
         float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms
         // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
@@ -925,31 +927,35 @@ static bool assign_layers_to_device(
         if (dev.gpu_support.metal) {
             t_calc_gpu = (
                 master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
                 master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q2k_f32 / (dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
+                master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) +
                 master.model_flops.layer_iq1s_f32 / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) +
-                master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+                master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS) +
+                master.model_flops.layer_iq1m_f32 / (dev.gpu_props.metal_flops_iq1m_f32 * 1e9 + EPS)) * 1000; // in ms
             t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
             // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
         } else {
             t_calc_gpu = (
                 master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
                 master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q2k_f32 / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
+                master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
                 master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) +
                 master.model_flops.layer_iq1s_f32 / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) +
-                master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+                master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS) +
+                master.model_flops.layer_iq1m_f32 / (dev.gpu_props.cuda_flops_iq1m_f32 * 1e9 + EPS)) * 1000; // in ms
             t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
             // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
@@ -1125,17 +1131,18 @@ static bool assign_layers_to_device(
         if (m == 0) {
             kappa = (
                 dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
                 dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
                 dev.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
                 dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
                 dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
                 dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
                 dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
                 dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
                 dev.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+                dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS)) * 1000; // in ms
             // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
             kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms

File diff suppressed because it is too large.


@@ -15,16 +15,18 @@ struct cpu_props {
     const char * name;
     const char * description;
     uint32_t cores;
     float flops_f32_f32;    // in GFLOPS
     float flops_f16_f32;    // in GFLOPS
     float flops_q2k_f32;    // in GFLOPS
     float flops_q4k_f32;    // in GFLOPS
     float flops_q5k_f32;    // in GFLOPS
     float flops_q6k_f32;    // in GFLOPS
+    float flops_iq2xxs_f32; // in GFLOPS
     float flops_q50_f32;    // in GFLOPS
     float flops_q80_f32;    // in GFLOPS
     float flops_iq1s_f32;   // in GFLOPS
     float flops_iq4nl_f32;  // in GFLOPS
+    float flops_iq1m_f32;   // in GFLOPS

     cpu_props()
         : name (""),
@@ -36,10 +38,12 @@ struct cpu_props {
           flops_q4k_f32   (0.0f),
           flops_q5k_f32   (0.0f),
           flops_q6k_f32   (0.0f),
+          flops_iq2xxs_f32(0.0f),
           flops_q50_f32   (0.0f),
           flops_q80_f32   (0.0f),
           flops_iq1s_f32  (0.0f),
-          flops_iq4nl_f32 (0.0f)
+          flops_iq4nl_f32 (0.0f),
+          flops_iq1m_f32  (0.0f)
     {}
 };
@@ -84,32 +88,36 @@ struct gpu_support {
 struct gpu_props {
     const char * name;
     const char * description;
     float memory_free;            // in GiB
     float memory_total;           // in GiB
     float metal_read_vram_bw;     // in GB/s
     float metal_flops_f32_f32;    // in GFLOPS
     float metal_flops_f16_f32;    // in GFLOPS
     float metal_flops_q2k_f32;    // in GFLOPS
     float metal_flops_q4k_f32;    // in GFLOPS
     float metal_flops_q5k_f32;    // in GFLOPS
     float metal_flops_q6k_f32;    // in GFLOPS
+    float metal_flops_iq2xxs_f32; // in GFLOPS
     float metal_flops_q50_f32;    // in GFLOPS
     float metal_flops_q80_f32;    // in GFLOPS
     float metal_flops_iq1s_f32;   // in GFLOPS
     float metal_flops_iq4nl_f32;  // in GFLOPS
+    float metal_flops_iq1m_f32;   // in GFLOPS
     float metal_mem_cpy_delay;    // in ms
     float cuda_read_vram_bw;      // in GB/s
     float cuda_flops_f32_f32;     // in GFLOPS
     float cuda_flops_f16_f32;     // in GFLOPS
     float cuda_flops_q2k_f32;     // in GFLOPS
     float cuda_flops_q4k_f32;     // in GFLOPS
     float cuda_flops_q5k_f32;     // in GFLOPS
     float cuda_flops_q6k_f32;     // in GFLOPS
+    float cuda_flops_iq2xxs_f32;  // in GFLOPS
     float cuda_flops_q50_f32;     // in GFLOPS
     float cuda_flops_q80_f32;     // in GFLOPS
     float cuda_flops_iq1s_f32;    // in GFLOPS
     float cuda_flops_iq4nl_f32;   // in GFLOPS
+    float cuda_flops_iq1m_f32;    // in GFLOPS
     float cuda_mem_cpy_delay;     // in ms

     gpu_props() :
         name (""),
@@ -123,10 +131,12 @@ struct gpu_props {
         metal_flops_q4k_f32 (0.0f),
         metal_flops_q5k_f32 (0.0f),
         metal_flops_q6k_f32 (0.0f),
+        metal_flops_iq2xxs_f32 (0.0f),
         metal_flops_q50_f32 (0.0f),
         metal_flops_q80_f32 (0.0f),
         metal_flops_iq1s_f32 (0.0f),
         metal_flops_iq4nl_f32 (0.0f),
+        metal_flops_iq1m_f32 (0.0f),
         metal_mem_cpy_delay (0.0f),
         cuda_read_vram_bw (0.0f),
         cuda_flops_f32_f32 (0.0f),
@@ -135,10 +145,12 @@ struct gpu_props {
         cuda_flops_q4k_f32 (0.0f),
         cuda_flops_q5k_f32 (0.0f),
         cuda_flops_q6k_f32 (0.0f),
+        cuda_flops_iq2xxs_f32 (0.0f),
         cuda_flops_q50_f32 (0.0f),
         cuda_flops_q80_f32 (0.0f),
         cuda_flops_iq1s_f32 (0.0f),
         cuda_flops_iq4nl_f32 (0.0f),
+        cuda_flops_iq1m_f32 (0.0f),
         cuda_mem_cpy_delay (0.0f) {}
 };
@@ -150,43 +162,52 @@ struct model_flops {
     int64_t output_q4k_f32;
     int64_t output_q5k_f32;
     int64_t output_q6k_f32;
+    int64_t output_iq2xxs_f32;
     int64_t output_q50_f32;
     int64_t output_q80_f32;
     int64_t output_iq1s_f32;
     int64_t output_iq4nl_f32;
+    int64_t output_iq1m_f32;

     int64_t layer_f32_f32;
     int64_t layer_f16_f32;
     int64_t layer_q2k_f32;
     int64_t layer_q4k_f32;
     int64_t layer_q5k_f32;
     int64_t layer_q6k_f32;
+    int64_t layer_iq2xxs_f32;
     int64_t layer_q50_f32;
     int64_t layer_q80_f32;
     int64_t layer_iq1s_f32;
     int64_t layer_iq4nl_f32;
+    int64_t layer_iq1m_f32;

     model_flops() :
         inp_embd_ms (0.0f),
         output_f32_f32 (0),
         output_f16_f32 (0),
         output_q2k_f32 (0),
         output_q4k_f32 (0),
         output_q5k_f32 (0),
         output_q6k_f32 (0),
+        output_iq2xxs_f32 (0),
         output_q50_f32 (0),
         output_q80_f32 (0),
         output_iq1s_f32 (0),
         output_iq4nl_f32 (0),
+        output_iq1m_f32 (0),
         layer_f32_f32 (0),
         layer_f16_f32 (0),
         layer_q2k_f32 (0),
         layer_q4k_f32 (0),
         layer_q5k_f32 (0),
         layer_q6k_f32 (0),
+        layer_iq2xxs_f32 (0),
         layer_q50_f32 (0),
         layer_q80_f32 (0),
         layer_iq1s_f32 (0),
-        layer_iq4nl_f32 (0) {}
+        layer_iq4nl_f32 (0),
+        layer_iq1m_f32 (0)
+    {}
 };

 struct model_params {
@@ -196,62 +217,75 @@ struct model_params {
     int64_t input_q4k;
     int64_t input_q5k;
     int64_t input_q6k;
+    int64_t input_iq2xxs;
     int64_t input_q50;
     int64_t input_q80;
     int64_t input_iq1s;
     int64_t input_iq4nl;
+    int64_t input_iq1m;

     int64_t output_f32;
     int64_t output_f16;
     int64_t output_q2k;
     int64_t output_q4k;
     int64_t output_q5k;
     int64_t output_q6k;
+    int64_t output_iq2xxs;
     int64_t output_q50;
     int64_t output_q80;
     int64_t output_iq1s;
     int64_t output_iq4nl;
+    int64_t output_iq1m;

     int64_t layer_f32;
     int64_t layer_f16;
     int64_t layer_q2k;
     int64_t layer_q4k;
     int64_t layer_q5k;
     int64_t layer_q6k;
+    int64_t layer_iq2xxs;
     int64_t layer_q50;
     int64_t layer_q80;
     int64_t layer_iq1s;
     int64_t layer_iq4nl;
+    int64_t layer_iq1m;

     model_params() :
         input_f32 (0),
         input_f16 (0),
         input_q2k (0),
         input_q4k (0),
         input_q5k (0),
         input_q6k (0),
+        input_iq2xxs (0),
         input_q50 (0),
         input_q80 (0),
         input_iq1s (0),
         input_iq4nl (0),
+        input_iq1m (0),
         output_f32 (0),
         output_f16 (0),
         output_q2k (0),
         output_q4k (0),
         output_q5k (0),
         output_q6k (0),
+        output_iq2xxs (0),
         output_q50 (0),
         output_q80 (0),
         output_iq1s (0),
         output_iq4nl (0),
+        output_iq1m (0),
         layer_f32 (0),
         layer_f16 (0),
         layer_q2k (0),
         layer_q4k (0),
         layer_q5k (0),
         layer_q6k (0),
+        layer_iq2xxs (0),
         layer_q50 (0),
         layer_q80 (0),
         layer_iq1s (0),
-        layer_iq4nl (0) {}
+        layer_iq4nl (0),
+        layer_iq1m (0)
+    {}
 };

 struct model_bytes {


@@ -3560,21 +3560,25 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
         case GGML_TYPE_F16:
             return true;
         case GGML_TYPE_Q2_K:
            return n_params->layer_q2k > 0 || n_params->output_q2k > 0;
         case GGML_TYPE_Q4_K:
            return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
         case GGML_TYPE_Q5_K:
            return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
         case GGML_TYPE_Q6_K:
            return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
+        case GGML_TYPE_IQ2_XXS:
+           return n_params->layer_iq2xxs > 0 || n_params->output_iq2xxs > 0;
         case GGML_TYPE_Q5_0:
            return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
         case GGML_TYPE_Q8_0:
            return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
         case GGML_TYPE_IQ1_S:
            return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0;
         case GGML_TYPE_IQ4_NL:
            return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0;
+        case GGML_TYPE_IQ1_M:
+           return n_params->layer_iq1m > 0 || n_params->output_iq1m > 0;
         default:
            throw std::runtime_error("Unrecognized data type\n");
     }
@@ -3679,6 +3683,12 @@ void llama_profile_device(
         dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
     }

+    if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) {
+        dev_info->cpu_props.flops_iq2xxs_f32       = device_cpu_flops  (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq2xxs_f32 = device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq2xxs_f32  = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+    }
+
     if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
         dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
@@ -3703,6 +3713,12 @@ void llama_profile_device(
         dev_info->gpu_props.metal_flops_iq4nl_f32 = device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
         dev_info->gpu_props.cuda_flops_iq4nl_f32  = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
     }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) {
+        dev_info->cpu_props.flops_iq1m_f32       = device_cpu_flops  (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq1m_f32 = device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq1m_f32  = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+    }
 }

 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@@ -21049,34 +21065,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
         case PROFILER_LAYER_OUTPUT:
             switch (dtype) {
                 case GGML_TYPE_F32:
                     n_flops->output_f32_f32 += n;
                     break;
                 case GGML_TYPE_F16:
                     n_flops->output_f16_f32 += n;
                     break;
                 case GGML_TYPE_Q2_K:
                     n_flops->output_q2k_f32 += n;
                     break;
                 case GGML_TYPE_Q4_K:
                     n_flops->output_q4k_f32 += n;
                     break;
                 case GGML_TYPE_Q5_K:
                     n_flops->output_q5k_f32 += n;
                     break;
                 case GGML_TYPE_Q6_K:
                     n_flops->output_q6k_f32 += n;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->output_iq2xxs_f32 += n;
+                    break;
                 case GGML_TYPE_Q5_0:
                     n_flops->output_q50_f32 += n;
                     break;
                 case GGML_TYPE_Q8_0:
                     n_flops->output_q80_f32 += n;
                     break;
                 case GGML_TYPE_IQ1_S:
                     n_flops->output_iq1s_f32 += n;
                     break;
                 case GGML_TYPE_IQ4_NL:
                     n_flops->output_iq4nl_f32 += n;
                     break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->output_iq1m_f32 += n;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@@ -21086,34 +21108,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
         case PROFILER_LAYER_BACKEND:
             switch (dtype) {
                 case GGML_TYPE_F32:
                     n_flops->layer_f32_f32 += n;
                     break;
                 case GGML_TYPE_F16:
                     n_flops->layer_f16_f32 += n;
                     break;
                 case GGML_TYPE_Q2_K:
                     n_flops->layer_q2k_f32 += n;
                     break;
                 case GGML_TYPE_Q4_K:
                     n_flops->layer_q4k_f32 += n;
                     break;
                 case GGML_TYPE_Q5_K:
                     n_flops->layer_q5k_f32 += n;
                     break;
                 case GGML_TYPE_Q6_K:
                     n_flops->layer_q6k_f32 += n;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->layer_iq2xxs_f32 += n;
+                    break;
                 case GGML_TYPE_Q5_0:
                     n_flops->layer_q50_f32 += n;
                     break;
                 case GGML_TYPE_Q8_0:
                     n_flops->layer_q80_f32 += n;
                     break;
                 case GGML_TYPE_IQ1_S:
                     n_flops->layer_iq1s_f32 += n;
                     break;
                 case GGML_TYPE_IQ4_NL:
                     n_flops->layer_iq4nl_f32 += n;
                     break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->layer_iq1m_f32 += n;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@@ -21131,34 +21159,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
         case PROFILER_LAYER_INPUT:
             switch (dtype) {
                 case GGML_TYPE_F32:
                     n_params->input_f32 += n_i64t;
                     break;
                 case GGML_TYPE_F16:
                     n_params->input_f16 += n_i64t;
                     break;
                 case GGML_TYPE_Q2_K:
                     n_params->input_q2k += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
                     n_params->input_q4k += n_i64t;
                     break;
                 case GGML_TYPE_Q5_K:
                     n_params->input_q5k += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
                     n_params->input_q6k += n_i64t;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->input_iq2xxs += n_i64t;
+                    break;
                 case GGML_TYPE_Q5_0:
                     n_params->input_q50 += n_i64t;
                     break;
                 case GGML_TYPE_Q8_0:
                     n_params->input_q80 += n_i64t;
                     break;
                 case GGML_TYPE_IQ1_S:
                     n_params->input_iq1s += n_i64t;
                     break;
                 case GGML_TYPE_IQ4_NL:
                     n_params->input_iq4nl += n_i64t;
                     break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->input_iq1m += n_i64t;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@@ -21185,6 +21219,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_Q6_K:
                     n_params->output_q6k += n_i64t;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->output_iq2xxs += n_i64t;
+                    break;
                 case GGML_TYPE_Q5_0:
                     n_params->output_q50 += n_i64t;
                     break;
@@ -21197,6 +21234,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_IQ4_NL:
                     n_params->output_iq4nl += n_i64t;
                     break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->output_iq1m += n_i64t;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
             }
@@ -21222,6 +21262,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_Q6_K:
                     n_params->layer_q6k += n_i64t;
                     break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->layer_iq2xxs += n_i64t;
+                    break;
                 case GGML_TYPE_Q5_0:
                     n_params->layer_q50 += n_i64t;
                     break;
@@ -21234,6 +21277,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_IQ4_NL:
                     n_params->layer_iq4nl += n_i64t;
                     break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->layer_iq1m += n_i64t;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
             }
@@ -21522,27 +21568,31 @@ void llama_model_n_flops(
     }

     // use average values instead of total values
     n_flops->layer_f32_f32   = static_cast<int64_t>((double)n_flops->layer_f32_f32   / (double)n_layer);
     n_flops->layer_f16_f32   = static_cast<int64_t>((double)n_flops->layer_f16_f32   / (double)n_layer);
     n_flops->layer_q2k_f32   = static_cast<int64_t>((double)n_flops->layer_q2k_f32   / (double)n_layer);
     n_flops->layer_q4k_f32   = static_cast<int64_t>((double)n_flops->layer_q4k_f32   / (double)n_layer);
     n_flops->layer_q5k_f32   = static_cast<int64_t>((double)n_flops->layer_q5k_f32   / (double)n_layer);
     n_flops->layer_q6k_f32   = static_cast<int64_t>((double)n_flops->layer_q6k_f32   / (double)n_layer);
+    n_flops->layer_iq2xxs_f32 = static_cast<int64_t>((double)n_flops->layer_iq2xxs_f32 / (double)n_layer);
     n_flops->layer_q50_f32   = static_cast<int64_t>((double)n_flops->layer_q50_f32   / (double)n_layer);
     n_flops->layer_q80_f32   = static_cast<int64_t>((double)n_flops->layer_q80_f32   / (double)n_layer);
     n_flops->layer_iq1s_f32  = static_cast<int64_t>((double)n_flops->layer_iq1s_f32  / (double)n_layer);
     n_flops->layer_iq4nl_f32 = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32 / (double)n_layer);
+    n_flops->layer_iq1m_f32  = static_cast<int64_t>((double)n_flops->layer_iq1m_f32  / (double)n_layer);

     n_params->layer_f32    = static_cast<int64_t>((double)n_params->layer_f32    / (double)n_layer);
     n_params->layer_f16    = static_cast<int64_t>((double)n_params->layer_f16    / (double)n_layer);
     n_params->layer_q2k    = static_cast<int64_t>((double)n_params->layer_q2k    / (double)n_layer);
     n_params->layer_q4k    = static_cast<int64_t>((double)n_params->layer_q4k    / (double)n_layer);
-    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
     n_params->layer_q5k    = static_cast<int64_t>((double)n_params->layer_q5k    / (double)n_layer);
     n_params->layer_q6k    = static_cast<int64_t>((double)n_params->layer_q6k    / (double)n_layer);
+    n_params->layer_iq2xxs = static_cast<int64_t>((double)n_params->layer_iq2xxs / (double)n_layer);
+    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
     n_params->layer_q80    = static_cast<int64_t>((double)n_params->layer_q80    / (double)n_layer);
     n_params->layer_iq1s   = static_cast<int64_t>((double)n_params->layer_iq1s   / (double)n_layer);
     n_params->layer_iq4nl  = static_cast<int64_t>((double)n_params->layer_iq4nl  / (double)n_layer);
+    n_params->layer_iq1m   = static_cast<int64_t>((double)n_params->layer_iq1m   / (double)n_layer);

     n_bytes->nb_layer      = static_cast<int64_t>((double)n_bytes->nb_layer      / (double)n_layer);