mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-07 15:29:03 +00:00)

Added support for Q2_K, IQ1_S, and IQ4_NL quantization types

parent e2cda4cfa0, commit 2f049b8428
4 changed files with 551 additions and 218 deletions
@@ -901,13 +901,17 @@ static bool assign_layers_to_device(
     float t_read_ram_cpu = 0.0f;

     float t_calc_cpu = (
         master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
         master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
         master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
         master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+        master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
+        master.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) +
+        master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms

     float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms
     // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms

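Note on the hunk above: the per-layer CPU latency estimate is a sum over weight dtypes of FLOPs divided by the measured throughput for that dtype, and the new Q2_K, IQ1_S, and IQ4_NL terms slot into the same sum. Below is a minimal standalone sketch of that pattern; the struct and field names are illustrative stand-ins, not prima.cpp's API.

// Minimal sketch of the latency model used above: sum FLOPs/throughput over
// all weight dtypes present in a layer. Struct and field names here are
// illustrative stand-ins, not prima.cpp's actual definitions.
#include <cstdio>

struct dtype_profile {
    double flops;        // FLOPs contributed by this dtype in one layer
    double gflops_meas;  // measured throughput for this dtype, in GFLOPS
};

static double layer_latency_ms(const dtype_profile * p, int n, double eps = 1e-9) {
    double t = 0.0;
    for (int i = 0; i < n; i++) {
        t += p[i].flops / (p[i].gflops_meas * 1e9 + eps); // seconds
    }
    return t * 1000.0; // milliseconds, matching the "// in ms" comments in the diff
}

int main() {
    // e.g. a layer mixing F32, Q4_K and the newly supported IQ4_NL weights
    dtype_profile layer[] = {
        {1.2e9, 250.0},  // f32_f32
        {6.4e9, 120.0},  // q4k_f32
        {0.8e9,  90.0},  // iq4nl_f32
    };
    printf("estimated layer latency: %.3f ms\n", layer_latency_ms(layer, 3));
    return 0;
}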
@@ -921,24 +921,32 @@ static bool assign_layers_to_device(

    if (dev.gpu_support.metal) {
        t_calc_gpu = (
            master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
            master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
-           master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
-           master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
+           master.model_flops.layer_q2k_f32 / (dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) +
+           master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
            master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
            master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
-           master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+           master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
+           master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) +
+           master.model_flops.layer_iq1s_f32 / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) +
+           master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms

        t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
        // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
    } else {
        t_calc_gpu = (
            master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
            master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
-           master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
-           master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
+           master.model_flops.layer_q2k_f32 / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) +
+           master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
            master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
            master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
-           master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+           master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
+           master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) +
+           master.model_flops.layer_iq1s_f32 / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) +
+           master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms

        t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
        // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
    }

@@ -1113,13 +1125,16 @@ static bool assign_layers_to_device(

    if (m == 0) {
        kappa = (
            dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
            dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
-           dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
-           dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
+           dev.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
+           dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
            dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
            dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
-           dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+           dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
+           dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
+           dev.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) +
+           dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms

        // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms

@@ -1766,33 +1781,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    return mparams;
}

-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Invalid cache type: " + s);
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16, // Added BF16 data type support
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
}

struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
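Note on the hunk above: instead of one if-branch per accepted string, the KV-cache dtype is now resolved by comparing the argument against each type's canonical name, so adding a new type (BF16 here) is a one-line change to the table and the error message lives in one place. A self-contained sketch of the same lookup pattern follows; the mock enum and mock_type_name() below only stand in for ggml_type and ggml_type_name(), they are not ggml's implementation.

// Self-contained sketch of the table-driven lookup introduced above: keep one
// list of supported types and resolve a user-supplied string by comparing it
// against each type's canonical lowercase name.
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

enum mock_type { T_F32, T_F16, T_BF16, T_Q8_0, T_IQ4_NL };

static const char * mock_type_name(mock_type t) {
    switch (t) {
        case T_F32:    return "f32";
        case T_F16:    return "f16";
        case T_BF16:   return "bf16";
        case T_Q8_0:   return "q8_0";
        case T_IQ4_NL: return "iq4_nl";
    }
    return "?";
}

static const std::vector<mock_type> supported = { T_F32, T_F16, T_BF16, T_Q8_0, T_IQ4_NL };

static mock_type type_from_str(const std::string & s) {
    for (const auto & t : supported) {
        if (mock_type_name(t) == s) {
            return t;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

int main() {
    printf("iq4_nl resolves to enum value %d\n", (int) type_from_str("iq4_nl"));
    return 0;
}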
File diff suppressed because one or more lines are too long
@@ -17,23 +17,30 @@ struct cpu_props {
    uint32_t cores;
    float flops_f32_f32; // in GFLOPS
    float flops_f16_f32; // in GFLOPS
+   float flops_q2k_f32; // in GFLOPS
    float flops_q4k_f32; // in GFLOPS
-   float flops_q50_f32; // in GFLOPS
    float flops_q5k_f32; // in GFLOPS
    float flops_q6k_f32; // in GFLOPS
+   float flops_q50_f32; // in GFLOPS
    float flops_q80_f32; // in GFLOPS
+   float flops_iq1s_f32; // in GFLOPS
+   float flops_iq4nl_f32; // in GFLOPS

-   cpu_props() :
-       name(""),
-       description(""),
-       cores(0),
-       flops_f32_f32(0.0f),
-       flops_f16_f32(0.0f),
-       flops_q4k_f32(0.0f),
-       flops_q50_f32(0.0f),
-       flops_q5k_f32(0.0f),
-       flops_q6k_f32(0.0f),
-       flops_q80_f32(0.0f) {}
+   cpu_props()
+       : name (""),
+         description (""),
+         cores (0),
+         flops_f32_f32 (0.0f),
+         flops_f16_f32 (0.0f),
+         flops_q2k_f32 (0.0f),
+         flops_q4k_f32 (0.0f),
+         flops_q5k_f32 (0.0f),
+         flops_q6k_f32 (0.0f),
+         flops_q50_f32 (0.0f),
+         flops_q80_f32 (0.0f),
+         flops_iq1s_f32 (0.0f),
+         flops_iq4nl_f32 (0.0f)
+         {}
};

struct memory_info {
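Note on the hunk above: the new flops_q2k_f32, flops_iq1s_f32, and flops_iq4nl_f32 fields follow the existing one-field-per-dtype layout of cpu_props. The hypothetical accessor below, written against a trimmed stand-in struct (not the project's header), shows how such a per-dtype throughput table is typically consumed.

// Hypothetical accessor over a cpu_props-like table: pick the measured GFLOPS
// for a given weight dtype. The struct and enum are illustrative stand-ins.
#include <cstdio>

struct cpu_flops_table {
    float f32_f32   = 0.0f;
    float f16_f32   = 0.0f;
    float q2k_f32   = 0.0f;  // new in this commit
    float q4k_f32   = 0.0f;
    float iq1s_f32  = 0.0f;  // new in this commit
    float iq4nl_f32 = 0.0f;  // new in this commit
};

enum weight_dtype { W_F32, W_F16, W_Q2_K, W_Q4_K, W_IQ1_S, W_IQ4_NL };

static float flops_for(const cpu_flops_table & t, weight_dtype d) {
    switch (d) {
        case W_F32:    return t.f32_f32;
        case W_F16:    return t.f16_f32;
        case W_Q2_K:   return t.q2k_f32;
        case W_Q4_K:   return t.q4k_f32;
        case W_IQ1_S:  return t.iq1s_f32;
        case W_IQ4_NL: return t.iq4nl_f32;
    }
    return 0.0f;
}

int main() {
    cpu_flops_table t;
    t.iq4nl_f32 = 85.0f; // e.g. a profiled value in GFLOPS
    printf("IQ4_NL throughput: %.1f GFLOPS\n", flops_for(t, W_IQ4_NL));
    return 0;
}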
@@ -82,127 +89,169 @@ struct gpu_props {
    float metal_read_vram_bw; // in GB/s
    float metal_flops_f32_f32; // in GFLOPS
    float metal_flops_f16_f32; // in GFLOPS
+   float metal_flops_q2k_f32; // in GFLOPS
    float metal_flops_q4k_f32; // in GFLOPS
-   float metal_flops_q50_f32; // in GFLOPS
    float metal_flops_q5k_f32; // in GFLOPS
    float metal_flops_q6k_f32; // in GFLOPS
+   float metal_flops_q50_f32; // in GFLOPS
    float metal_flops_q80_f32; // in GFLOPS
+   float metal_flops_iq1s_f32; // in GFLOPS
+   float metal_flops_iq4nl_f32; // in GFLOPS
    float metal_mem_cpy_delay; // in ms
    float cuda_read_vram_bw; // in GB/s
    float cuda_flops_f32_f32; // in GFLOPS
    float cuda_flops_f16_f32; // in GFLOPS
+   float cuda_flops_q2k_f32; // in GFLOPS
    float cuda_flops_q4k_f32; // in GFLOPS
-   float cuda_flops_q50_f32; // in GFLOPS
    float cuda_flops_q5k_f32; // in GFLOPS
    float cuda_flops_q6k_f32; // in GFLOPS
+   float cuda_flops_q50_f32; // in GFLOPS
    float cuda_flops_q80_f32; // in GFLOPS
+   float cuda_flops_iq1s_f32; // in GFLOPS
+   float cuda_flops_iq4nl_f32; // in GFLOPS
    float cuda_mem_cpy_delay; // in ms

    gpu_props() :
-       name(""),
-       description(""),
+       name (""),
+       description (""),
        memory_free (0.0f),
        memory_total (0.0f),
        metal_read_vram_bw (0.0f),
-       metal_flops_f32_f32(0.0f),
-       metal_flops_f16_f32(0.0f),
-       metal_flops_q4k_f32(0.0f),
-       metal_flops_q50_f32(0.0f),
-       metal_flops_q5k_f32(0.0f),
-       metal_flops_q6k_f32(0.0f),
-       metal_flops_q80_f32(0.0f),
-       metal_mem_cpy_delay(0.0f),
-       cuda_read_vram_bw (0.0f),
-       cuda_flops_f32_f32 (0.0f),
-       cuda_flops_f16_f32 (0.0f),
-       cuda_flops_q4k_f32 (0.0f),
-       cuda_flops_q50_f32 (0.0f),
-       cuda_flops_q5k_f32 (0.0f),
-       cuda_flops_q6k_f32 (0.0f),
-       cuda_flops_q80_f32 (0.0f),
-       cuda_mem_cpy_delay (0.0f) {}
+       metal_flops_f32_f32 (0.0f),
+       metal_flops_f16_f32 (0.0f),
+       metal_flops_q2k_f32 (0.0f),
+       metal_flops_q4k_f32 (0.0f),
+       metal_flops_q5k_f32 (0.0f),
+       metal_flops_q6k_f32 (0.0f),
+       metal_flops_q50_f32 (0.0f),
+       metal_flops_q80_f32 (0.0f),
+       metal_flops_iq1s_f32 (0.0f),
+       metal_flops_iq4nl_f32 (0.0f),
+       metal_mem_cpy_delay (0.0f),
+       cuda_read_vram_bw (0.0f),
+       cuda_flops_f32_f32 (0.0f),
+       cuda_flops_f16_f32 (0.0f),
+       cuda_flops_q2k_f32 (0.0f),
+       cuda_flops_q4k_f32 (0.0f),
+       cuda_flops_q5k_f32 (0.0f),
+       cuda_flops_q6k_f32 (0.0f),
+       cuda_flops_q50_f32 (0.0f),
+       cuda_flops_q80_f32 (0.0f),
+       cuda_flops_iq1s_f32 (0.0f),
+       cuda_flops_iq4nl_f32 (0.0f),
+       cuda_mem_cpy_delay (0.0f) {}
};

struct model_flops {
    float inp_embd_ms;
    int64_t output_f32_f32;
    int64_t output_f16_f32;
+   int64_t output_q2k_f32;
    int64_t output_q4k_f32;
-   int64_t output_q50_f32;
    int64_t output_q5k_f32;
    int64_t output_q6k_f32;
+   int64_t output_q50_f32;
    int64_t output_q80_f32;
+   int64_t output_iq1s_f32;
+   int64_t output_iq4nl_f32;
    int64_t layer_f32_f32;
    int64_t layer_f16_f32;
+   int64_t layer_q2k_f32;
    int64_t layer_q4k_f32;
-   int64_t layer_q50_f32;
    int64_t layer_q5k_f32;
    int64_t layer_q6k_f32;
+   int64_t layer_q50_f32;
    int64_t layer_q80_f32;
+   int64_t layer_iq1s_f32;
+   int64_t layer_iq4nl_f32;

    model_flops() :
        inp_embd_ms(0.0f),
        output_f32_f32(0),
        output_f16_f32(0),
+       output_q2k_f32(0),
        output_q4k_f32(0),
-       output_q50_f32(0),
        output_q5k_f32(0),
        output_q6k_f32(0),
+       output_q50_f32(0),
        output_q80_f32(0),
+       output_iq1s_f32(0),
+       output_iq4nl_f32(0),
        layer_f32_f32 (0),
        layer_f16_f32 (0),
+       layer_q2k_f32 (0),
        layer_q4k_f32 (0),
-       layer_q50_f32 (0),
        layer_q5k_f32 (0),
        layer_q6k_f32 (0),
-       layer_q80_f32 (0) {}
+       layer_q50_f32 (0),
+       layer_q80_f32 (0),
+       layer_iq1s_f32 (0),
+       layer_iq4nl_f32 (0) {}
};

struct model_params {
    int64_t input_f32;
    int64_t input_f16;
+   int64_t input_q2k;
    int64_t input_q4k;
-   int64_t input_q50;
    int64_t input_q5k;
    int64_t input_q6k;
+   int64_t input_q50;
    int64_t input_q80;
+   int64_t input_iq1s;
+   int64_t input_iq4nl;
    int64_t output_f32;
    int64_t output_f16;
+   int64_t output_q2k;
    int64_t output_q4k;
-   int64_t output_q50;
    int64_t output_q5k;
    int64_t output_q6k;
+   int64_t output_q50;
    int64_t output_q80;
+   int64_t output_iq1s;
+   int64_t output_iq4nl;
    int64_t layer_f32;
    int64_t layer_f16;
+   int64_t layer_q2k;
    int64_t layer_q4k;
-   int64_t layer_q50;
    int64_t layer_q5k;
    int64_t layer_q6k;
+   int64_t layer_q50;
    int64_t layer_q80;
+   int64_t layer_iq1s;
+   int64_t layer_iq4nl;

    model_params() :
        input_f32 (0),
        input_f16 (0),
+       input_q2k (0),
        input_q4k (0),
-       input_q50 (0),
        input_q5k (0),
        input_q6k (0),
+       input_q50 (0),
        input_q80 (0),
+       input_iq1s(0),
+       input_iq4nl(0),
        output_f32(0),
        output_f16(0),
+       output_q2k(0),
        output_q4k(0),
-       output_q50(0),
        output_q5k(0),
        output_q6k(0),
+       output_q50(0),
        output_q80(0),
+       output_iq1s(0),
+       output_iq4nl(0),
        layer_f32 (0),
        layer_f16 (0),
+       layer_q2k (0),
        layer_q4k (0),
-       layer_q50 (0),
        layer_q5k (0),
        layer_q6k (0),
-       layer_q80 (0) {}
+       layer_q50 (0),
+       layer_q80 (0),
+       layer_iq1s (0),
+       layer_iq4nl (0) {}
};

struct model_bytes {
src/llama.cpp (190 changed lines)
@@ -3559,16 +3559,22 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
        case GGML_TYPE_F32:
        case GGML_TYPE_F16:
            return true;
+       case GGML_TYPE_Q2_K:
+           return n_params->layer_q2k > 0 || n_params->output_q2k > 0;
        case GGML_TYPE_Q4_K:
            return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
-       case GGML_TYPE_Q5_0:
-           return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
        case GGML_TYPE_Q5_K:
            return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
        case GGML_TYPE_Q6_K:
            return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
+       case GGML_TYPE_Q5_0:
+           return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
        case GGML_TYPE_Q8_0:
            return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
+       case GGML_TYPE_IQ1_S:
+           return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0;
+       case GGML_TYPE_IQ4_NL:
+           return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0;
        default:
            throw std::runtime_error("Unrecognized data type\n");
    }

@@ -3649,18 +3655,18 @@ void llama_profile_device(
        dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
    }

+   if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) {
+       dev_info->cpu_props.flops_q2k_f32 = device_cpu_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads);
+       dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
+       dev_info->gpu_props.cuda_flops_q2k_f32 = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
+   }
+
    if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) {
        dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
        dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
        dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
    }

-   if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
-       dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
-       dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-       dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-   }
-
    if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) {
        dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads);
        dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32);

@@ -3673,11 +3679,30 @@ void llama_profile_device(
        dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
    }

+   if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
+       dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
+       dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+       dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+   }
+
    if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) {
        dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
        dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
        dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
    }

+   if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) {
+       dev_info->cpu_props.flops_iq1s_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads);
+       dev_info->gpu_props.metal_flops_iq1s_f32 = device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+       dev_info->gpu_props.cuda_flops_iq1s_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+   }
+
+   if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) {
+       dev_info->cpu_props.flops_iq4nl_f32 = device_cpu_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads);
+       dev_info->gpu_props.metal_flops_iq4nl_f32 = device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+       dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+   }
}

ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {

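Note on the two llama_profile_device hunks above: each dtype is profiled only if the model actually contains tensors of that type (the is_dtype_exist gate), and each gated block records CPU, Metal, and CUDA throughput. The sketch below expresses the same gate-then-measure step as a loop over a table, purely for illustration; prima.cpp keeps the explicit per-type blocks shown in the diff, and the types and callbacks here are hypothetical stand-ins.

// Illustrative-only restatement of the gate-then-profile pattern above.
#include <vector>

struct dtype_slot {
    int    dtype;        // stands in for enum ggml_type
    float *cpu_flops;    // where the CPU measurement is stored
    float *metal_flops;  // where the Metal measurement is stored
    float *cuda_flops;   // where the CUDA measurement is stored
};

template <typename ExistFn, typename CpuFn, typename MetalFn, typename CudaFn>
void profile_present_dtypes(const std::vector<dtype_slot> & slots,
                            ExistFn exists, CpuFn cpu, MetalFn metal, CudaFn cuda) {
    for (const auto & s : slots) {
        if (!exists(s.dtype)) {
            continue; // same role as the is_dtype_exist() check in the diff
        }
        *s.cpu_flops   = cpu(s.dtype);
        *s.metal_flops = metal(s.dtype);
        *s.cuda_flops  = cuda(s.dtype);
    }
}

int main() {
    float cpu_q2k = 0, metal_q2k = 0, cuda_q2k = 0;
    std::vector<dtype_slot> slots = { { 2 /* pretend Q2_K */, &cpu_q2k, &metal_q2k, &cuda_q2k } };
    profile_present_dtypes(slots,
        [](int) { return true;   },  // pretend the dtype is present in the model
        [](int) { return 40.0f;  },  // fake CPU GFLOPS measurement
        [](int) { return 300.0f; },  // fake Metal GFLOPS measurement
        [](int) { return 900.0f; }); // fake CUDA GFLOPS measurement
    return 0;
}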
@@ -21029,49 +21054,67 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                case GGML_TYPE_F16:
                    n_flops->output_f16_f32 += n;
                    break;
+               case GGML_TYPE_Q2_K:
+                   n_flops->output_q2k_f32 += n;
+                   break;
                case GGML_TYPE_Q4_K:
                    n_flops->output_q4k_f32 += n;
                    break;
-               case GGML_TYPE_Q5_0:
-                   n_flops->output_q50_f32 += n;
-                   break;
                case GGML_TYPE_Q5_K:
                    n_flops->output_q5k_f32 += n;
                    break;
                case GGML_TYPE_Q6_K:
                    n_flops->output_q6k_f32 += n;
                    break;
+               case GGML_TYPE_Q5_0:
+                   n_flops->output_q50_f32 += n;
+                   break;
                case GGML_TYPE_Q8_0:
                    n_flops->output_q80_f32 += n;
                    break;
+               case GGML_TYPE_IQ1_S:
+                   n_flops->output_iq1s_f32 += n;
+                   break;
+               case GGML_TYPE_IQ4_NL:
+                   n_flops->output_iq4nl_f32 += n;
+                   break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
            }
            break;

        case PROFILER_LAYER_BACKEND:
            switch (dtype) {
                case GGML_TYPE_F32:
                    n_flops->layer_f32_f32 += n;
                    break;
                case GGML_TYPE_F16:
                    n_flops->layer_f16_f32 += n;
                    break;
+               case GGML_TYPE_Q2_K:
+                   n_flops->layer_q2k_f32 += n;
+                   break;
                case GGML_TYPE_Q4_K:
                    n_flops->layer_q4k_f32 += n;
                    break;
-               case GGML_TYPE_Q5_0:
-                   n_flops->layer_q50_f32 += n;
-                   break;
                case GGML_TYPE_Q5_K:
                    n_flops->layer_q5k_f32 += n;
                    break;
                case GGML_TYPE_Q6_K:
                    n_flops->layer_q6k_f32 += n;
                    break;
+               case GGML_TYPE_Q5_0:
+                   n_flops->layer_q50_f32 += n;
+                   break;
                case GGML_TYPE_Q8_0:
                    n_flops->layer_q80_f32 += n;
                    break;
+               case GGML_TYPE_IQ1_S:
+                   n_flops->layer_iq1s_f32 += n;
+                   break;
+               case GGML_TYPE_IQ4_NL:
+                   n_flops->layer_iq4nl_f32 += n;
+                   break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
            }

@@ -21093,21 +21136,30 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                case GGML_TYPE_F16:
                    n_params->input_f16 += n_i64t;
                    break;
+               case GGML_TYPE_Q2_K:
+                   n_params->input_q2k += n_i64t;
+                   break;
                case GGML_TYPE_Q4_K:
                    n_params->input_q4k += n_i64t;
                    break;
-               case GGML_TYPE_Q5_0:
-                   n_params->input_q50 += n_i64t;
-                   break;
                case GGML_TYPE_Q5_K:
                    n_params->input_q5k += n_i64t;
                    break;
                case GGML_TYPE_Q6_K:
                    n_params->input_q6k += n_i64t;
                    break;
+               case GGML_TYPE_Q5_0:
+                   n_params->input_q50 += n_i64t;
+                   break;
                case GGML_TYPE_Q8_0:
                    n_params->input_q80 += n_i64t;
                    break;
+               case GGML_TYPE_IQ1_S:
+                   n_params->input_iq1s += n_i64t;
+                   break;
+               case GGML_TYPE_IQ4_NL:
+                   n_params->input_iq4nl += n_i64t;
+                   break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
            }

@@ -21116,25 +21168,34 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
        case PROFILER_LAYER_OUTPUT:
            switch (dtype) {
                case GGML_TYPE_F32:
                    n_params->output_f32 += n_i64t;
                    break;
                case GGML_TYPE_F16:
                    n_params->output_f16 += n_i64t;
+                   break;
+               case GGML_TYPE_Q2_K:
+                   n_params->output_q2k += n_i64t;
                    break;
                case GGML_TYPE_Q4_K:
                    n_params->output_q4k += n_i64t;
-                   break;
-               case GGML_TYPE_Q5_0:
-                   n_params->output_q50 += n_i64t;
                    break;
                case GGML_TYPE_Q5_K:
                    n_params->output_q5k += n_i64t;
                    break;
                case GGML_TYPE_Q6_K:
                    n_params->output_q6k += n_i64t;
+                   break;
+               case GGML_TYPE_Q5_0:
+                   n_params->output_q50 += n_i64t;
                    break;
                case GGML_TYPE_Q8_0:
                    n_params->output_q80 += n_i64t;
+                   break;
+               case GGML_TYPE_IQ1_S:
+                   n_params->output_iq1s += n_i64t;
+                   break;
+               case GGML_TYPE_IQ4_NL:
+                   n_params->output_iq4nl += n_i64t;
                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");

@@ -21144,25 +21205,34 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
        case PROFILER_LAYER_BACKEND:
            switch (dtype) {
                case GGML_TYPE_F32:
                    n_params->layer_f32 += n_i64t;
                    break;
                case GGML_TYPE_F16:
                    n_params->layer_f16 += n_i64t;
+                   break;
+               case GGML_TYPE_Q2_K:
+                   n_params->layer_q2k += n_i64t;
                    break;
                case GGML_TYPE_Q4_K:
                    n_params->layer_q4k += n_i64t;
-                   break;
-               case GGML_TYPE_Q5_0:
-                   n_params->layer_q50 += n_i64t;
                    break;
                case GGML_TYPE_Q5_K:
                    n_params->layer_q5k += n_i64t;
                    break;
                case GGML_TYPE_Q6_K:
                    n_params->layer_q6k += n_i64t;
+                   break;
+               case GGML_TYPE_Q5_0:
+                   n_params->layer_q50 += n_i64t;
                    break;
                case GGML_TYPE_Q8_0:
                    n_params->layer_q80 += n_i64t;
+                   break;
+               case GGML_TYPE_IQ1_S:
+                   n_params->layer_iq1s += n_i64t;
+                   break;
+               case GGML_TYPE_IQ4_NL:
+                   n_params->layer_iq4nl += n_i64t;
                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");

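Note on the count_n_flops and count_n_params hunks above: every new quantization type has to be threaded through the counter structs, these switch statements, and the profiler, which is why the same Q2_K/IQ1_S/IQ4_NL cases appear in several places. One conventional way to keep such parallel lists in sync is an X-macro that declares the dtype list once and expands it wherever needed; the sketch below only illustrates that option and is not something this commit introduces.

// Illustrative X-macro: declare the per-layer quant counters from one list.
#include <cstdint>
#include <cstdio>

#define LAYER_QUANT_TYPES(X) \
    X(q2k)   \
    X(q4k)   \
    X(q5k)   \
    X(q6k)   \
    X(q50)   \
    X(q80)   \
    X(iq1s)  \
    X(iq4nl)

struct layer_params {
#define DECLARE_FIELD(name) int64_t layer_##name = 0;
    LAYER_QUANT_TYPES(DECLARE_FIELD)
#undef DECLARE_FIELD
};

int main() {
    layer_params p;
    p.layer_iq4nl = 123;
#define PRINT_FIELD(name) printf("layer_" #name " = %lld\n", (long long) p.layer_##name);
    LAYER_QUANT_TYPES(PRINT_FIELD)
#undef PRINT_FIELD
    return 0;
}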
@@ -21452,23 +21522,29 @@ void llama_model_n_flops(
    }

    // use average values instead of total values
    n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
    n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
-   n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
-   n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
+   n_flops->layer_q2k_f32 = static_cast<int64_t>((double)n_flops->layer_q2k_f32 / (double)n_layer);
+   n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
    n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
    n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
-   n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
+   n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
+   n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
+   n_flops->layer_iq1s_f32 = static_cast<int64_t>((double)n_flops->layer_iq1s_f32 / (double)n_layer);
+   n_flops->layer_iq4nl_f32 = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32 / (double)n_layer);

    n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
    n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
+   n_params->layer_q2k = static_cast<int64_t>((double)n_params->layer_q2k / (double)n_layer);
    n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
    n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer);
    n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
    n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
    n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
+   n_params->layer_iq1s = static_cast<int64_t>((double)n_params->layer_iq1s / (double)n_layer);
+   n_params->layer_iq4nl = static_cast<int64_t>((double)n_params->layer_iq4nl / (double)n_layer);

    n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);

    // reset ml, model, and clear contexts
    ml->n_created = 0;
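Note on the final hunk: the totals accumulated over all repeating layers are normalized to per-layer averages by dividing by n_layer, and the new Q2_K, IQ1_S, and IQ4_NL counters now go through the same normalization. The tiny helper below (not code from the commit) captures that step.

// Illustrative helper for the per-layer averaging above.
#include <cstdint>
#include <cstdio>

static int64_t per_layer_avg(int64_t total, int n_layer) {
    return static_cast<int64_t>((double) total / (double) n_layer);
}

int main() {
    int64_t layer_iq4nl_total = 9000000000LL; // e.g. FLOPs summed over 30 layers
    printf("per-layer average: %lld\n", (long long) per_layer_avg(layer_iq4nl_total, 30));
    return 0;
}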