mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-06 06:59:02 +00:00
Added support for iq1s and iq1m quantization types
This commit is contained in:
commit
fc1e2d3fc6
8 changed files with 973 additions and 523 deletions
|
@ -903,11 +903,17 @@ static bool assign_layers_to_device(
|
|||
float t_calc_cpu = (
|
||||
master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
|
||||
master.model_flops.layer_iq2xxs_f32/ (dev.cpu_props.flops_iq2xxs_f32* 1e9 + EPS) +
|
||||
master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms
|
||||
|
||||
float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms
|
||||
// t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
|
||||
|
||||
|
@ -923,22 +929,34 @@ static bool assign_layers_to_device(
|
|||
t_calc_gpu = (
|
||||
master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q2k_f32 / (dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
|
||||
master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_iq1s_f32 / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_iq1m_f32 / (dev.gpu_props.metal_flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms
|
||||
|
||||
t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
|
||||
// t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
|
||||
} else {
|
||||
t_calc_gpu = (
|
||||
master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q2k_f32 / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
|
||||
master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_iq1s_f32 / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS) +
|
||||
master.model_flops.layer_iq1m_f32 / (dev.gpu_props.cuda_flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms
|
||||
|
||||
t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
|
||||
// t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
|
||||
}
|
||||
|
@ -1115,12 +1133,16 @@ static bool assign_layers_to_device(
|
|||
kappa = (
|
||||
dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
|
||||
|
||||
dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) +
|
||||
dev.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms
|
||||
// kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
|
||||
|
||||
kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms
|
||||
|
@ -1766,33 +1788,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
|||
return mparams;
|
||||
}
|
||||
|
||||
static ggml_type kv_cache_type_from_str(const std::string & s) {
|
||||
if (s == "f32") {
|
||||
return GGML_TYPE_F32;
|
||||
}
|
||||
if (s == "f16") {
|
||||
return GGML_TYPE_F16;
|
||||
}
|
||||
if (s == "q8_0") {
|
||||
return GGML_TYPE_Q8_0;
|
||||
}
|
||||
if (s == "q4_0") {
|
||||
return GGML_TYPE_Q4_0;
|
||||
}
|
||||
if (s == "q4_1") {
|
||||
return GGML_TYPE_Q4_1;
|
||||
}
|
||||
if (s == "iq4_nl") {
|
||||
return GGML_TYPE_IQ4_NL;
|
||||
}
|
||||
if (s == "q5_0") {
|
||||
return GGML_TYPE_Q5_0;
|
||||
}
|
||||
if (s == "q5_1") {
|
||||
return GGML_TYPE_Q5_1;
|
||||
}
|
||||
// Table of ggml types that kv_cache_type_from_str() can return. A cache-type
// string is matched against ggml_type_name() of each entry, in the order listed.
const std::vector<ggml_type> kv_cache_types = {
    // floating-point types
    GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
    // quantized types
    GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
    GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
};
|
||||
|
||||
throw std::runtime_error("Invalid cache type: " + s);
|
||||
// Translate a cache-type name into the corresponding ggml_type.
//
// The name `s` is compared with ggml_type_name() of every entry in
// kv_cache_types; the first entry whose name equals `s` is returned.
// Throws std::runtime_error when no entry matches.
static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (auto it = kv_cache_types.begin(); it != kv_cache_types.end(); ++it) {
        const auto & candidate = *it;
        if (s == ggml_type_name(candidate)) {
            return candidate;
        }
    }

    throw std::runtime_error("Unsupported cache type: " + s);
}
|
||||
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
||||
|
|
|
@ -188,6 +188,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
|
|||
};
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
|
||||
if(n_embd < ggml_blck_size(src0t)){
|
||||
n_embd = 2 * ggml_blck_size(src0t);
|
||||
}
|
||||
struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd);
|
||||
struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd);
|
||||
|
||||
|
@ -208,10 +211,12 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
|
|||
ctx_cgraph = ggml_init(params0);
|
||||
|
||||
gf = ggml_new_graph(ctx_cgraph);
|
||||
|
||||
cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
|
||||
for (int i = 0; i < n_repeat - 1; i++) {
|
||||
cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
|
||||
}
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
|
||||
|
@ -364,14 +369,18 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
|
|||
ggml_fp32_to_fp16_row(temp_f32.data(), static_cast<ggml_fp16_t *>(matrix_B), embd_size);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_K:
|
||||
case GGML_TYPE_Q6_K:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
QK_K = ggml_blck_size(src0t);
|
||||
matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t));
|
||||
case GGML_TYPE_IQ1_S:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
case GGML_TYPE_IQ1_M:
|
||||
matrix_B = malloc((embd_size / ggml_blck_size(src0t) * ggml_type_size(src0t))); // The quantization block sizes are inconsistent for different quantization methods
|
||||
break;
|
||||
default:
|
||||
LOG_INF("Unsupported type: %d\n", src0t);
|
||||
|
@ -1349,31 +1358,45 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
|
|||
|
||||
gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.cuda_flops_q2k_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.cuda_flops_iq2xxs_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.cuda_flops_iq1s_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.cuda_flops_iq4nl_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32 / ((double)gpu.cuda_flops_iq1m_f32 + EPS) / 1e9;
|
||||
#elif GGML_USE_METAL
|
||||
struct gpu_props gpu = dev_info.gpu_props;
|
||||
|
||||
gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)gpu.metal_flops_q2k_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.metal_flops_iq2xxs_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)gpu.metal_flops_iq1s_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)gpu.metal_flops_iq4nl_f32 + EPS) / 1e9;
|
||||
gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32 / ((double)gpu.metal_flops_iq1m_f32 + EPS) / 1e9;
|
||||
#endif
|
||||
|
||||
cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9;
|
||||
|
||||
cpu_latency_per_layer += (double)n_flops.layer_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9;
|
||||
cpu_latency_per_layer += (double)n_flops.layer_iq1m_f32 / ((double)cpu.flops_iq1m_f32 + EPS) / 1e9;
|
||||
double total_latency = 0.0f;
|
||||
|
||||
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
|
||||
|
@ -1387,11 +1410,16 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
|
|||
|
||||
total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_q2k_f32 / ((double)cpu.flops_q2k_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_iq1s_f32 / ((double)cpu.flops_iq1s_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_iq4nl_f32 / ((double)cpu.flops_iq4nl_f32 + EPS) / 1e9;
|
||||
total_latency += (double)n_flops.output_iq1m_f32 / ((double)cpu.flops_iq1m_f32 + EPS) / 1e9;
|
||||
|
||||
total_latency *= 1000; // convert to ms
|
||||
|
||||
|
@ -1696,36 +1724,66 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (Q4K x F32, GFLOPS)");
|
||||
LOG_INF("| CPU flops (Q2K x F32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q2k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (Q4K x F32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (Q50 x F32, GFLOPS)");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (Q5K x F32, GFLOPS)");
|
||||
LOG_INF("| CPU flops (Q5K x F32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (Q6K x F32, GFLOPS)");
|
||||
LOG_INF("| CPU flops (Q6K x F32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (Q80 x F32, GFLOPS)");
|
||||
LOG_INF("| CPU flops (IQ2XXS x F32, GFLOPS)");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq2xxs_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (Q50 x F32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (Q80 x F32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (IQ1S x F32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1s_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (IQ4NL x F32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq4nl_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU flops (IQ1M x F32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1m_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Physical Mem Total (GiB) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
|
||||
|
@ -1864,61 +1922,91 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal KVCache Copy Time(ms/l)");
|
||||
LOG_INF("| Metal KVCache Copy Time(ms/l) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_mem_cpy_delay);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (F32xF32, GFLOPS)");
|
||||
LOG_INF("| Metal flops (F32xF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (F16xF32, GFLOPS)");
|
||||
LOG_INF("| Metal flops (F16xF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (Q4KxF32, GFLOPS)");
|
||||
LOG_INF("| Metal flops (Q2KxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q2k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (Q4KxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (Q50xF32, GFLOPS)");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (Q5KxF32, GFLOPS)");
|
||||
LOG_INF("| Metal flops (Q5KxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q5k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (Q6KxF32, GFLOPS)");
|
||||
LOG_INF("| Metal flops (Q6KxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (Q80xF32, GFLOPS)");
|
||||
LOG_INF("| Metal flops (IQ2XXSxF32, GFLOPS)");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq2xxs_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (Q50xF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (Q80xF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (IQ1SxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1s_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (IQ4NLxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq4nl_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (IQ1MxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1m_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA VRAM Read BW (GB/s) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA KVCache Copy Time (ms/l)");
|
||||
LOG_INF("| CUDA KVCache Copy Time (ms/l) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_mem_cpy_delay);
|
||||
}
|
||||
|
@ -1936,15 +2024,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) ");
|
||||
LOG_INF("| CUDA flops (Q2KxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32);
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q2k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA flops (Q50xF32, GFLOPS) ");
|
||||
LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32);
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
|
@ -1960,12 +2048,42 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA flops (IQ2XXSxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq2xxs_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA flops (Q50xF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA flops (Q80xF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1s_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA flops (IQ4NLxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq4nl_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA flops (IQ1MxF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1m_f32);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output F32xF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32);
|
||||
LOG_INF("\n");
|
||||
|
@ -1974,12 +2092,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output Q4KxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32);
|
||||
LOG_INF("| Model flops (output Q2KxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q2k_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output Q50xF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32);
|
||||
LOG_INF("| Model flops (output Q4KxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output Q5KxF32) ");
|
||||
|
@ -1990,10 +2108,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output IQ2XXSxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq2xxs_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output Q50xF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output Q80xF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output IQ1SxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1s_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output IQ4NLxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq4nl_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (output IQ1MxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1m_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer F32xF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32);
|
||||
LOG_INF("\n");
|
||||
|
@ -2002,12 +2140,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer Q4KxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32);
|
||||
LOG_INF("| Model flops (layer Q2KxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q2k_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer Q50xF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32);
|
||||
LOG_INF("| Model flops (layer Q4KxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer Q5KxF32) ");
|
||||
|
@ -2018,10 +2156,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer IQ2XXSxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq2xxs_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer Q50xF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer Q80xF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer IQ1SxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1s_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer IQ4NLxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq4nl_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model flops (layer IQ1MxF32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1m_f32);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input F32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32);
|
||||
LOG_INF("\n");
|
||||
|
@ -2030,12 +2188,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input Q4K) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k);
|
||||
LOG_INF("| Model params (input Q2K) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q2k);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input Q50) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50);
|
||||
LOG_INF("| Model params (input Q4K) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input Q5K) ");
|
||||
|
@ -2046,10 +2204,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input IQ2XXS) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq2xxs);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input Q50) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input Q80) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input IQ1S) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1s);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input IQ4NL) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq4nl);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (input IQ1M) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1m);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer F32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32);
|
||||
LOG_INF("\n");
|
||||
|
@ -2058,12 +2236,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer Q4K) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k);
|
||||
LOG_INF("| Model params (layer Q2K) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q2k);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer Q50) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50);
|
||||
LOG_INF("| Model params (layer Q4K) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer Q5K) ");
|
||||
|
@ -2074,10 +2252,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer IQ2XXS) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq2xxs);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer Q50) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer Q80) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer IQ1S) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1s);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer IQ4NL) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq4nl);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (layer IQ1M) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1m);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output F32) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32);
|
||||
LOG_INF("\n");
|
||||
|
@ -2086,12 +2284,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output Q4K) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k);
|
||||
LOG_INF("| Model params (output Q2K) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output Q50) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50);
|
||||
LOG_INF("| Model params (output Q4K) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output Q5K) ");
|
||||
|
@ -2102,10 +2300,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output IQ2XXS) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq2xxs);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output Q50) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output Q80) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output IQ1S) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1s);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output IQ4NL) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq4nl);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model params (output IQ1M) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1m);
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Model bytes (input) ");
|
||||
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input);
|
||||
LOG_INF("\n");
|
||||
|
@ -2155,17 +2373,44 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
+ gpu_description_len
|
||||
+ sizeof(struct disk_props)
|
||||
+ sizeof(uint32_t) // cpu_props.cores
|
||||
+ sizeof(float) * 7 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q50_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
|
||||
+ sizeof(float) * 12 // - cpu_props.flops_f32_f32, cpu_props.flops_f16_f32,
|
||||
// - cpu_props.flops_q2k_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32
|
||||
// - cpu_props.flops_iq2xxs_f32
|
||||
// - cpu_props.flops_q50_f32, cpu_props.flops_q80_f32
|
||||
// - cpu_props.flops_iq1s_f32, cpu_props.flops_iq4nl_f32
|
||||
// - cpu_props.flops_iq1m_f32
|
||||
+ sizeof(struct memory_info)
|
||||
+ sizeof(struct gpu_support)
|
||||
+ sizeof(float) * 20; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
|
||||
// gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q50_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
|
||||
// gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q50_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32,
|
||||
// gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay
|
||||
+ sizeof(float) * 30; // GPU attributes
|
||||
// memory:
|
||||
// - memory_free, memory_total
|
||||
// - metal_read_vram_bw, cuda_read_vram_bw
|
||||
// Metal floating-point performance:
|
||||
// - metal_flops_f32_f32, metal_flops_f16_f32
|
||||
// - metal_flops_q2k_f32, metal_flops_q4k_f32, metal_flops_q5k_f32, metal_flops_q6k_f32
|
||||
// - metal_flops_iq2xxs_f32
|
||||
// - metal_flops_q50_f32, metal_flops_q80_f32
|
||||
// - metal_flops_iq1s_f32, metal_flops_iq4nl_f32
|
||||
// - metal_flops_iq1m_f32
|
||||
// CUDA floating-point performance:
|
||||
// - cuda_flops_f32_f32, cuda_flops_f16_f32
|
||||
// - cuda_flops_q2k_f32, cuda_flops_q4k_f32, cuda_flops_q5k_f32, cuda_flops_q6k_f32
|
||||
// - cuda_flops_iq2xxs_f32
|
||||
// - cuda_flops_q50_f32, cuda_flops_q80_f32
|
||||
// - cuda_flops_iq1s_f32, cuda_flops_iq4nl_f32
|
||||
// - cuda_flops_iq1m_f32
|
||||
// delay:
|
||||
// - metal_mem_cpy_delay, cuda_mem_cpy_delay
|
||||
|
||||
*buffer = (char *)malloc(total_size);
|
||||
char * ptr = *buffer;
|
||||
|
||||
if (*buffer == NULL) {
|
||||
LOG_ERR("%s: failed to allocate %zu bytes for device info serialization\n",
|
||||
__func__, total_size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// rank
|
||||
memcpy(ptr, &dev_info->rank, sizeof(uint32_t));
|
||||
ptr += sizeof(uint32_t);
|
||||
|
@ -2214,10 +2459,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_q2k_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float));
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float));
|
||||
|
@ -2226,9 +2471,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_iq2xxs_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_q80_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_iq1s_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_iq4nl_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->cpu_props.flops_iq1m_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
|
||||
ptr += sizeof(struct memory_info);
|
||||
|
||||
|
@ -2250,10 +2510,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q2k_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float));
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float));
|
||||
|
@ -2262,9 +2522,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_iq2xxs_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1s_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_iq4nl_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1m_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
|
@ -2277,10 +2552,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q2k_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float));
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float));
|
||||
|
@ -2289,9 +2564,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq2xxs_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1s_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq4nl_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1m_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float));
|
||||
|
||||
// no need to synchronize model flops and model params
|
||||
|
@ -2366,10 +2656,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
|||
memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
|
||||
memcpy(&dev_info->cpu_props.flops_q2k_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float));
|
||||
memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float));
|
||||
|
@ -2378,9 +2668,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
|||
memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->cpu_props.flops_iq2xxs_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->cpu_props.flops_q80_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->cpu_props.flops_iq1s_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->cpu_props.flops_iq4nl_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->cpu_props.flops_iq1m_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
|
||||
ptr += sizeof(struct memory_info);
|
||||
|
||||
|
@ -2402,10 +2707,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
|||
memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
|
||||
memcpy(&dev_info->gpu_props.metal_flops_q2k_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float));
|
||||
memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float));
|
||||
|
@ -2414,9 +2719,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
|||
memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_flops_iq2xxs_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_flops_iq1s_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_flops_iq4nl_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_flops_iq1m_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
|
@ -2429,10 +2749,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
|||
memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_q2k_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float));
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float));
|
||||
|
@ -2441,9 +2761,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
|||
memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_iq2xxs_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_iq1s_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_iq4nl_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_iq1m_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float));
|
||||
|
||||
// no need to synchronize model flops and model params
|
||||
|
|
|
@ -17,23 +17,34 @@ struct cpu_props {
|
|||
uint32_t cores;
|
||||
float flops_f32_f32; // in GFLOPS
|
||||
float flops_f16_f32; // in GFLOPS
|
||||
float flops_q2k_f32; // in GFLOPS
|
||||
float flops_q4k_f32; // in GFLOPS
|
||||
float flops_q50_f32; // in GFLOPS
|
||||
float flops_q5k_f32; // in GFLOPS
|
||||
float flops_q6k_f32; // in GFLOPS
|
||||
float flops_iq2xxs_f32; // in GFLOPS
|
||||
float flops_q50_f32; // in GFLOPS
|
||||
float flops_q80_f32; // in GFLOPS
|
||||
float flops_iq1s_f32; // in GFLOPS
|
||||
float flops_iq4nl_f32; // in GFLOPS
|
||||
float flops_iq1m_f32; // in GFLOPS
|
||||
|
||||
cpu_props() :
|
||||
name(""),
|
||||
description(""),
|
||||
cores(0),
|
||||
flops_f32_f32(0.0f),
|
||||
flops_f16_f32(0.0f),
|
||||
flops_q4k_f32(0.0f),
|
||||
flops_q50_f32(0.0f),
|
||||
flops_q5k_f32(0.0f),
|
||||
flops_q6k_f32(0.0f),
|
||||
flops_q80_f32(0.0f) {}
|
||||
cpu_props()
|
||||
: name (""),
|
||||
description (""),
|
||||
cores (0),
|
||||
flops_f32_f32 (0.0f),
|
||||
flops_f16_f32 (0.0f),
|
||||
flops_q2k_f32 (0.0f),
|
||||
flops_q4k_f32 (0.0f),
|
||||
flops_q5k_f32 (0.0f),
|
||||
flops_q6k_f32 (0.0f),
|
||||
flops_iq2xxs_f32(0.0f),
|
||||
flops_q50_f32 (0.0f),
|
||||
flops_q80_f32 (0.0f),
|
||||
flops_iq1s_f32 (0.0f),
|
||||
flops_iq4nl_f32 (0.0f),
|
||||
flops_iq1m_f32 (0.0f)
|
||||
{}
|
||||
};
|
||||
|
||||
struct memory_info {
|
||||
|
@ -82,44 +93,64 @@ struct gpu_props {
|
|||
float metal_read_vram_bw; // in GB/s
|
||||
float metal_flops_f32_f32; // in GFLOPS
|
||||
float metal_flops_f16_f32; // in GFLOPS
|
||||
float metal_flops_q2k_f32; // in GFLOPS
|
||||
float metal_flops_q4k_f32; // in GFLOPS
|
||||
float metal_flops_q50_f32; // in GFLOPS
|
||||
float metal_flops_q5k_f32; // in GFLOPS
|
||||
float metal_flops_q6k_f32; // in GFLOPS
|
||||
float metal_flops_iq2xxs_f32; // in GFLOPS
|
||||
float metal_flops_q50_f32; // in GFLOPS
|
||||
float metal_flops_q80_f32; // in GFLOPS
|
||||
float metal_flops_iq1s_f32; // in GFLOPS
|
||||
float metal_flops_iq4nl_f32; // in GFLOPS
|
||||
float metal_flops_iq1m_f32; // in GFLOPS
|
||||
float metal_mem_cpy_delay; // in ms
|
||||
float cuda_read_vram_bw; // in GB/s
|
||||
float cuda_flops_f32_f32; // in GFLOPS
|
||||
float cuda_flops_f16_f32; // in GFLOPS
|
||||
float cuda_flops_q2k_f32; // in GFLOPS
|
||||
float cuda_flops_q4k_f32; // in GFLOPS
|
||||
float cuda_flops_q50_f32; // in GFLOPS
|
||||
float cuda_flops_q5k_f32; // in GFLOPS
|
||||
float cuda_flops_q6k_f32; // in GFLOPS
|
||||
float cuda_flops_iq2xxs_f32; // in GFLOPS
|
||||
float cuda_flops_q50_f32; // in GFLOPS
|
||||
float cuda_flops_q80_f32; // in GFLOPS
|
||||
float cuda_flops_iq1s_f32; // in GFLOPS
|
||||
float cuda_flops_iq4nl_f32; // in GFLOPS
|
||||
float cuda_flops_iq1m_f32; // in GFLOPS
|
||||
float cuda_mem_cpy_delay; // in ms
|
||||
|
||||
gpu_props() :
|
||||
name(""),
|
||||
description(""),
|
||||
name (""),
|
||||
description (""),
|
||||
memory_free (0.0f),
|
||||
memory_total (0.0f),
|
||||
metal_read_vram_bw (0.0f),
|
||||
metal_flops_f32_f32(0.0f),
|
||||
metal_flops_f16_f32(0.0f),
|
||||
metal_flops_q4k_f32(0.0f),
|
||||
metal_flops_q50_f32(0.0f),
|
||||
metal_flops_q5k_f32(0.0f),
|
||||
metal_flops_q6k_f32(0.0f),
|
||||
metal_flops_q80_f32(0.0f),
|
||||
metal_mem_cpy_delay(0.0f),
|
||||
metal_flops_f32_f32 (0.0f),
|
||||
metal_flops_f16_f32 (0.0f),
|
||||
metal_flops_q2k_f32 (0.0f),
|
||||
metal_flops_q4k_f32 (0.0f),
|
||||
metal_flops_q5k_f32 (0.0f),
|
||||
metal_flops_q6k_f32 (0.0f),
|
||||
metal_flops_iq2xxs_f32 (0.0f),
|
||||
metal_flops_q50_f32 (0.0f),
|
||||
metal_flops_q80_f32 (0.0f),
|
||||
metal_flops_iq1s_f32 (0.0f),
|
||||
metal_flops_iq4nl_f32 (0.0f),
|
||||
metal_flops_iq1m_f32 (0.0f),
|
||||
metal_mem_cpy_delay (0.0f),
|
||||
cuda_read_vram_bw (0.0f),
|
||||
cuda_flops_f32_f32 (0.0f),
|
||||
cuda_flops_f16_f32 (0.0f),
|
||||
cuda_flops_q2k_f32 (0.0f),
|
||||
cuda_flops_q4k_f32 (0.0f),
|
||||
cuda_flops_q50_f32 (0.0f),
|
||||
cuda_flops_q5k_f32 (0.0f),
|
||||
cuda_flops_q6k_f32 (0.0f),
|
||||
cuda_flops_iq2xxs_f32 (0.0f),
|
||||
cuda_flops_q50_f32 (0.0f),
|
||||
cuda_flops_q80_f32 (0.0f),
|
||||
cuda_flops_iq1s_f32 (0.0f),
|
||||
cuda_flops_iq4nl_f32 (0.0f),
|
||||
cuda_flops_iq1m_f32 (0.0f),
|
||||
cuda_mem_cpy_delay (0.0f) {}
|
||||
};
|
||||
|
||||
|
@ -127,82 +158,134 @@ struct model_flops {
|
|||
float inp_embd_ms;
|
||||
int64_t output_f32_f32;
|
||||
int64_t output_f16_f32;
|
||||
int64_t output_q2k_f32;
|
||||
int64_t output_q4k_f32;
|
||||
int64_t output_q50_f32;
|
||||
int64_t output_q5k_f32;
|
||||
int64_t output_q6k_f32;
|
||||
int64_t output_iq2xxs_f32;
|
||||
int64_t output_q50_f32;
|
||||
int64_t output_q80_f32;
|
||||
int64_t output_iq1s_f32;
|
||||
int64_t output_iq4nl_f32;
|
||||
int64_t output_iq1m_f32;
|
||||
int64_t layer_f32_f32;
|
||||
int64_t layer_f16_f32;
|
||||
int64_t layer_q2k_f32;
|
||||
int64_t layer_q4k_f32;
|
||||
int64_t layer_q50_f32;
|
||||
int64_t layer_q5k_f32;
|
||||
int64_t layer_q6k_f32;
|
||||
int64_t layer_iq2xxs_f32;
|
||||
int64_t layer_q50_f32;
|
||||
int64_t layer_q80_f32;
|
||||
int64_t layer_iq1s_f32;
|
||||
int64_t layer_iq4nl_f32;
|
||||
int64_t layer_iq1m_f32;
|
||||
|
||||
model_flops() :
|
||||
inp_embd_ms(0.0f),
|
||||
output_f32_f32(0),
|
||||
output_f16_f32(0),
|
||||
output_q4k_f32(0),
|
||||
output_q50_f32(0),
|
||||
output_q5k_f32(0),
|
||||
output_q6k_f32(0),
|
||||
output_q80_f32(0),
|
||||
inp_embd_ms (0.0f),
|
||||
output_f32_f32 (0),
|
||||
output_f16_f32 (0),
|
||||
output_q2k_f32 (0),
|
||||
output_q4k_f32 (0),
|
||||
output_q5k_f32 (0),
|
||||
output_q6k_f32 (0),
|
||||
output_iq2xxs_f32 (0),
|
||||
output_q50_f32 (0),
|
||||
output_q80_f32 (0),
|
||||
output_iq1s_f32 (0),
|
||||
output_iq4nl_f32 (0),
|
||||
output_iq1m_f32 (0),
|
||||
layer_f32_f32 (0),
|
||||
layer_f16_f32 (0),
|
||||
layer_q2k_f32 (0),
|
||||
layer_q4k_f32 (0),
|
||||
layer_q50_f32 (0),
|
||||
layer_q5k_f32 (0),
|
||||
layer_q6k_f32 (0),
|
||||
layer_q80_f32 (0) {}
|
||||
layer_iq2xxs_f32 (0),
|
||||
layer_q50_f32 (0),
|
||||
layer_q80_f32 (0),
|
||||
layer_iq1s_f32 (0),
|
||||
layer_iq4nl_f32 (0),
|
||||
layer_iq1m_f32 (0)
|
||||
{}
|
||||
};
|
||||
|
||||
struct model_params {
|
||||
int64_t input_f32;
|
||||
int64_t input_f16;
|
||||
int64_t input_q2k;
|
||||
int64_t input_q4k;
|
||||
int64_t input_q50;
|
||||
int64_t input_q5k;
|
||||
int64_t input_q6k;
|
||||
int64_t input_iq2xxs;
|
||||
int64_t input_q50;
|
||||
int64_t input_q80;
|
||||
int64_t input_iq1s;
|
||||
int64_t input_iq4nl;
|
||||
int64_t input_iq1m;
|
||||
int64_t output_f32;
|
||||
int64_t output_f16;
|
||||
int64_t output_q2k;
|
||||
int64_t output_q4k;
|
||||
int64_t output_q50;
|
||||
int64_t output_q5k;
|
||||
int64_t output_q6k;
|
||||
int64_t output_iq2xxs;
|
||||
int64_t output_q50;
|
||||
int64_t output_q80;
|
||||
int64_t output_iq1s;
|
||||
int64_t output_iq4nl;
|
||||
int64_t output_iq1m;
|
||||
int64_t layer_f32;
|
||||
int64_t layer_f16;
|
||||
int64_t layer_q2k;
|
||||
int64_t layer_q4k;
|
||||
int64_t layer_q50;
|
||||
int64_t layer_q5k;
|
||||
int64_t layer_q6k;
|
||||
int64_t layer_iq2xxs;
|
||||
int64_t layer_q50;
|
||||
int64_t layer_q80;
|
||||
int64_t layer_iq1s;
|
||||
int64_t layer_iq4nl;
|
||||
int64_t layer_iq1m;
|
||||
|
||||
model_params() :
|
||||
input_f32 (0),
|
||||
input_f16 (0),
|
||||
input_q2k (0),
|
||||
input_q4k (0),
|
||||
input_q50 (0),
|
||||
input_q5k (0),
|
||||
input_q6k (0),
|
||||
input_iq2xxs (0),
|
||||
input_q50 (0),
|
||||
input_q80 (0),
|
||||
output_f32(0),
|
||||
output_f16(0),
|
||||
output_q4k(0),
|
||||
output_q50(0),
|
||||
output_q5k(0),
|
||||
output_q6k(0),
|
||||
output_q80(0),
|
||||
input_iq1s (0),
|
||||
input_iq4nl (0),
|
||||
input_iq1m (0),
|
||||
output_f32 (0),
|
||||
output_f16 (0),
|
||||
output_q2k (0),
|
||||
output_q4k (0),
|
||||
output_q5k (0),
|
||||
output_q6k (0),
|
||||
output_iq2xxs (0),
|
||||
output_q50 (0),
|
||||
output_q80 (0),
|
||||
output_iq1s (0),
|
||||
output_iq4nl (0),
|
||||
output_iq1m (0),
|
||||
layer_f32 (0),
|
||||
layer_f16 (0),
|
||||
layer_q2k (0),
|
||||
layer_q4k (0),
|
||||
layer_q50 (0),
|
||||
layer_q5k (0),
|
||||
layer_q6k (0),
|
||||
layer_q80 (0) {}
|
||||
layer_iq2xxs (0),
|
||||
layer_q50 (0),
|
||||
layer_q80 (0),
|
||||
layer_iq1s (0),
|
||||
layer_iq4nl (0),
|
||||
layer_iq1m (0)
|
||||
{}
|
||||
};
|
||||
|
||||
struct model_bytes {
|
||||
|
|
|
@ -385,12 +385,12 @@ extern "C" {
|
|||
GGML_TYPE_F64 = 28,
|
||||
GGML_TYPE_IQ1_M = 29,
|
||||
GGML_TYPE_BF16 = 30,
|
||||
GGML_TYPE_Q4_0_4_4 = 31,
|
||||
GGML_TYPE_Q4_0_4_8 = 32,
|
||||
GGML_TYPE_Q4_0_8_8 = 33,
|
||||
// GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
|
||||
// GGML_TYPE_Q4_0_4_8 = 32,
|
||||
// GGML_TYPE_Q4_0_8_8 = 33,
|
||||
GGML_TYPE_TQ1_0 = 34,
|
||||
GGML_TYPE_TQ2_0 = 35,
|
||||
GGML_TYPE_COUNT,
|
||||
GGML_TYPE_COUNT = 39,
|
||||
};
|
||||
|
||||
// precision
|
||||
|
@ -431,9 +431,6 @@ extern "C" {
|
|||
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
|
||||
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
|
|
|
@ -15725,15 +15725,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
|
|||
{
|
||||
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
|
||||
} break;
|
||||
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
|
|
|
@ -1076,54 +1076,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
.vec_dot_type = GGML_TYPE_BF16,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_4_4] = {
|
||||
.type_name = "q4_0_4x4",
|
||||
.blck_size = QK4_0,
|
||||
.blck_size_interleave = 4,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.gemv = ggml_gemv_q4_0_4x4_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_4_8] = {
|
||||
.type_name = "q4_0_4x8",
|
||||
.blck_size = QK4_0,
|
||||
.blck_size_interleave = 8,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 4,
|
||||
.gemv = ggml_gemv_q4_0_4x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_8_8] = {
|
||||
.type_name = "q4_0_8x8",
|
||||
.blck_size = QK4_0,
|
||||
.blck_size_interleave = 8,
|
||||
.type_size = sizeof(block_q4_0),
|
||||
.is_quantized = true,
|
||||
.to_float = NULL,
|
||||
.from_float = NULL,
|
||||
.from_float_ref = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
.ncols = 8,
|
||||
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
||||
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
||||
},
|
||||
[GGML_TYPE_TQ1_0] = {
|
||||
.type_name = "tq1_0",
|
||||
.blck_size = QK_K,
|
||||
|
@ -3578,9 +3530,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
|
|||
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
|
||||
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
|
||||
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
|
||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
|
||||
}
|
||||
|
@ -4107,7 +4056,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
|||
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
|
||||
/*.name =*/ { 0 },
|
||||
/*.extra =*/ NULL,
|
||||
///*.padding =*/ { 0 },
|
||||
// /*.padding =*/ { 0 },
|
||||
};
|
||||
|
||||
#ifdef __clang__
|
||||
|
@ -9517,9 +9466,6 @@ static void ggml_compute_forward_add(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_add_q_f32(params, dst);
|
||||
} break;
|
||||
|
@ -9897,9 +9843,6 @@ static void ggml_compute_forward_add1(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_add1_q_f32(params, dst);
|
||||
} break;
|
||||
|
@ -10027,9 +9970,6 @@ static void ggml_compute_forward_acc(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
|
@ -13093,9 +13033,6 @@ static void ggml_compute_forward_out_prod(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_out_prod_q_f32(params, dst);
|
||||
} break;
|
||||
|
@ -13283,9 +13220,6 @@ static void ggml_compute_forward_set(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
|
@ -13547,9 +13481,6 @@ static void ggml_compute_forward_get_rows(
|
|||
case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
{
|
||||
ggml_compute_forward_get_rows_q(params, dst);
|
||||
} break;
|
||||
|
@ -14139,9 +14070,6 @@ static void ggml_compute_forward_clamp(
|
|||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ2_S:
|
||||
case GGML_TYPE_Q8_K:
|
||||
case GGML_TYPE_Q4_0_4_4:
|
||||
case GGML_TYPE_Q4_0_4_8:
|
||||
case GGML_TYPE_Q4_0_8_8:
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
|
@ -21941,9 +21869,6 @@ size_t ggml_quantize_chunk(
|
|||
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
|
|
|
@ -165,18 +165,18 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors, 1 bit quantization
|
||||
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors, 1 bit quantization
|
||||
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
|
||||
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
||||
|
||||
|
|
201
src/llama.cpp
201
src/llama.cpp
|
@ -3560,16 +3560,26 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
|
|||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
return true;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return n_params->layer_q2k > 0 || n_params->output_q2k > 0;
|
||||
case GGML_TYPE_Q4_K:
|
||||
return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
|
||||
case GGML_TYPE_Q5_K:
|
||||
return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
return n_params->layer_iq2xxs > 0 || n_params->output_iq2xxs > 0;
|
||||
case GGML_TYPE_Q5_0:
|
||||
return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
return n_params->layer_iq1m > 0 || n_params->output_iq1m > 0;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized data type\n");
|
||||
}
|
||||
|
@ -3650,18 +3660,18 @@ void llama_profile_device(
|
|||
dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) {
|
||||
dev_info->cpu_props.flops_q2k_f32 = device_cpu_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_q2k_f32 = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) {
|
||||
dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
|
||||
dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) {
|
||||
dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32);
|
||||
|
@ -3674,11 +3684,42 @@ void llama_profile_device(
|
|||
dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) {
|
||||
dev_info->cpu_props.flops_iq2xxs_f32 = device_cpu_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_iq2xxs_f32= device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_iq2xxs_f32 = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
|
||||
dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) {
|
||||
dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) {
|
||||
dev_info->cpu_props.flops_iq1s_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_iq1s_f32= device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_iq1s_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) {
|
||||
dev_info->cpu_props.flops_iq4nl_f32 = device_cpu_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_iq4nl_f32= device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
|
||||
}
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) {
|
||||
dev_info->cpu_props.flops_iq1m_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads);
|
||||
dev_info->gpu_props.metal_flops_iq1m_f32= device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
|
||||
dev_info->gpu_props.cuda_flops_iq1m_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
|
||||
|
@ -4844,9 +4885,7 @@ struct llama_model_loader {
|
|||
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
||||
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
||||
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
||||
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
|
||||
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
|
||||
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
|
||||
|
||||
default:
|
||||
{
|
||||
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
||||
|
@ -5654,9 +5693,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
|
||||
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
|
@ -18997,10 +19033,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|||
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
||||
new_type = GGML_TYPE_IQ3_S;
|
||||
}
|
||||
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
|
||||
new_type == GGML_TYPE_Q4_0_8_8) {
|
||||
new_type = GGML_TYPE_Q4_0;
|
||||
}
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
|
||||
new_type = GGML_TYPE_Q4_K;
|
||||
}
|
||||
|
@ -19323,9 +19355,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
|
||||
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
|
||||
|
||||
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
||||
}
|
||||
|
@ -19646,14 +19675,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
int chunk_size_multiplier = 1;
|
||||
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
|
||||
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
|
||||
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
|
||||
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
|
||||
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
|
||||
fflush(stdout);
|
||||
|
||||
|
@ -19666,8 +19687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
const int64_t nrows = tensor->ne[1];
|
||||
|
||||
static const int64_t min_chunk_size = 32 * 512;
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
|
||||
chunk_size_multiplier;
|
||||
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
|
||||
|
||||
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
|
||||
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
|
||||
|
@ -21054,21 +21074,36 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
|
|||
case GGML_TYPE_F16:
|
||||
n_flops->output_f16_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
n_flops->output_q2k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
n_flops->output_q4k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_flops->output_q50_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
n_flops->output_q5k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
n_flops->output_q6k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
n_flops->output_iq2xxs_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_flops->output_q50_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_flops->output_q80_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
n_flops->output_iq1s_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
n_flops->output_iq4nl_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
n_flops->output_iq1m_f32 += n;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
|
||||
}
|
||||
|
@ -21082,21 +21117,36 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
|
|||
case GGML_TYPE_F16:
|
||||
n_flops->layer_f16_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
n_flops->layer_q2k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
n_flops->layer_q4k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_flops->layer_q50_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
n_flops->layer_q5k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
n_flops->layer_q6k_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
n_flops->layer_iq2xxs_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_flops->layer_q50_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_flops->layer_q80_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
n_flops->layer_iq1s_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
n_flops->layer_iq4nl_f32 += n;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
n_flops->layer_iq1m_f32 += n;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
|
||||
}
|
||||
|
@ -21118,21 +21168,36 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
|
|||
case GGML_TYPE_F16:
|
||||
n_params->input_f16 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
n_params->input_q2k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
n_params->input_q4k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_params->input_q50 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
n_params->input_q5k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
n_params->input_q6k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
n_params->input_iq2xxs += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_params->input_q50 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_params->input_q80 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
n_params->input_iq1s += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
n_params->input_iq4nl += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
n_params->input_iq1m += n_i64t;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
|
||||
}
|
||||
|
@ -21146,21 +21211,36 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
|
|||
case GGML_TYPE_F16:
|
||||
n_params->output_f16 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
n_params->output_q2k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
n_params->output_q4k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_params->output_q50 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
n_params->output_q5k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
n_params->output_q6k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
n_params->output_iq2xxs += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_params->output_q50 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_params->output_q80 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
n_params->output_iq1s += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
n_params->output_iq4nl += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
n_params->output_iq1m += n_i64t;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
|
||||
}
|
||||
|
@ -21174,21 +21254,36 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
|
|||
case GGML_TYPE_F16:
|
||||
n_params->layer_f16 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
n_params->layer_q2k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
n_params->layer_q4k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_params->layer_q50 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
n_params->layer_q5k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
n_params->layer_q6k += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
n_params->layer_iq2xxs += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
n_params->layer_q50 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
n_params->layer_q80 += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
n_params->layer_iq1s += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
n_params->layer_iq4nl += n_i64t;
|
||||
break;
|
||||
case GGML_TYPE_IQ1_M:
|
||||
n_params->layer_iq1m += n_i64t;
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
|
||||
}
|
||||
|
@ -21479,19 +21574,29 @@ void llama_model_n_flops(
|
|||
// use average values instead of total values
|
||||
n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
|
||||
n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
|
||||
n_flops->layer_q2k_f32 = static_cast<int64_t>((double)n_flops->layer_q2k_f32 / (double)n_layer);
|
||||
n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
|
||||
n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
|
||||
n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
|
||||
n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
|
||||
n_flops->layer_iq2xxs_f32 = static_cast<int64_t>((double)n_flops->layer_iq2xxs_f32 / (double)n_layer);
|
||||
n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
|
||||
n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
|
||||
n_flops->layer_iq1s_f32 = static_cast<int64_t>((double)n_flops->layer_iq1s_f32 / (double)n_layer);
|
||||
n_flops->layer_iq4nl_f32 = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32 / (double)n_layer);
|
||||
n_flops->layer_iq1m_f32 = static_cast<int64_t>((double)n_flops->layer_iq1m_f32 / (double)n_layer);
|
||||
|
||||
n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
|
||||
n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
|
||||
n_params->layer_q2k = static_cast<int64_t>((double)n_params->layer_q2k / (double)n_layer);
|
||||
n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
|
||||
n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer);
|
||||
n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
|
||||
n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
|
||||
n_params->layer_iq2xxs = static_cast<int64_t>((double)n_params->layer_iq2xxs / (double)n_layer);
|
||||
n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer);
|
||||
n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
|
||||
n_params->layer_iq1s = static_cast<int64_t>((double)n_params->layer_iq1s / (double)n_layer);
|
||||
n_params->layer_iq4nl = static_cast<int64_t>((double)n_params->layer_iq4nl / (double)n_layer);
|
||||
n_params->layer_iq1m = static_cast<int64_t>((double)n_params->layer_iq1m / (double)n_layer);
|
||||
|
||||
n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue