Mirror of https://github.com/Lizonghang/prima.cpp.git, synced 2025-09-11 15:24:38 +00:00

commit 550fdcbc4f (parent 96e68679ce)

    add support for Q5_0

4 changed files with 88 additions and 5 deletions

@@ -904,6 +904,7 @@ static bool assign_layers_to_device(
         master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
         master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
         master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
         master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
         master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
         master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms

@@ -923,6 +924,7 @@ static bool assign_layers_to_device(
         master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
         master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
         master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
         master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
         master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
         master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms

@@ -933,6 +935,7 @@ static bool assign_layers_to_device(
         master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
         master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
         master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
         master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
         master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
         master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
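
Note: the three hunks above instantiate the same per-layer latency model for CPU, Metal, and CUDA — each dtype's per-layer FLOPs divided by the device's measured throughput for that dtype, so Q5_0 only needs one new term per variant. A minimal standalone sketch (simplified from the diff; the struct packaging and the EPS value are assumptions, not prima.cpp's actual code):

```cpp
#include <cstdio>

// Hypothetical container holding either per-layer FLOP counts or measured
// GFLOPS per dtype; prima.cpp uses model_flops / cpu_props / gpu_props instead.
struct flops_profile { double f32_f32, f16_f32, q4k_f32, q50_f32, q5k_f32, q6k_f32, q80_f32; };

static double layer_latency_ms(const flops_profile & flops, const flops_profile & gflops) {
    const double EPS = 1e-9;  // assumed guard against division by zero
    double t = flops.f32_f32 / (gflops.f32_f32 * 1e9 + EPS)
             + flops.f16_f32 / (gflops.f16_f32 * 1e9 + EPS)
             + flops.q4k_f32 / (gflops.q4k_f32 * 1e9 + EPS)
             + flops.q50_f32 / (gflops.q50_f32 * 1e9 + EPS)   // new Q5_0 term
             + flops.q5k_f32 / (gflops.q5k_f32 * 1e9 + EPS)
             + flops.q6k_f32 / (gflops.q6k_f32 * 1e9 + EPS)
             + flops.q80_f32 / (gflops.q80_f32 * 1e9 + EPS);
    return t * 1000.0;  // GFLOPS * 1e9 gives FLOPS, so t is seconds; convert to ms
}
```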

@@ -1113,6 +1116,7 @@ static bool assign_layers_to_device(
         dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
         dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
         dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
+        dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
         dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
         dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
         dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms

@@ -365,6 +365,7 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
             break;
         }
         case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_Q8_K:
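
Note: GGML_TYPE_Q5_0 is added as a bare case label, falling through to the branch the other quantized types already share. A sketch of the pattern, with the shared body elided (assumption: the enclosing switch dispatches on src0t, as the hunk header suggests):

```cpp
switch (src0t) {
    case GGML_TYPE_Q4_K:
    case GGML_TYPE_Q5_0:   // new: handled like the existing quantized types
    case GGML_TYPE_Q5_K:
    case GGML_TYPE_Q6_K:
    case GGML_TYPE_Q8_K:
        // shared quantized-embedding delay path (elided)
        break;
    default:
        break;
}
```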

@@ -1349,6 +1350,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
         gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9;
+        gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9;

@@ -1358,6 +1360,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
         gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9;
+        gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9;
         gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9;

@@ -1366,6 +1369,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
         cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9;
         cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9;
         cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9;
+        cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
         cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9;
         cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9;
         cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9;

@@ -1384,6 +1388,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
     total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9;
     total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9;
     total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
     total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9;
     total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9;
     total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9;

@@ -1697,6 +1702,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| CPU flops (Q50 x F32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| CPU flops (Q5K x F32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32);
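
Note: with the `%-10.1f` format above, the new row renders in the same table layout as the existing ones; for two devices with hypothetical measurements of 25.3 and 7.8 GFLOPS it would print roughly:

```
| CPU flops (Q50 x F32, GFLOPS)| 25.3       | 7.8        
```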

@@ -1877,6 +1888,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| Metal flops (Q50xF32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Metal flops (Q5KxF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q5k_f32);

@@ -1925,6 +1942,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| CUDA flops (Q50xF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q5k_f32);

@@ -2112,12 +2135,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
         + gpu_description_len
         + sizeof(struct disk_props)
         + sizeof(uint32_t)    // cpu_props.cores
-        + sizeof(float) * 6   // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
+        + sizeof(float) * 7   // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q50_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
         + sizeof(struct memory_info)
         + sizeof(struct gpu_support)
-        + sizeof(float) * 18; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
+        + sizeof(float) * 20; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
-                              // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
+                              // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q50_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
-                              // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32,
+                              // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q50_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32,
                               // gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay
 
     *buffer = (char *)malloc(total_size);
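
Note: the buffer is a flat, untagged byte stream, so the size constants above (6 -> 7 cpu_props floats; 18 -> 20 gpu_props floats, one new Metal slot and one new CUDA slot) are the only schema, and serialize()/deserialize() below must append and read flops_q50_f32 at the same position, between the Q4_K and Q5_K fields. A minimal sketch of the copy-and-advance pattern (helper names are hypothetical, not part of prima.cpp):

```cpp
#include <cstring>

// Write one float and advance the cursor; the order of put_f32 calls in
// serialize() must exactly mirror the order of get_f32 calls in deserialize().
static void put_f32(char *& ptr, float v) {
    std::memcpy(ptr, &v, sizeof(float));
    ptr += sizeof(float);
}

static void get_f32(const char *& ptr, float & v) {
    std::memcpy(&v, ptr, sizeof(float));
    ptr += sizeof(float);
}
```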

@@ -2174,6 +2197,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float));
     ptr += sizeof(float);
 
@@ -2207,6 +2233,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float));
     ptr += sizeof(float);
 
@@ -2231,6 +2260,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float));
     ptr += sizeof(float);
 
@@ -2317,6 +2349,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
@@ -2350,6 +2385,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
@@ -2374,6 +2412,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 

@@ -18,6 +18,7 @@ struct cpu_props {
     float flops_f32_f32; // in GFLOPS
     float flops_f16_f32; // in GFLOPS
     float flops_q4k_f32; // in GFLOPS
+    float flops_q50_f32; // in GFLOPS
     float flops_q5k_f32; // in GFLOPS
     float flops_q6k_f32; // in GFLOPS
     float flops_q80_f32; // in GFLOPS

@@ -29,6 +30,7 @@ struct cpu_props {
     flops_f32_f32(0.0f),
     flops_f16_f32(0.0f),
     flops_q4k_f32(0.0f),
+    flops_q50_f32(0.0f),
     flops_q5k_f32(0.0f),
     flops_q6k_f32(0.0f),
     flops_q80_f32(0.0f) {}

@@ -81,6 +83,7 @@ struct gpu_props {
     float metal_flops_f32_f32; // in GFLOPS
     float metal_flops_f16_f32; // in GFLOPS
     float metal_flops_q4k_f32; // in GFLOPS
+    float metal_flops_q50_f32; // in GFLOPS
     float metal_flops_q5k_f32; // in GFLOPS
     float metal_flops_q6k_f32; // in GFLOPS
     float metal_flops_q80_f32; // in GFLOPS

@@ -89,6 +92,7 @@ struct gpu_props {
     float cuda_flops_f32_f32; // in GFLOPS
     float cuda_flops_f16_f32; // in GFLOPS
     float cuda_flops_q4k_f32; // in GFLOPS
+    float cuda_flops_q50_f32; // in GFLOPS
     float cuda_flops_q5k_f32; // in GFLOPS
     float cuda_flops_q6k_f32; // in GFLOPS
     float cuda_flops_q80_f32; // in GFLOPS

@@ -103,6 +107,7 @@ struct gpu_props {
     metal_flops_f32_f32(0.0f),
     metal_flops_f16_f32(0.0f),
     metal_flops_q4k_f32(0.0f),
+    metal_flops_q50_f32(0.0f),
     metal_flops_q5k_f32(0.0f),
     metal_flops_q6k_f32(0.0f),
     metal_flops_q80_f32(0.0f),

@@ -111,6 +116,7 @@ struct gpu_props {
     cuda_flops_f32_f32 (0.0f),
     cuda_flops_f16_f32 (0.0f),
     cuda_flops_q4k_f32 (0.0f),
+    cuda_flops_q50_f32 (0.0f),
     cuda_flops_q5k_f32 (0.0f),
     cuda_flops_q6k_f32 (0.0f),
     cuda_flops_q80_f32 (0.0f),

@@ -122,12 +128,14 @@ struct model_flops {
     int64_t output_f32_f32;
     int64_t output_f16_f32;
     int64_t output_q4k_f32;
+    int64_t output_q50_f32;
     int64_t output_q5k_f32;
     int64_t output_q6k_f32;
     int64_t output_q80_f32;
     int64_t layer_f32_f32;
     int64_t layer_f16_f32;
     int64_t layer_q4k_f32;
+    int64_t layer_q50_f32;
     int64_t layer_q5k_f32;
     int64_t layer_q6k_f32;
     int64_t layer_q80_f32;

@@ -137,12 +145,14 @@ struct model_flops {
     output_f32_f32(0),
     output_f16_f32(0),
     output_q4k_f32(0),
+    output_q50_f32(0),
     output_q5k_f32(0),
     output_q6k_f32(0),
     output_q80_f32(0),
     layer_f32_f32 (0),
     layer_f16_f32 (0),
     layer_q4k_f32 (0),
+    layer_q50_f32 (0),
     layer_q5k_f32 (0),
     layer_q6k_f32 (0),
     layer_q80_f32 (0) {}

@@ -152,18 +162,21 @@ struct model_params {
     int64_t input_f32;
     int64_t input_f16;
    int64_t input_q4k;
+    int64_t input_q50;
     int64_t input_q5k;
     int64_t input_q6k;
     int64_t input_q80;
     int64_t output_f32;
     int64_t output_f16;
     int64_t output_q4k;
+    int64_t output_q50;
     int64_t output_q5k;
     int64_t output_q6k;
     int64_t output_q80;
     int64_t layer_f32;
     int64_t layer_f16;
     int64_t layer_q4k;
+    int64_t layer_q50;
     int64_t layer_q5k;
     int64_t layer_q6k;
     int64_t layer_q80;

@@ -172,18 +185,21 @@ struct model_params {
     input_f32 (0),
     input_f16 (0),
     input_q4k (0),
+    input_q50 (0),
     input_q5k (0),
     input_q6k (0),
     input_q80 (0),
     output_f32(0),
     output_f16(0),
     output_q4k(0),
+    output_q50(0),
     output_q5k(0),
     output_q6k(0),
     output_q80(0),
     layer_f32 (0),
     layer_f16 (0),
     layer_q4k (0),
+    layer_q50 (0),
     layer_q5k (0),
     layer_q6k (0),
     layer_q80 (0) {}

@@ -3561,6 +3561,8 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
             return true;
         case GGML_TYPE_Q4_K:
             return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
+        case GGML_TYPE_Q5_0:
+            return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
         case GGML_TYPE_Q5_K:
             return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
         case GGML_TYPE_Q6_K:

@@ -3653,6 +3655,12 @@ void llama_profile_device(
         dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     }
 
+    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
+        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_q50_f32  = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+    }
+
     if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) {
         dev_info->cpu_props.flops_q5k_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32);

@@ -21045,6 +21053,9 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                 case GGML_TYPE_Q4_K:
                     n_flops->output_q4k_f32 += n;
                     break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->output_q50_f32 += n;
+                    break;
                 case GGML_TYPE_Q5_K:
                     n_flops->output_q5k_f32 += n;
                     break;

@@ -21070,6 +21081,9 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                 case GGML_TYPE_Q4_K:
                     n_flops->layer_q4k_f32 += n;
                     break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->layer_q50_f32 += n;
+                    break;
                 case GGML_TYPE_Q5_K:
                     n_flops->layer_q5k_f32 += n;
                     break;

@@ -21103,6 +21117,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_Q4_K:
                     n_params->input_q4k += n_i64t;
                     break;
+                case GGML_TYPE_Q5_0:
+                    n_params->input_q50 += n_i64t;
+                    break;
                 case GGML_TYPE_Q5_K:
                     n_params->input_q5k += n_i64t;
                     break;

@@ -21128,6 +21145,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_Q4_K:
                     n_params->output_q4k += n_i64t;
                     break;
+                case GGML_TYPE_Q5_0:
+                    n_params->output_q50 += n_i64t;
+                    break;
                 case GGML_TYPE_Q5_K:
                     n_params->output_q5k += n_i64t;
                     break;

@@ -21153,6 +21173,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                 case GGML_TYPE_Q4_K:
                     n_params->layer_q4k += n_i64t;
                     break;
+                case GGML_TYPE_Q5_0:
+                    n_params->layer_q50 += n_i64t;
+                    break;
                 case GGML_TYPE_Q5_K:
                     n_params->layer_q5k += n_i64t;
                     break;
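
Note: count_n_flops and count_n_params each repeat one such switch per tensor class (input/output/layer), so supporting a new dtype means adding the same three-line case to every copy. A condensed sketch of the layer-tensor copy (assumption: dtype and n_i64t are supplied by the enclosing loop, as in the hunks above):

```cpp
switch (dtype) {
    case GGML_TYPE_Q4_K: n_params->layer_q4k += n_i64t; break;
    case GGML_TYPE_Q5_0: n_params->layer_q50 += n_i64t; break; // added by this commit
    case GGML_TYPE_Q5_K: n_params->layer_q5k += n_i64t; break;
    case GGML_TYPE_Q6_K: n_params->layer_q6k += n_i64t; break;
    default: break;
}
```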

@@ -21362,7 +21385,6 @@ void llama_model_n_flops(
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
-
             llm_load_llama_tensors(*ml, *model, ctx_map, 1, 0, n_layer_window, &use_mmap_buffer, false);
             break;
         case LLM_ARCH_QWEN2: