add support for Q5_K and fix byte count for Q6_K

Lizonghang 2024-12-06 07:59:45 +04:00
parent e6f4c009ab
commit f1c1d1b929
3 changed files with 190 additions and 53 deletions

View file

@@ -98,7 +98,7 @@ uint32_t device_cpu_cores() {
static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
const int n_repeat = 1;
- const int n_embd = llama_n_embd(model);
+ const int n_embd = std::min(llama_n_embd(model), 4096);
std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);
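A quick sanity check on the new 4096 cap (my arithmetic, not part of the commit): the probe multiplies two n_embd x n_embd matrices, so both operand size and FLOP count blow up with the embedding width, and clamping bounds the probe's memory footprint and runtime on large-embedding models.

    // C++ sketch: cost of one synthetic n_embd x n_embd GEMM
    #include <cstdint>
    #include <cstdio>
    #include <initializer_list>
    int main() {
        for (long long n : {4096LL, 8192LL}) {
            double mib   = (double)n * n * 4.0 / (1 << 20); // one F32 operand
            double gflop = 2.0 * n * n * n / 1e9;           // multiply-add count
            std::printf("n_embd=%5lld: %4.0f MiB per operand, %5.0f GFLOP\n", n, mib, gflop);
        }
    }
    // prints: n_embd= 4096:   64 MiB per operand,   137 GFLOP
    //         n_embd= 8192:  256 MiB per operand,  1100 GFLOP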
@@ -312,8 +312,10 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
break;
}
case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
+ case GGML_TYPE_Q8_K:
case GGML_TYPE_Q8_0:
QK_K = 256;
matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t));
break;
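For reference, the buffer math rests on ggml's K-quant block layouts (numbers below are from ggml's type tables, worked out here rather than stated in the commit): each K-quant block packs QK_K = 256 weights, so matrix_B needs embd_size / QK_K blocks of ggml_type_size(src0t) bytes each.

    Q4_K: 144 B / 256 weights = 4.5    bits per weight
    Q5_K: 176 B / 256 weights = 5.5    bits per weight
    Q6_K: 210 B / 256 weights = 6.5625 bits per weight
    e.g. a 4096-element Q5_K row needs (4096 / 256) * 176 = 2816 bytes.

One caveat worth double-checking: GGML_TYPE_Q8_0 uses 32-weight blocks (34 bytes each) in ggml, so folding it under QK_K = 256 allocates embd_size / 256 blocks where embd_size / 32 may be intended.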
@@ -930,7 +932,7 @@ float device_memory_bw(int n_thread) {
}
static float device_read_vram_bw(struct llama_model * model, enum profiler_backend_type btype) {
- const int n_embd = llama_n_embd(model) * 2;
+ const int n_embd = std::min(llama_n_embd(model) * 2, 4096 * 2);
std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
ggml_backend_t backend = NULL;
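Same motivation as the FLOPS probe above, with the same arithmetic (mine, not the commit's): at the 4096 * 2 = 8192 cap, matrix_A holds 8192^2 F32 values = 8192^2 * 4 B = 256 MiB, comfortably larger than any GPU cache (so the probe measures VRAM bandwidth rather than cache bandwidth) yet small enough to coexist with the loaded model in VRAM.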
@@ -1073,6 +1075,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / (double)gpu.cuda_flops_f32_f32 / 1e9;
gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / (double)gpu.cuda_flops_f16_f32 / 1e9;
gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / (double)gpu.cuda_flops_q4k_f32 / 1e9;
+ gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / (double)gpu.cuda_flops_q5k_f32 / 1e9;
gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / (double)gpu.cuda_flops_q6k_f32 / 1e9;
gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / (double)gpu.cuda_flops_q80_f32 / 1e9;
#elif GGML_USE_METAL
@@ -1081,6 +1084,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / (double)gpu.metal_flops_f32_f32 / 1e9;
gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / (double)gpu.metal_flops_f16_f32 / 1e9;
gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / (double)gpu.metal_flops_q4k_f32 / 1e9;
+ gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / (double)gpu.metal_flops_q5k_f32 / 1e9;
gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / (double)gpu.metal_flops_q6k_f32 / 1e9;
gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / (double)gpu.metal_flops_q80_f32 / 1e9;
#endif
@@ -1088,6 +1092,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / (double)cpu.flops_f32_f32 / 1e9;
cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / (double)cpu.flops_f16_f32 / 1e9;
cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / (double)cpu.flops_q4k_f32 / 1e9;
+ cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / (double)cpu.flops_q5k_f32 / 1e9;
cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / (double)cpu.flops_q6k_f32 / 1e9;
cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / (double)cpu.flops_q80_f32 / 1e9;
@@ -1105,6 +1110,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
total_latency += (double)n_flops.output_f32_f32 / (double)cpu.flops_f32_f32 / 1e9;
total_latency += (double)n_flops.output_f16_f32 / (double)cpu.flops_f16_f32 / 1e9;
total_latency += (double)n_flops.output_q4k_f32 / (double)cpu.flops_q4k_f32 / 1e9;
+ total_latency += (double)n_flops.output_q5k_f32 / (double)cpu.flops_q5k_f32 / 1e9;
total_latency += (double)n_flops.output_q6k_f32 / (double)cpu.flops_q6k_f32 / 1e9;
total_latency += (double)n_flops.output_q80_f32 / (double)cpu.flops_q80_f32 / 1e9;
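A units check on these terms (mine, not from the commit): the n_flops counters hold raw FLOPs while the flops_* fields are in GFLOPS, so each term is

    FLOPs / GFLOPS / 1e9 = FLOPs / FLOPS = seconds

e.g. 2e9 Q5K FLOPs against a 40-GFLOPS CPU contributes 2e9 / 40 / 1e9 = 0.05 s.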
@@ -1123,15 +1129,17 @@ static float device_memory_access_delay(struct device_info & dev_info, const str
int64_t layer_bytes =
n_params.layer_f32 * 4 +
n_params.layer_f16 * 2 +
- n_params.layer_q4k / 2 +
- n_params.layer_q6k * 3 / 8 +
+ n_params.layer_q4k * 4 / 8 +
+ n_params.layer_q5k * 5 / 8 +
+ n_params.layer_q6k * 6 / 8 +
n_params.layer_q80;
int64_t output_bytes =
n_params.output_f32 * 4 +
n_params.output_f16 * 2 +
- n_params.output_q4k / 2 +
- n_params.output_q6k * 3 / 8 +
+ n_params.output_q4k * 4 / 8 +
+ n_params.output_q5k * 5 / 8 +
+ n_params.output_q6k * 6 / 8 +
n_params.output_q80;
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL)
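This hunk is the "fix byte count for Q6_K" half of the commit title: the old 3/8 factor treated Q6_K as 3 bits per weight, while it actually packs 6. If these estimates ever need to be exact rather than nominal, ggml itself knows each type's true block layout, scales included; a small sketch, assuming ggml.h is visible here as it is in this file (quant_bytes is my name, not in the patch):

    // exact bytes for n contiguous elements of type t;
    // n must be a multiple of the block size (256 for K-quants)
    static int64_t quant_bytes(enum ggml_type t, int64_t n) {
        // ggml_row_size(t, n) = ggml_type_size(t) * n / ggml_blck_size(t),
        // e.g. Q6_K: 210 B per 256 weights = 6.5625 bits/weight,
        // slightly above the 6/8 = 6.0 bits/weight approximation used above
        return (int64_t) ggml_row_size(t, n);
    }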
@@ -1164,8 +1172,8 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
int64_t input_bytes = (
n_params.input_f32 * 4 +
n_params.input_f16 * 2 +
- n_params.input_q4k / 2 +
- n_params.input_q6k * 3 / 8 +
+ n_params.input_q4k * 4 / 8 +
+ n_params.input_q6k * 6 / 8 +
n_params.input_q80) / n_vocab; // lookup table, retrieve only n_embd elements
int64_t cpu_total_bytes = input_bytes;
@@ -1173,8 +1181,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
int64_t layer_bytes =
n_params.layer_f32 * 4 +
n_params.layer_f16 * 2 +
- n_params.layer_q4k / 2 +
- n_params.layer_q6k * 3 / 8 +
+ n_params.layer_q4k * 4 / 8 +
+ n_params.layer_q5k * 5 / 8 +
+ n_params.layer_q6k * 6 / 8 +
n_params.layer_q80;
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
@@ -1188,8 +1197,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
cpu_total_bytes += (
n_params.output_f32 * 4 +
n_params.output_f16 * 2 +
- n_params.output_q4k / 2 +
- n_params.output_q6k * 3 / 8 +
+ n_params.output_q4k * 4 / 8 +
+ n_params.output_q5k * 5 / 8 +
+ n_params.output_q6k * 6 / 8 +
n_params.output_q80);
uint64_t cpu_kv_size;
@@ -1292,6 +1302,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
}
LOG_INF("\n");
+ LOG_INF("| CPU flops (Q5K x F32, GFLOPS)");
+ for (int i = 0; i < n; ++i) {
+     LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32);
+ }
+ LOG_INF("\n");
LOG_INF("| CPU flops (Q6K x F32, GFLOPS)");
for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32);
@@ -1448,6 +1464,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
}
LOG_INF("\n");
+ LOG_INF("| Metal flops (Q5KxF32, GFLOPS)");
+ for (int i = 0; i < n; ++i) {
+     LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q5k_f32);
+ }
+ LOG_INF("\n");
LOG_INF("| Metal flops (Q6KxF32, GFLOPS)");
for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32);
@@ -1484,6 +1506,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
}
LOG_INF("\n");
+ LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) ");
+ for (int i = 0; i < n; ++i) {
+     LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q5k_f32);
+ }
+ LOG_INF("\n");
LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) ");
for (int i = 0; i < n; ++i) {
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32);
@@ -1508,6 +1536,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32);
LOG_INF("\n");
+ LOG_INF("| Model flops (output Q5KxF32) ");
+ LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q5k_f32);
+ LOG_INF("\n");
LOG_INF("| Model flops (output Q6KxF32) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32);
LOG_INF("\n");
@@ -1528,6 +1560,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32);
LOG_INF("\n");
+ LOG_INF("| Model flops (layer Q5KxF32) ");
+ LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q5k_f32);
+ LOG_INF("\n");
LOG_INF("| Model flops (layer Q6KxF32) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32);
LOG_INF("\n");
@@ -1548,6 +1584,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k);
LOG_INF("\n");
+ LOG_INF("| Model params (input Q5K) ");
+ LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q5k);
+ LOG_INF("\n");
LOG_INF("| Model params (input Q6K) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k);
LOG_INF("\n");
@@ -1568,6 +1608,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k);
LOG_INF("\n");
+ LOG_INF("| Model params (layer Q5K) ");
+ LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q5k);
+ LOG_INF("\n");
LOG_INF("| Model params (layer Q6K) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k);
LOG_INF("\n");
@@ -1588,6 +1632,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k);
LOG_INF("\n");
+ LOG_INF("| Model params (output Q5K) ");
+ LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q5k);
+ LOG_INF("\n");
LOG_INF("| Model params (output Q6K) ");
LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k);
LOG_INF("\n");
@@ -1628,12 +1676,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
+ gpu_description_len
+ sizeof(struct disk_props)
+ sizeof(uint32_t) // cpu_props.cores
- + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
+ + sizeof(float) * 6 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
+ sizeof(struct memory_info)
+ sizeof(struct gpu_support)
- + sizeof(float) * 14; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
-                       // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
-                       // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32
+ + sizeof(float) * 16; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
+                       // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
+                       // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32
*buffer = (char *)malloc(total_size);
char * ptr = *buffer;
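A note on keeping this in sync: total_size above, the memcpy sequence below, and the matching reads in deserialize() must all agree field-for-field and in the same order. One way to make that harder to break, sketched under my own naming (byte_writer is not in the patch):

    #include <cstring>
    struct byte_writer {
        char * ptr;
        template <typename T>
        void put(const T & v) { std::memcpy(ptr, &v, sizeof(T)); ptr += sizeof(T); }
    };
    // serialize: byte_writer w{*buffer}; w.put(dev_info->cpu_props.flops_q5k_f32); ...
    // a matching byte_reader would let deserialize() mirror the field order by construction.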
@@ -1684,6 +1732,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
ptr += sizeof(float);
+ memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float));
+ ptr += sizeof(float);
memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float));
ptr += sizeof(float);
@@ -1714,6 +1765,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
ptr += sizeof(float);
+ memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float));
+ ptr += sizeof(float);
memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
ptr += sizeof(float);
@@ -1732,6 +1786,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
ptr += sizeof(float);
+ memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float));
+ ptr += sizeof(float);
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));
ptr += sizeof(float);
@@ -1804,6 +1861,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
ptr += sizeof(float);
+ memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float));
+ ptr += sizeof(float);
memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float));
ptr += sizeof(float);
@@ -1834,6 +1894,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
ptr += sizeof(float);
+ memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float));
+ ptr += sizeof(float);
memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
ptr += sizeof(float);
@@ -1852,6 +1915,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
ptr += sizeof(float);
+ memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float));
+ ptr += sizeof(float);
memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));
ptr += sizeof(float);

View file

@@ -4,6 +4,7 @@
#include "ggml.h"
#include "llama.h"
+ #define EPS 1e-9f
#define DISK_TEST_TOTAL_BYTE 500L * 1024 * 1024
#define DISK_TEST_SEQ_BLOCK 100L * 1024 * 1024
#define DISK_TEST_RND_BLOCK 4096
@@ -17,6 +18,7 @@ struct cpu_props {
float flops_f32_f32; // in GFLOPS
float flops_f16_f32; // in GFLOPS
float flops_q4k_f32; // in GFLOPS
+ float flops_q5k_f32; // in GFLOPS
float flops_q6k_f32; // in GFLOPS
float flops_q80_f32; // in GFLOPS
@@ -24,11 +26,12 @@ struct cpu_props {
name(""),
description(""),
cores(0),
- flops_f32_f32(0.0f),
- flops_f16_f32(0.0f),
- flops_q4k_f32(0.0f),
- flops_q6k_f32(0.0f),
- flops_q80_f32(0.0f) {}
+ flops_f32_f32(EPS),
+ flops_f16_f32(EPS),
+ flops_q4k_f32(EPS),
+ flops_q5k_f32(EPS),
+ flops_q6k_f32(EPS),
+ flops_q80_f32(EPS) {}
};
struct memory_info {
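Why EPS rather than 0.0f (my reading of the change, not stated in the commit): these fields end up as divisors in device_compute_delay(), e.g.

    cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / (double)cpu.flops_q5k_f32 / 1e9;

With a 0.0f default, a dtype that was never profiled yields 0.0 / 0.0 = NaN and poisons the whole latency sum; with EPS = 1e-9f the term is exactly 0 when the model has no such tensors (the n_flops counter is 0) and merely very large when it unexpectedly does, so the scheduler's comparisons stay well-defined.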
@@ -74,12 +77,14 @@ struct gpu_props {
float metal_flops_f32_f32; // in GFLOPS
float metal_flops_f16_f32; // in GFLOPS
float metal_flops_q4k_f32; // in GFLOPS
+ float metal_flops_q5k_f32; // in GFLOPS
float metal_flops_q6k_f32; // in GFLOPS
float metal_flops_q80_f32; // in GFLOPS
float cuda_read_vram_bw; // in GB/s
float cuda_flops_f32_f32; // in GFLOPS
float cuda_flops_f16_f32; // in GFLOPS
float cuda_flops_q4k_f32; // in GFLOPS
+ float cuda_flops_q5k_f32; // in GFLOPS
float cuda_flops_q6k_f32; // in GFLOPS
float cuda_flops_q80_f32; // in GFLOPS
@@ -89,17 +94,19 @@ struct gpu_props {
memory_free (0.0f),
memory_total (0.0f),
metal_read_vram_bw (0.0f),
- metal_flops_f32_f32(0.0f),
- metal_flops_f16_f32(0.0f),
- metal_flops_q4k_f32(0.0f),
- metal_flops_q6k_f32(0.0f),
- metal_flops_q80_f32(0.0f),
+ metal_flops_f32_f32(EPS),
+ metal_flops_f16_f32(EPS),
+ metal_flops_q4k_f32(EPS),
+ metal_flops_q5k_f32(EPS),
+ metal_flops_q6k_f32(EPS),
+ metal_flops_q80_f32(EPS),
cuda_read_vram_bw (0.0f),
- cuda_flops_f32_f32 (0.0f),
- cuda_flops_f16_f32 (0.0f),
- cuda_flops_q4k_f32 (0.0f),
- cuda_flops_q6k_f32 (0.0f),
- cuda_flops_q80_f32 (0.0f) {}
+ cuda_flops_f32_f32 (EPS),
+ cuda_flops_f16_f32 (EPS),
+ cuda_flops_q4k_f32 (EPS),
+ cuda_flops_q5k_f32 (EPS),
+ cuda_flops_q6k_f32 (EPS),
+ cuda_flops_q80_f32 (EPS) {}
};
struct model_flops {
@@ -107,11 +114,13 @@ struct model_flops {
int64_t output_f32_f32;
int64_t output_f16_f32;
int64_t output_q4k_f32;
+ int64_t output_q5k_f32;
int64_t output_q6k_f32;
int64_t output_q80_f32;
int64_t layer_f32_f32;
int64_t layer_f16_f32;
int64_t layer_q4k_f32;
+ int64_t layer_q5k_f32;
int64_t layer_q6k_f32;
int64_t layer_q80_f32;
@@ -120,11 +129,13 @@ struct model_flops {
output_f32_f32(0),
output_f16_f32(0),
output_q4k_f32(0),
+ output_q5k_f32(0),
output_q6k_f32(0),
output_q80_f32(0),
layer_f32_f32 (0),
layer_f16_f32 (0),
layer_q4k_f32 (0),
+ layer_q5k_f32 (0),
layer_q6k_f32 (0),
layer_q80_f32 (0) {}
};
@@ -133,16 +144,19 @@ struct model_params {
int64_t input_f32;
int64_t input_f16;
int64_t input_q4k;
+ int64_t input_q5k;
int64_t input_q6k;
int64_t input_q80;
int64_t output_f32;
int64_t output_f16;
int64_t output_q4k;
+ int64_t output_q5k;
int64_t output_q6k;
int64_t output_q80;
int64_t layer_f32;
int64_t layer_f16;
int64_t layer_q4k;
+ int64_t layer_q5k;
int64_t layer_q6k;
int64_t layer_q80;
@@ -150,16 +164,19 @@ struct model_params {
input_f32 (0),
input_f16 (0),
input_q4k (0),
+ input_q5k (0),
input_q6k (0),
input_q80 (0),
output_f32(0),
output_f16(0),
output_q4k(0),
+ output_q5k(0),
output_q6k(0),
output_q80(0),
layer_f32 (0),
layer_f16 (0),
layer_q4k (0),
+ layer_q5k (0),
layer_q6k (0),
layer_q80 (0) {}
};

View file

@@ -3552,15 +3552,36 @@ void llama_perf_context_sync(struct llama_context * ctx, const struct llama_mode
ctx->t_load_us = model->t_load_us;
}
+ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype) {
+     switch (dtype) {
+         case GGML_TYPE_F32:
+         case GGML_TYPE_F16:
+             return true;
+         case GGML_TYPE_Q4_K:
+             return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
+         case GGML_TYPE_Q5_K:
+             return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
+         case GGML_TYPE_Q6_K:
+             return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
+         case GGML_TYPE_Q8_0:
+             return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
+         default:
+             throw std::runtime_error("Unrecognized data type\n");
+     }
+ }
void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) {
+     struct model_flops * n_flops = &dev_info->model_flops;
+     struct model_params * n_params = &dev_info->model_params;
+     if (dev_info->rank == 0) {
+         enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
+         llama_model_n_flops(model, ml, n_flops, n_params, 1, 32, &inp_embd_dtype);
+         n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
+     }
dev_info->device_name = device_name();
dev_info->cpu_props.cores = device_cpu_cores();
- dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
- dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads);
- dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
- dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
- dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100;
dev_info->memory.total_swap = round(device_swap_memory(false) / (double)(1 << 30) * 100) / 100;
@@ -3591,24 +3612,42 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw(model);
- dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32);
- dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32);
- dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
- dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
- dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw(model);
- dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32);
- dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
- dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
- dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
- dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
- if (dev_info->rank == 0) {
-     struct model_flops * n_flops = &dev_info->model_flops;
-     struct model_params * n_params = &dev_info->model_params;
-     enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
-     llama_model_n_flops(model, ml, n_flops, n_params, 1, 32, &inp_embd_dtype);
-     n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
- }
+ if (is_dtype_exist(n_params, GGML_TYPE_F32)) {
+     dev_info->cpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
+     dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32);
+     dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32);
+ }
+ if (is_dtype_exist(n_params, GGML_TYPE_F16)) {
+     dev_info->cpu_props.flops_f16_f32 = device_cpu_flops (model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads);
+     dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32);
+     dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
+ }
+ if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) {
+     dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
+     dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
+     dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
+ }
+ if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) {
+     dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads);
+     dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32);
+     dev_info->gpu_props.cuda_flops_q5k_f32 = device_cuda_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32);
+ }
+ if (is_dtype_exist(n_params, GGML_TYPE_Q6_K)) {
+     dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
+     dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
+     dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
+ }
+ if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) {
+     dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
+     dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
+     dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
+ }
}
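The net effect of the guard (an example I worked out from the patch, not stated in it): for a model quantized as Q4_K_M, n_params->layer_q5k and n_params->output_q5k stay 0, so is_dtype_exist(n_params, GGML_TYPE_Q5_K) is false and the Q5_K GEMM probes on CPU, Metal, and CUDA are all skipped; is_dtype_exist(n_params, GGML_TYPE_Q6_K) is true (Q4_K_M typically keeps a few tensors in Q6_K), so those probes still run. F32 and F16 always return true and are profiled unconditionally.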
@@ -20699,6 +20738,9 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
case GGML_TYPE_Q4_K:
n_flops->output_q4k_f32 += n;
break;
+ case GGML_TYPE_Q5_K:
+     n_flops->output_q5k_f32 += n;
+     break;
case GGML_TYPE_Q6_K:
n_flops->output_q6k_f32 += n;
break;
@@ -20721,6 +20763,9 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
case GGML_TYPE_Q4_K:
n_flops->layer_q4k_f32 += n;
break;
+ case GGML_TYPE_Q5_K:
+     n_flops->layer_q5k_f32 += n;
+     break;
case GGML_TYPE_Q6_K:
n_flops->layer_q6k_f32 += n;
break;
@@ -20751,6 +20796,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
case GGML_TYPE_Q4_K:
n_params->input_q4k += n_i64t;
break;
+ case GGML_TYPE_Q5_K:
+     n_params->input_q5k += n_i64t;
+     break;
case GGML_TYPE_Q6_K:
n_params->input_q6k += n_i64t;
break;
@@ -20773,6 +20821,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
case GGML_TYPE_Q4_K:
n_params->output_q4k += n_i64t;
break;
+ case GGML_TYPE_Q5_K:
+     n_params->output_q5k += n_i64t;
+     break;
case GGML_TYPE_Q6_K:
n_params->output_q6k += n_i64t;
break;
@@ -20795,6 +20846,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
case GGML_TYPE_Q4_K:
n_params->layer_q4k += n_i64t;
break;
+ case GGML_TYPE_Q5_K:
+     n_params->layer_q5k += n_i64t;
+     break;
case GGML_TYPE_Q6_K:
n_params->layer_q6k += n_i64t;
break;