diff --git a/common/profiler.cpp b/common/profiler.cpp index aa5f388b..a19c529b 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -98,7 +98,7 @@ uint32_t device_cpu_cores() { static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) { const int n_repeat = 1; - const int n_embd = llama_n_embd(model); + const int n_embd = std::min(llama_n_embd(model), 4096); std::vector matrix_A(n_embd * n_embd, 1.0f); std::vector matrix_B(n_embd * n_embd, 1.0f / n_embd); @@ -312,8 +312,13 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in break; } case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_Q8_K: QK_K = 256; matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t)); break; + case GGML_TYPE_Q8_0: + QK_K = 32; + matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t)); + break; @@ -930,7 +932,7 @@ float device_memory_bw(int n_thread) { } static float device_read_vram_bw(struct llama_model * model, enum profiler_backend_type btype) { - const int n_embd = llama_n_embd(model) * 2; + const int n_embd = std::min(llama_n_embd(model) * 2, 4096 * 2); std::vector matrix_A(n_embd * n_embd, 1.0f); ggml_backend_t backend = NULL; @@ -1073,6 +1075,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / (double)gpu.cuda_flops_f32_f32 / 1e9; gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / (double)gpu.cuda_flops_f16_f32 / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / (double)gpu.cuda_flops_q4k_f32 / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / (double)gpu.cuda_flops_q5k_f32 / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / (double)gpu.cuda_flops_q6k_f32 / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / (double)gpu.cuda_flops_q80_f32 / 1e9; #elif GGML_USE_METAL @@ -1081,6 +1084,7 @@ static float device_compute_delay(struct device_info & dev_info, int
n_layers, c gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / (double)gpu.metal_flops_f32_f32 / 1e9; gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / (double)gpu.metal_flops_f16_f32 / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / (double)gpu.metal_flops_q4k_f32 / 1e9; + gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / (double)gpu.metal_flops_q5k_f32 / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / (double)gpu.metal_flops_q6k_f32 / 1e9; gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / (double)gpu.metal_flops_q80_f32 / 1e9; #endif @@ -1088,6 +1092,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / (double)cpu.flops_f32_f32 / 1e9; cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / (double)cpu.flops_f16_f32 / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / (double)cpu.flops_q4k_f32 / 1e9; + cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / (double)cpu.flops_q5k_f32 / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / (double)cpu.flops_q6k_f32 / 1e9; cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / (double)cpu.flops_q80_f32 / 1e9; @@ -1105,6 +1110,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c total_latency += (double)n_flops.output_f32_f32 / (double)cpu.flops_f32_f32 / 1e9; total_latency += (double)n_flops.output_f16_f32 / (double)cpu.flops_f16_f32 / 1e9; total_latency += (double)n_flops.output_q4k_f32 / (double)cpu.flops_q4k_f32 / 1e9; + total_latency += (double)n_flops.output_q5k_f32 / (double)cpu.flops_q5k_f32 / 1e9; total_latency += (double)n_flops.output_q6k_f32 / (double)cpu.flops_q6k_f32 / 1e9; total_latency += (double)n_flops.output_q80_f32 / (double)cpu.flops_q80_f32 / 1e9; @@ -1123,15 +1129,17 @@ static float device_memory_access_delay(struct device_info & dev_info, const str int64_t layer_bytes = n_params.layer_f32 * 4 + 
n_params.layer_f16 * 2 + - n_params.layer_q4k / 2 + - n_params.layer_q6k * 3 / 8 + + n_params.layer_q4k * 4 / 8 + + n_params.layer_q5k * 5 / 8 + + n_params.layer_q6k * 6 / 8 + n_params.layer_q80; int64_t output_bytes = n_params.output_f32 * 4 + n_params.output_f16 * 2 + - n_params.output_q4k / 2 + - n_params.output_q6k * 3 / 8 + + n_params.output_q4k * 4 / 8 + + n_params.output_q5k * 5 / 8 + + n_params.output_q6k * 6 / 8 + n_params.output_q80; #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) @@ -1164,8 +1172,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam int64_t input_bytes = ( n_params.input_f32 * 4 + n_params.input_f16 * 2 + - n_params.input_q4k / 2 + - n_params.input_q6k * 3 / 8 + + n_params.input_q4k * 4 / 8 + + n_params.input_q5k * 5 / 8 + + n_params.input_q6k * 6 / 8 + n_params.input_q80) / n_vocab; // lookup table, retrieve only n_embd elements int64_t cpu_total_bytes = input_bytes; @@ -1173,8 +1181,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam int64_t layer_bytes = n_params.layer_f32 * 4 + n_params.layer_f16 * 2 + - n_params.layer_q4k / 2 + - n_params.layer_q6k * 3 / 8 + + n_params.layer_q4k * 4 / 8 + + n_params.layer_q5k * 5 / 8 + + n_params.layer_q6k * 6 / 8 + n_params.layer_q80; #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA) @@ -1188,8 +1197,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam cpu_total_bytes += ( n_params.output_f32 * 4 + n_params.output_f16 * 2 + - n_params.output_q4k / 2 + - n_params.output_q6k * 3 / 8 + + n_params.output_q4k * 4 / 8 + + n_params.output_q5k * 5 / 8 + + n_params.output_q6k * 6 / 8 + n_params.output_q80); uint64_t cpu_kv_size; @@ -1292,6 +1302,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CPU flops (Q5K x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32); + } + LOG_INF("\n"); + LOG_INF("| CPU
flops (Q6K x F32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32); @@ -1448,6 +1464,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| Metal flops (Q5KxF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q5k_f32); + } + LOG_INF("\n"); + LOG_INF("| Metal flops (Q6KxF32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32); @@ -1484,6 +1506,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q5k_f32); + } + LOG_INF("\n"); + LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32); @@ -1508,6 +1536,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32); LOG_INF("\n"); + LOG_INF("| Model flops (output Q5KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q5k_f32); + LOG_INF("\n"); + LOG_INF("| Model flops (output Q6KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32); LOG_INF("\n"); @@ -1528,6 +1560,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32); LOG_INF("\n"); + LOG_INF("| Model flops (layer Q5KxF32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q5k_f32); + LOG_INF("\n"); + LOG_INF("| Model flops (layer Q6KxF32) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32); LOG_INF("\n"); @@ -1548,6 +1584,10 @@ void device_print_props(struct device_info * 
dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); LOG_INF("\n"); + LOG_INF("| Model params (input Q5K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q5k); + LOG_INF("\n"); + LOG_INF("| Model params (input Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k); LOG_INF("\n"); @@ -1568,6 +1608,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); LOG_INF("\n"); + LOG_INF("| Model params (layer Q5K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q5k); + LOG_INF("\n"); + LOG_INF("| Model params (layer Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k); LOG_INF("\n"); @@ -1588,6 +1632,10 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); LOG_INF("\n"); + LOG_INF("| Model params (output Q5K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q5k); + LOG_INF("\n"); + LOG_INF("| Model params (output Q6K) "); LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k); LOG_INF("\n"); @@ -1628,12 +1676,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + gpu_description_len + sizeof(struct disk_props) + sizeof(uint32_t) // cpu_props.cores - + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + + sizeof(float) * 6 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 14; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw, - // 
gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32, - // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32 + + sizeof(float) * 16; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw, + // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32, + // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32 *buffer = (char *)malloc(total_size); char * ptr = *buffer; @@ -1684,6 +1732,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float)); ptr += sizeof(float); @@ -1714,6 +1765,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); @@ -1732,6 +1786,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); @@ -1804,6 
+1861,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); @@ -1834,6 +1894,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); @@ -1852,6 +1915,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); diff --git a/common/profiler.h b/common/profiler.h index a0a15b49..18176cc4 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -4,6 +4,7 @@ #include "ggml.h" #include "llama.h" +#define EPS 1e-9f #define DISK_TEST_TOTAL_BYTE 500L * 1024 * 1024 #define DISK_TEST_SEQ_BLOCK 100L * 1024 * 1024 #define DISK_TEST_RND_BLOCK 4096 @@ -17,6 +18,7 @@ struct cpu_props { float flops_f32_f32; // in GFLOPS float flops_f16_f32; // in GFLOPS float flops_q4k_f32; // in GFLOPS + float flops_q5k_f32; // in GFLOPS float flops_q6k_f32; // in GFLOPS float flops_q80_f32; // in GFLOPS @@ -24,11 +26,12 @@ struct cpu_props { name(""), description(""), cores(0), - flops_f32_f32(0.0f), - flops_f16_f32(0.0f), - flops_q4k_f32(0.0f), - flops_q6k_f32(0.0f), - flops_q80_f32(0.0f) {} + flops_f32_f32(EPS), + flops_f16_f32(EPS), + flops_q4k_f32(EPS), + flops_q5k_f32(EPS), + flops_q6k_f32(EPS), + flops_q80_f32(EPS) {} }; 
struct memory_info { @@ -74,12 +77,14 @@ struct gpu_props { float metal_flops_f32_f32; // in GFLOPS float metal_flops_f16_f32; // in GFLOPS float metal_flops_q4k_f32; // in GFLOPS + float metal_flops_q5k_f32; // in GFLOPS float metal_flops_q6k_f32; // in GFLOPS float metal_flops_q80_f32; // in GFLOPS float cuda_read_vram_bw; // in GB/s float cuda_flops_f32_f32; // in GFLOPS float cuda_flops_f16_f32; // in GFLOPS float cuda_flops_q4k_f32; // in GFLOPS + float cuda_flops_q5k_f32; // in GFLOPS float cuda_flops_q6k_f32; // in GFLOPS float cuda_flops_q80_f32; // in GFLOPS @@ -89,17 +94,19 @@ struct gpu_props { memory_free (0.0f), memory_total (0.0f), metal_read_vram_bw (0.0f), - metal_flops_f32_f32(0.0f), - metal_flops_f16_f32(0.0f), - metal_flops_q4k_f32(0.0f), - metal_flops_q6k_f32(0.0f), - metal_flops_q80_f32(0.0f), + metal_flops_f32_f32(EPS), + metal_flops_f16_f32(EPS), + metal_flops_q4k_f32(EPS), + metal_flops_q5k_f32(EPS), + metal_flops_q6k_f32(EPS), + metal_flops_q80_f32(EPS), cuda_read_vram_bw (0.0f), - cuda_flops_f32_f32 (0.0f), - cuda_flops_f16_f32 (0.0f), - cuda_flops_q4k_f32 (0.0f), - cuda_flops_q6k_f32 (0.0f), - cuda_flops_q80_f32 (0.0f) {} + cuda_flops_f32_f32 (EPS), + cuda_flops_f16_f32 (EPS), + cuda_flops_q4k_f32 (EPS), + cuda_flops_q5k_f32 (EPS), + cuda_flops_q6k_f32 (EPS), + cuda_flops_q80_f32 (EPS) {} }; struct model_flops { @@ -107,11 +114,13 @@ struct model_flops { int64_t output_f32_f32; int64_t output_f16_f32; int64_t output_q4k_f32; + int64_t output_q5k_f32; int64_t output_q6k_f32; int64_t output_q80_f32; int64_t layer_f32_f32; int64_t layer_f16_f32; int64_t layer_q4k_f32; + int64_t layer_q5k_f32; int64_t layer_q6k_f32; int64_t layer_q80_f32; @@ -120,11 +129,13 @@ struct model_flops { output_f32_f32(0), output_f16_f32(0), output_q4k_f32(0), + output_q5k_f32(0), output_q6k_f32(0), output_q80_f32(0), layer_f32_f32 (0), layer_f16_f32 (0), layer_q4k_f32 (0), + layer_q5k_f32 (0), layer_q6k_f32 (0), layer_q80_f32 (0) {} }; @@ -133,16 +144,19 @@ struct 
model_params { int64_t input_f32; int64_t input_f16; int64_t input_q4k; + int64_t input_q5k; int64_t input_q6k; int64_t input_q80; int64_t output_f32; int64_t output_f16; int64_t output_q4k; + int64_t output_q5k; int64_t output_q6k; int64_t output_q80; int64_t layer_f32; int64_t layer_f16; int64_t layer_q4k; + int64_t layer_q5k; int64_t layer_q6k; int64_t layer_q80; @@ -150,16 +164,19 @@ struct model_params { input_f32 (0), input_f16 (0), input_q4k (0), + input_q5k (0), input_q6k (0), input_q80 (0), output_f32(0), output_f16(0), output_q4k(0), + output_q5k(0), output_q6k(0), output_q80(0), layer_f32 (0), layer_f16 (0), layer_q4k (0), + layer_q5k (0), layer_q6k (0), layer_q80 (0) {} }; diff --git a/src/llama.cpp b/src/llama.cpp index 363364de..29d62c4a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3552,15 +3552,36 @@ void llama_perf_context_sync(struct llama_context * ctx, const struct llama_mode ctx->t_load_us = model->t_load_us; } +static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype) { + switch (dtype) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + return true; + case GGML_TYPE_Q4_K: + return n_params->layer_q4k > 0 || n_params->output_q4k > 0; + case GGML_TYPE_Q5_K: + return n_params->layer_q5k > 0 || n_params->output_q5k > 0; + case GGML_TYPE_Q6_K: + return n_params->layer_q6k > 0 || n_params->output_q6k > 0; + case GGML_TYPE_Q8_0: + return n_params->layer_q80 > 0 || n_params->output_q80 > 0; + default: + throw std::runtime_error("Unrecognized data type\n"); + } +} + void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) { + struct model_flops * n_flops = &dev_info->model_flops; + struct model_params * n_params = &dev_info->model_params; + + if (dev_info->rank == 0) { + enum ggml_type inp_embd_dtype = GGML_TYPE_F32; + llama_model_n_flops(model, ml, n_flops, n_params, 1, 32, &inp_embd_dtype); + n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, 
n_threads); + } + dev_info->device_name = device_name(); dev_info->cpu_props.cores = device_cpu_cores(); - dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); - dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads); - dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads); - dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads); - dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads); - dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100; dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100; dev_info->memory.total_swap = round(device_swap_memory(false) / (double)(1 << 30) * 100) / 100; @@ -3591,24 +3612,42 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100; dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100; dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw(model); - dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32); - dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32); - dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); - dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32); - dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32); dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw(model); - dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32); - 
dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32); - if (dev_info->rank == 0) { - struct model_flops * n_flops = &dev_info->model_flops; - struct model_params * n_params = &dev_info->model_params; - enum ggml_type inp_embd_dtype = GGML_TYPE_F32; - llama_model_n_flops(model, ml, n_flops, n_params, 1, 32, &inp_embd_dtype); - n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads); + if (is_dtype_exist(n_params, GGML_TYPE_F32)) { + dev_info->cpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_F16)) { + dev_info->cpu_props.flops_f16_f32 = device_cpu_flops (model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) { + dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) { + dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, 
n_threads); + dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q5k_f32 = device_cuda_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_Q6_K)) { + dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); + } + + if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) { + dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads); + dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32); } } @@ -20699,6 +20738,9 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en case GGML_TYPE_Q4_K: n_flops->output_q4k_f32 += n; break; + case GGML_TYPE_Q5_K: + n_flops->output_q5k_f32 += n; + break; case GGML_TYPE_Q6_K: n_flops->output_q6k_f32 += n; break; @@ -20721,6 +20763,9 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en case GGML_TYPE_Q4_K: n_flops->layer_q4k_f32 += n; break; + case GGML_TYPE_Q5_K: + n_flops->layer_q5k_f32 += n; + break; case GGML_TYPE_Q6_K: n_flops->layer_q6k_f32 += n; break; @@ -20751,6 +20796,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case GGML_TYPE_Q4_K: n_params->input_q4k += n_i64t; break; + case GGML_TYPE_Q5_K: + n_params->input_q5k += n_i64t; + break; case GGML_TYPE_Q6_K: n_params->input_q6k += n_i64t; break; @@ -20773,6 +20821,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case GGML_TYPE_Q4_K: n_params->output_q4k += n_i64t; break; + case GGML_TYPE_Q5_K: + 
n_params->output_q5k += n_i64t; + break; case GGML_TYPE_Q6_K: n_params->output_q6k += n_i64t; break; @@ -20795,6 +20846,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype, case GGML_TYPE_Q4_K: n_params->layer_q4k += n_i64t; break; + case GGML_TYPE_Q5_K: + n_params->layer_q5k += n_i64t; + break; case GGML_TYPE_Q6_K: n_params->layer_q6k += n_i64t; break;