prima.cpp/common/profiler.h
2024-11-28 20:21:21 +04:00

239 lines
6.8 KiB
C

#ifndef PROFILER_H
#define PROFILER_H
#include "ggml.h"
#include "llama.h"
// Size constants for the disk tests, in bytes.
// The replacement lists are parenthesized so the macros expand safely inside
// larger expressions (e.g. `x / DISK_TEST_TOTAL_BYTE`, `a % DISK_TEST_SEQ_BLOCK`).
// NOTE(review): how each constant is used is defined by the implementation, not by this header.
#define DISK_TEST_TOTAL_BYTE (500L * 1024 * 1024) // 500 MiB
#define DISK_TEST_SEQ_BLOCK  (100L * 1024 * 1024) // 100 MiB
#define DISK_TEST_RND_BLOCK  4096                 // 4 KiB
// CPU identification and throughput figures.
// A default-constructed object has empty strings, zero cores and zero throughput.
struct cpu_props {
    const char * name        = "";   // defaults to an empty string literal
    const char * description = "";   // defaults to an empty string literal
    uint32_t     cores       = 0;

    // NOTE(review): the suffix presumably names the two operand types of the
    // benchmarked operation (e.g. q4k weights x f32 activations) — confirm.
    float flops_f32_f32 = 0.0f; // in GFLOPS
    float flops_f16_f32 = 0.0f; // in GFLOPS
    float flops_q4k_f32 = 0.0f; // in GFLOPS
    float flops_q6k_f32 = 0.0f; // in GFLOPS
    float flops_q80_f32 = 0.0f; // in GFLOPS

    cpu_props() = default;
};
// Host memory capacity and read bandwidth.
// Every field is 0 in a default-constructed object.
struct memory_info {
    float total_physical     = 0.0f; // in GiB
    float available_physical = 0.0f; // in GiB
    float total_swap         = 0.0f; // in GiB
    float available_swap     = 0.0f; // in GiB
    float read_bandwidth     = 0.0f; // in GB/s

    memory_info() = default;
};
// One boolean per backend; all are false in a default-constructed object.
// NOTE(review): presumably populated from the device_has_*() queries — confirm in the implementation.
struct gpu_support {
    bool metal   = false;
    bool cuda    = false;
    bool vulkan  = false;
    bool kompute = false;
    bool gpublas = false;
    bool blas    = false;
    bool sycl    = false;

    gpu_support() = default;
};
// GPU identification, memory figures and per-backend throughput.
// A default-constructed object has empty strings and zeros everywhere.
struct gpu_props {
    const char * name        = "";   // defaults to an empty string literal
    const char * description = "";   // defaults to an empty string literal

    float memory_free    = 0.0f; // in GiB
    float memory_total   = 0.0f; // in GiB
    float read_bandwidth = 0.0f; // in GB/s

    // Metal throughput.
    // NOTE(review): the suffix presumably names the two operand types — confirm.
    float metal_flops_f32_f32 = 0.0f; // in GFLOPS
    float metal_flops_f16_f32 = 0.0f; // in GFLOPS
    float metal_flops_q4k_f32 = 0.0f; // in GFLOPS
    float metal_flops_q6k_f32 = 0.0f; // in GFLOPS
    float metal_flops_q80_f32 = 0.0f; // in GFLOPS

    // CUDA throughput, same layout as the Metal fields.
    float cuda_flops_f32_f32 = 0.0f; // in GFLOPS
    float cuda_flops_f16_f32 = 0.0f; // in GFLOPS
    float cuda_flops_q4k_f32 = 0.0f; // in GFLOPS
    float cuda_flops_q6k_f32 = 0.0f; // in GFLOPS
    float cuda_flops_q80_f32 = 0.0f; // in GFLOPS

    gpu_props() = default;
};
// Per-model counters grouped by operand-type pair, plus an input-embedding figure.
// Every field is 0 in a default-constructed object.
// NOTE(review): the int64 fields are presumably FLOP counts for the output stage
// and for a single layer respectively — confirm against the code that fills them.
struct model_flops {
    float inp_embd_ms = 0.0f; // NOTE(review): presumably milliseconds, per the _ms suffix — confirm

    int64_t output_f32_f32 = 0;
    int64_t output_f16_f32 = 0;
    int64_t output_q4k_f32 = 0;
    int64_t output_q6k_f32 = 0;
    int64_t output_q80_f32 = 0;

    int64_t layer_f32_f32 = 0;
    int64_t layer_f16_f32 = 0;
    int64_t layer_q4k_f32 = 0;
    int64_t layer_q6k_f32 = 0;
    int64_t layer_q80_f32 = 0;

    model_flops() = default;
};
// Per-model counters split by stage (input / output / layer) and by data type.
// Every field is 0 in a default-constructed object.
// NOTE(review): presumably parameter counts per tensor type — confirm against the code that fills them.
struct model_params {
    // input stage
    int64_t input_f32 = 0;
    int64_t input_f16 = 0;
    int64_t input_q4k = 0;
    int64_t input_q6k = 0;
    int64_t input_q80 = 0;

    // output stage
    int64_t output_f32 = 0;
    int64_t output_f16 = 0;
    int64_t output_q4k = 0;
    int64_t output_q6k = 0;
    int64_t output_q80 = 0;

    // layer
    int64_t layer_f32 = 0;
    int64_t layer_f16 = 0;
    int64_t layer_q4k = 0;
    int64_t layer_q6k = 0;
    int64_t layer_q80 = 0;

    model_params() = default;
};
// Disk bandwidth figures for sequential and random reads/writes.
// Every field is 0 in a default-constructed object.
struct disk_props {
    float read_seq_bw  = 0.0f; // in GB/s
    float read_rnd_bw  = 0.0f; // in GB/s
    float write_seq_bw = 0.0f; // in GB/s
    float write_rnd_bw = 0.0f; // in GB/s

    disk_props() = default;
};
// Aggregate profile of one device: identity plus disk, CPU, memory, GPU and model figures.
// A default-constructed object has rank 0, an empty device name, and
// default-constructed sub-structures.
struct device_info {
    uint32_t     rank        = 0;    // NOTE(review): presumably the device's index among the participating devices — confirm
    const char * device_name = "";   // defaults to an empty string literal

    // The `struct` keyword is required below wherever the member name is the
    // same as its type name; it is kept on `disk` and `memory` for uniformity.
    struct disk_props   disk;
    struct cpu_props    cpu_props;
    struct memory_info  memory;
    struct gpu_support  gpu_support;
    struct gpu_props    gpu_props;
    struct model_flops  model_flops;
    struct model_params model_params;

    device_info() = default;
};
// Backend selector used by the profiler. The numeric values are explicit and
// must stay as they are.
enum profiler_backend_type {
    PROFILER_BACKEND_TYPE_CPU   = 0, // CPU
    PROFILER_BACKEND_TYPE_METAL = 1, // Metal
    PROFILER_BACKEND_TYPE_CUDA  = 2, // CUDA
};
// Layer category used by the profiler. The numeric values are explicit and
// must stay as they are.
// NOTE(review): PROFILER_LAYER_BACKEND presumably denotes the layers that run
// on the selected backend, as opposed to the input/output stages — confirm.
enum profiler_layer_type {
    PROFILER_LAYER_INPUT   = 0,
    PROFILER_LAYER_OUTPUT  = 1,
    PROFILER_LAYER_BACKEND = 2,
};
// ---- Profiler entry points (declarations only; behavior lives in the implementation) ----

// Device name and CPU core count.
const char * device_name(void);
uint32_t device_cpu_cores (void);

// Compute throughput for an operation with operand types src0t and src1t.
// NOTE(review): presumably returned in GFLOPS, matching the *_flops_* struct fields — confirm.
float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);

// NOTE(review): presumably the input-embedding latency in milliseconds
// (cf. model_flops::inp_embd_ms) for n_tokens tokens — confirm.
float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);

// Memory sizes.
// NOTE(review): `available` presumably selects available (true) vs. total (false),
// and the result is presumably in bytes — confirm.
uint64_t device_physical_memory (bool available);
uint64_t device_swap_memory (bool available);

// Disk bandwidth; results are delivered through the two output pointers.
// NOTE(review): units presumably GB/s, matching disk_props — confirm.
void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw, int n_threads);
void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw, int n_threads);

// Memory bandwidth.
// NOTE(review): units presumably GB/s, matching the read_bandwidth fields — confirm.
float device_memory_bw (int n_thread);
float device_cuda_memory_bw (struct llama_model * model);

// Fills *props for the given device index of the model.
void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);

// Prints a report for the n entries of dev_info_set.
void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);

// Backend availability queries.
// NOTE(review): presumably non-zero when the backend is available — confirm.
int device_has_metal (void);
int device_has_cuda (void);
int device_has_vulkan (void);
int device_has_kompute(void);
int device_has_gpublas(void);
int device_has_blas (void);
int device_has_sycl (void);

// Conversion of a device_info to and from a byte buffer.
// NOTE(review): serialize presumably allocates *buffer and returns its size in bytes;
// who frees the buffer is not visible from this header — confirm before use.
size_t serialize (const struct device_info * dev_info, char ** buffer);
void deserialize(const char * buffer, struct device_info * dev_info);
#endif // PROFILER_H