// prima.cpp/common/profiler.h

#ifndef PROFILER_H
#define PROFILER_H

#include "ggml.h"
#include "llama.h"

// Small float constant (1e-9f).
// NOTE(review): presumably a divide-by-zero / comparison guard — confirm at use sites.
#define EPS                  1e-9f

// Sizes for the disk and memory tests. Each multi-token replacement list is
// wrapped in parentheses so the macro behaves as a single value when it is
// used inside a larger expression (e.g. TOTAL / BLOCK, x % BLOCK).
// NOTE(review): values are presumably byte counts — confirm at use sites.
#define DISK_TEST_TOTAL_BYTE (500L * 1024 * 1024)
#define DISK_TEST_SEQ_BLOCK  (100L * 1024 * 1024)
#define DISK_TEST_RND_BLOCK  4096
#define MEM_TEST_BLOCK_SIZE  (64 * 1024)


// CPU description plus throughput figures (GFLOPS) for one device.
// Every member carries an in-class default, so a default-constructed object
// has empty strings, zero cores and 0.0f for every figure.
// NOTE(review): the flops_<a>_<b> suffixes presumably name the two operand
// ggml types (cf. src0t/src1t of device_cpu_flops) — confirm.
struct cpu_props {
    const char * name             = "";
    const char * description      = "";
    uint32_t     cores            = 0;
    float        flops_f32_f32    = 0.0f; // in GFLOPS
    float        flops_f16_f32    = 0.0f; // in GFLOPS
    float        flops_q2k_f32    = 0.0f; // in GFLOPS
    float        flops_q4k_f32    = 0.0f; // in GFLOPS
    float        flops_q5k_f32    = 0.0f; // in GFLOPS
    float        flops_q6k_f32    = 0.0f; // in GFLOPS
    float        flops_iq2xxs_f32 = 0.0f; // in GFLOPS
    float        flops_q50_f32    = 0.0f; // in GFLOPS
    float        flops_q80_f32    = 0.0f; // in GFLOPS
    float        flops_iq1s_f32   = 0.0f; // in GFLOPS
    float        flops_iq4nl_f32  = 0.0f; // in GFLOPS
    float        flops_iq1m_f32   = 0.0f; // in GFLOPS

    // User-provided on purpose: keeps the type a non-aggregate; the defaults
    // above do the actual initialization.
    cpu_props() {}
};

// Memory capacity, bandwidth and copy-delay figures for one device.
// A default-constructed object has every figure set to 0.0f.
struct memory_info {
    float total_physical     = 0.0f; // in GiB
    float available_physical = 0.0f; // in GiB
    float used_can_swap      = 0.0f; // in GiB
    float total_swap         = 0.0f; // in GiB
    float available_swap     = 0.0f; // in GiB
    float cpu_read_ram_bw    = 0.0f; // in GB/s
    float mem_cpy_delay      = 0.0f; // in ms

    // User-provided (empty) so the type stays a non-aggregate.
    memory_info() {}
};

// One flag per GPU/BLAS backend; all flags are false after default
// construction.
// NOTE(review): presumably filled from the matching device_has_*() queries —
// confirm in the implementation.
struct gpu_support {
    bool metal   = false;
    bool cuda    = false;
    bool vulkan  = false;
    bool kompute = false;
    bool gpublas = false;
    bool blas    = false;
    bool sycl    = false;

    // User-provided (empty) so the type stays a non-aggregate.
    gpu_support() {}
};

// GPU description, memory figures and per-backend (Metal / CUDA) bandwidth,
// throughput and copy-delay figures for one device.
// A default-constructed object has empty strings and 0.0f everywhere.
struct gpu_props {
    const char * name        = "";
    const char * description = "";

    float memory_free  = 0.0f; // in GiB
    float memory_total = 0.0f; // in GiB

    // Metal figures
    float metal_read_vram_bw     = 0.0f; // in GB/s
    float metal_flops_f32_f32    = 0.0f; // in GFLOPS
    float metal_flops_f16_f32    = 0.0f; // in GFLOPS
    float metal_flops_q2k_f32    = 0.0f; // in GFLOPS
    float metal_flops_q4k_f32    = 0.0f; // in GFLOPS
    float metal_flops_q5k_f32    = 0.0f; // in GFLOPS
    float metal_flops_q6k_f32    = 0.0f; // in GFLOPS
    float metal_flops_iq2xxs_f32 = 0.0f; // in GFLOPS
    float metal_flops_q50_f32    = 0.0f; // in GFLOPS
    float metal_flops_q80_f32    = 0.0f; // in GFLOPS
    float metal_flops_iq1s_f32   = 0.0f; // in GFLOPS
    float metal_flops_iq4nl_f32  = 0.0f; // in GFLOPS
    float metal_flops_iq1m_f32   = 0.0f; // in GFLOPS
    float metal_mem_cpy_delay    = 0.0f; // in ms

    // CUDA figures
    float cuda_read_vram_bw      = 0.0f; // in GB/s
    float cuda_flops_f32_f32     = 0.0f; // in GFLOPS
    float cuda_flops_f16_f32     = 0.0f; // in GFLOPS
    float cuda_flops_q2k_f32     = 0.0f; // in GFLOPS
    float cuda_flops_q4k_f32     = 0.0f; // in GFLOPS
    float cuda_flops_q5k_f32     = 0.0f; // in GFLOPS
    float cuda_flops_q6k_f32     = 0.0f; // in GFLOPS
    float cuda_flops_iq2xxs_f32  = 0.0f; // in GFLOPS
    float cuda_flops_q50_f32     = 0.0f; // in GFLOPS
    float cuda_flops_q80_f32     = 0.0f; // in GFLOPS
    float cuda_flops_iq1s_f32    = 0.0f; // in GFLOPS
    float cuda_flops_iq4nl_f32   = 0.0f; // in GFLOPS
    float cuda_flops_iq1m_f32    = 0.0f; // in GFLOPS
    float cuda_mem_cpy_delay     = 0.0f; // in ms

    // User-provided (empty) so the type stays a non-aggregate.
    gpu_props() {}
};

// Per-model counters, one "output_*" and one "layer_*" entry per operand-type
// pair, plus inp_embd_ms. Everything is zero after default construction.
// NOTE(review): the integer members are presumably FLOP counts and
// inp_embd_ms presumably a delay in milliseconds (cf. device_inp_embd_delay)
// — confirm in the implementation.
struct model_flops {
    float   inp_embd_ms       = 0.0f;

    int64_t output_f32_f32    = 0;
    int64_t output_f16_f32    = 0;
    int64_t output_q2k_f32    = 0;
    int64_t output_q4k_f32    = 0;
    int64_t output_q5k_f32    = 0;
    int64_t output_q6k_f32    = 0;
    int64_t output_iq2xxs_f32 = 0;
    int64_t output_q50_f32    = 0;
    int64_t output_q80_f32    = 0;
    int64_t output_iq1s_f32   = 0;
    int64_t output_iq4nl_f32  = 0;
    int64_t output_iq1m_f32   = 0;

    int64_t layer_f32_f32     = 0;
    int64_t layer_f16_f32     = 0;
    int64_t layer_q2k_f32     = 0;
    int64_t layer_q4k_f32     = 0;
    int64_t layer_q5k_f32     = 0;
    int64_t layer_q6k_f32     = 0;
    int64_t layer_iq2xxs_f32  = 0;
    int64_t layer_q50_f32     = 0;
    int64_t layer_q80_f32     = 0;
    int64_t layer_iq1s_f32    = 0;
    int64_t layer_iq4nl_f32   = 0;
    int64_t layer_iq1m_f32    = 0;

    // User-provided (empty) so the type stays a non-aggregate.
    model_flops() {}
};

// Per-model integer counters grouped as input_*, output_* and layer_*, with
// one entry per data type in each group. All are zero after default
// construction.
// NOTE(review): presumably parameter (element) counts per tensor data type —
// confirm in the implementation.
struct model_params {
    int64_t input_f32     = 0;
    int64_t input_f16     = 0;
    int64_t input_q2k     = 0;
    int64_t input_q4k     = 0;
    int64_t input_q5k     = 0;
    int64_t input_q6k     = 0;
    int64_t input_iq2xxs  = 0;
    int64_t input_q50     = 0;
    int64_t input_q80     = 0;
    int64_t input_iq1s    = 0;
    int64_t input_iq4nl   = 0;
    int64_t input_iq1m    = 0;

    int64_t output_f32    = 0;
    int64_t output_f16    = 0;
    int64_t output_q2k    = 0;
    int64_t output_q4k    = 0;
    int64_t output_q5k    = 0;
    int64_t output_q6k    = 0;
    int64_t output_iq2xxs = 0;
    int64_t output_q50    = 0;
    int64_t output_q80    = 0;
    int64_t output_iq1s   = 0;
    int64_t output_iq4nl  = 0;
    int64_t output_iq1m   = 0;

    int64_t layer_f32     = 0;
    int64_t layer_f16     = 0;
    int64_t layer_q2k     = 0;
    int64_t layer_q4k     = 0;
    int64_t layer_q5k     = 0;
    int64_t layer_q6k     = 0;
    int64_t layer_iq2xxs  = 0;
    int64_t layer_q50     = 0;
    int64_t layer_q80     = 0;
    int64_t layer_iq1s    = 0;
    int64_t layer_iq4nl   = 0;
    int64_t layer_iq1m    = 0;

    // User-provided (empty) so the type stays a non-aggregate.
    model_params() {}
};

// Byte-size counters for a model; all zero after default construction.
// NOTE(review): the nb_ prefix presumably means "number of bytes" — confirm.
struct model_bytes {
    int64_t nb_input         = 0;
    int64_t nb_layer         = 0;
    int64_t nb_output        = 0;

    // used to estimate the compute buffer size
    int64_t nb_output_w      = 0;
    int64_t nb_output_norm_w = 0;
    int64_t nb_attn_norm_w   = 0;
    int64_t nb_attn_q_w      = 0;
    int64_t nb_ffn_gate_w    = 0;
    int64_t nb_ffn_down_w    = 0;

    // User-provided (empty) so the type stays a non-aggregate.
    model_bytes() {}
};

// Disk bandwidth figures (sequential / random, read / write) for one device.
// All figures are 0.0f after default construction.
struct disk_props {
    float read_seq_bw  = 0.0f; // in GB/s
    float read_rnd_bw  = 0.0f; // in GB/s
    float write_seq_bw = 0.0f; // in GB/s
    float write_rnd_bw = 0.0f; // in GB/s

    // User-provided (empty) so the type stays a non-aggregate.
    disk_props() {}
};

// Startup arguments bundle.
// Plain aggregate: unlike the other structs in this header it declares no
// constructor, so its members hold indeterminate values until the caller
// assigns them.
struct startup_args {
    // NOTE(review): presumably "run the device profiler at startup" — confirm with callers.
    bool     should_profile;
    // NOTE(review): presumably the context length — confirm with callers.
    uint32_t n_ctx;
};

// Aggregated record for one device: identity fields plus the disk, CPU,
// memory, GPU and model sub-records declared earlier in this header.
// After default construction rank is 0, the three strings are empty, and each
// sub-record is in its own default-constructed state.
struct device_info {
    uint32_t            rank        = 0;
    const char *        device_name = "";
    const char *        device_os   = "";
    // NOTE(review): presumably the address of the next device in the
    // topology — confirm with the networking code.
    const char *        next_ip     = "";

    // The sub-records below are initialized by their own default constructors.
    struct disk_props   disk;
    struct cpu_props    cpu_props;
    struct memory_info  memory;
    struct gpu_support  gpu_support;
    struct gpu_props    gpu_props;
    struct model_flops  model_flops;
    struct model_params model_params;
    struct model_bytes  model_bytes;

    // User-provided (empty) so the type stays a non-aggregate.
    device_info() {}
};

// A device_info record paired with an is_forwarder flag, with member
// functions to convert to and from a raw char buffer.
// After default construction dev_info is default-constructed and
// is_forwarder is 0.
struct TopoRebuildHelperInfo {
    struct device_info dev_info;
    char               is_forwarder = 0;

    // User-provided (empty) so the type stays a non-aggregate.
    TopoRebuildHelperInfo() {}

    // Defined out of line (not in this header).
    // NOTE(review): presumably fills this object from `buffer` — confirm the
    // expected layout against serialize().
    void   deserialize(const char * buffer);

    // Defined out of line (not in this header).
    // NOTE(review): presumably allocates *buffer and returns its size in
    // bytes; ownership of *buffer is not visible here — confirm.
    size_t serialize(char ** buffer) const;
};

// Backend selector used by the profiler. Numeric values are spelled out
// explicitly; keep them stable if they are stored or exchanged anywhere.
enum profiler_backend_type {
    PROFILER_BACKEND_TYPE_CPU = 0,
    PROFILER_BACKEND_TYPE_METAL = 1,
    PROFILER_BACKEND_TYPE_CUDA = 2
};

// Layer category used by the profiler (input, output, or backend).
// Numeric values are spelled out explicitly.
enum profiler_layer_type {
    PROFILER_LAYER_INPUT = 0,
    PROFILER_LAYER_OUTPUT = 1,
    PROFILER_LAYER_BACKEND = 2
};

// Device identity queries; each takes no arguments and returns a C string.
// NOTE(review): lifetime/ownership of the returned strings is not visible in
// this header — confirm before storing or freeing them.
const char * device_name(void);
const char * device_os(void);

// ---- Device measurement API (implemented outside this header) ----
// NOTE(review): units are not stated on these prototypes; the struct fields
// in this header that presumably receive the results are annotated
// GFLOPS / GB/s / ms / GiB — confirm against the implementation.

// CPU core count.
uint32_t device_cpu_cores(void);

// Throughput queries per backend for the operand types src0t / src1t.
float device_cpu_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
float device_cuda_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);

// Input-embedding delay for n_tokens tokens.
float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);

// Memory size queries.
// NOTE(review): `available` presumably selects available vs. total — confirm.
uint64_t device_physical_memory(bool available);
uint64_t device_swap_memory(bool available);
uint64_t device_swappable_memory(void);

// Disk bandwidth queries; results are written through the two out-pointers.
void device_disk_seq_bw(float * read_seq_bw, float * write_seq_bw, int n_threads);
void device_disk_rnd_bw(float * read_rnd_bw, float * write_rnd_bw, int n_threads);

// Memory bandwidth and memory-copy queries.
float device_memory_bw(int n_thread);
float device_cpu_mem_copy(struct llama_model * model, int n_threads);
float device_metal_mem_copy(struct llama_model * model);
float device_cuda_mem_copy(struct llama_model * model);
float device_metal_read_vram_bw(void);
float device_cuda_read_vram_bw(void);

// Backend device properties: fetch into *props, and print a set of n device_info records.
void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props);
void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);

// Backend availability queries, one per backend flag in gpu_support.
// NOTE(review): the int result is presumably nonzero when the backend is
// available — confirm in the implementation.
int device_has_metal(void);
int device_has_cuda(void);
int device_has_vulkan(void);
int device_has_kompute(void);
int device_has_gpublas(void);
int device_has_blas(void);
int device_has_sycl(void);

// Conversion of a device_info record to and from a raw char buffer.
// NOTE(review): serialize presumably allocates *buffer and returns its size
// in bytes, and deserialize presumably returns the number of bytes consumed;
// buffer ownership is not visible in this header — confirm.
size_t serialize(const struct device_info * dev_info, char ** buffer);
size_t deserialize(const char * buffer, struct device_info * dev_info);

#endif // PROFILER_H