#ifndef PROFILER_H
#define PROFILER_H

#include "ggml.h"
#include "llama.h"

// Small single-precision constant (1e-9). Its uses are not visible in this header.
#define EPS                  1e-9f

// Sizes used by the disk and memory tests, in bytes.
// Every multi-token expansion is parenthesized so the macro behaves as a single
// value inside larger expressions: without the parentheses `x / MACRO` or
// `x % MACRO` would bind only to the first factor of the product.
#define DISK_TEST_TOTAL_BYTE (500L * 1024 * 1024) // 500 MiB
#define DISK_TEST_SEQ_BLOCK  (100L * 1024 * 1024) // 100 MiB
#define DISK_TEST_RND_BLOCK  4096                 // 4 KiB
#define MEM_TEST_BLOCK_SIZE  (64 * 1024)          // 64 KiB


// CPU identification plus per-type throughput figures for one device.
// The flops_<a>_<b> suffixes presumably name the two operand types
// (cf. the src0t/src1t arguments of device_cpu_flops) — confirm.
struct cpu_props {
    const char * name        = "";
    const char * description = "";
    uint32_t     cores       = 0;

    // All throughput values below are in GFLOPS.
    float flops_f32_f32 = 0.0f;
    float flops_f16_f32 = 0.0f;
    float flops_q4k_f32 = 0.0f;
    float flops_q50_f32 = 0.0f;
    float flops_q5k_f32 = 0.0f;
    float flops_q6k_f32 = 0.0f;
    float flops_q80_f32 = 0.0f;

    // Fields take the defaults given at their declarations: empty strings and zeros.
    cpu_props() {}
};

// Memory capacity and bandwidth figures for one device.
struct memory_info {
    // Capacities, in GiB.
    float total_physical     = 0.0f;
    float available_physical = 0.0f;
    float used_can_swap      = 0.0f;
    float total_swap         = 0.0f;
    float available_swap     = 0.0f;

    float cpu_read_ram_bw    = 0.0f; // in GB/s
    float mem_cpy_delay      = 0.0f; // in ms

    // Every field starts at zero via the initialisers above.
    memory_info() {}
};

// One flag per acceleration backend; all flags default to false.
struct gpu_support {
    bool metal   = false;
    bool cuda    = false;
    bool vulkan  = false;
    bool kompute = false;
    bool gpublas = false;
    bool blas    = false;
    bool sycl    = false;

    // Nothing to do here: the member initialisers clear every flag.
    gpu_support() {}
};

// GPU identification, memory sizes and per-backend (Metal / CUDA) figures
// for one device. All numeric fields default to zero, strings to "".
struct gpu_props {
    const char * name        = "";
    const char * description = "";

    float memory_free  = 0.0f; // in GiB
    float memory_total = 0.0f; // in GiB

    // Metal figures: bandwidth in GB/s, throughput in GFLOPS, copy delay in ms.
    float metal_read_vram_bw  = 0.0f;
    float metal_flops_f32_f32 = 0.0f;
    float metal_flops_f16_f32 = 0.0f;
    float metal_flops_q4k_f32 = 0.0f;
    float metal_flops_q50_f32 = 0.0f;
    float metal_flops_q5k_f32 = 0.0f;
    float metal_flops_q6k_f32 = 0.0f;
    float metal_flops_q80_f32 = 0.0f;
    float metal_mem_cpy_delay = 0.0f;

    // CUDA figures: bandwidth in GB/s, throughput in GFLOPS, copy delay in ms.
    float cuda_read_vram_bw  = 0.0f;
    float cuda_flops_f32_f32 = 0.0f;
    float cuda_flops_f16_f32 = 0.0f;
    float cuda_flops_q4k_f32 = 0.0f;
    float cuda_flops_q50_f32 = 0.0f;
    float cuda_flops_q5k_f32 = 0.0f;
    float cuda_flops_q6k_f32 = 0.0f;
    float cuda_flops_q80_f32 = 0.0f;
    float cuda_mem_cpy_delay = 0.0f;

    // Defaults come from the member initialisers above.
    gpu_props() {}
};

// Per-model counters, split into an "output" group and a "layer" group with
// one entry per <type>_f32 combination.
// NOTE(review): presumably these are FLOP counts and inp_embd_ms is a time in
// milliseconds — the header itself does not state the units; confirm.
struct model_flops {
    float   inp_embd_ms    = 0.0f;

    int64_t output_f32_f32 = 0;
    int64_t output_f16_f32 = 0;
    int64_t output_q4k_f32 = 0;
    int64_t output_q50_f32 = 0;
    int64_t output_q5k_f32 = 0;
    int64_t output_q6k_f32 = 0;
    int64_t output_q80_f32 = 0;

    int64_t layer_f32_f32  = 0;
    int64_t layer_f16_f32  = 0;
    int64_t layer_q4k_f32  = 0;
    int64_t layer_q50_f32  = 0;
    int64_t layer_q5k_f32  = 0;
    int64_t layer_q6k_f32  = 0;
    int64_t layer_q80_f32  = 0;

    // All counters start at zero through the initialisers above.
    model_flops() {}
};

// Per-model counters in three groups (input, output, layer), one entry per
// data type in each group.
// NOTE(review): presumably parameter counts per tensor type — confirm.
struct model_params {
    int64_t input_f32  = 0;
    int64_t input_f16  = 0;
    int64_t input_q4k  = 0;
    int64_t input_q50  = 0;
    int64_t input_q5k  = 0;
    int64_t input_q6k  = 0;
    int64_t input_q80  = 0;

    int64_t output_f32 = 0;
    int64_t output_f16 = 0;
    int64_t output_q4k = 0;
    int64_t output_q50 = 0;
    int64_t output_q5k = 0;
    int64_t output_q6k = 0;
    int64_t output_q80 = 0;

    int64_t layer_f32  = 0;
    int64_t layer_f16  = 0;
    int64_t layer_q4k  = 0;
    int64_t layer_q50  = 0;
    int64_t layer_q5k  = 0;
    int64_t layer_q6k  = 0;
    int64_t layer_q80  = 0;

    // All counters start at zero through the initialisers above.
    model_params() {}
};

// Three 64-bit counters for the input, layer and output parts of a model.
// NOTE(review): the nb_ prefix suggests sizes in bytes — confirm.
struct model_bytes {
    int64_t nb_input  = 0;
    int64_t nb_layer  = 0;
    int64_t nb_output = 0;

    // Counters start at zero through the initialisers above.
    model_bytes() {}
};

// Disk bandwidth figures, all in GB/s: sequential and random, read and write.
struct disk_props {
    float read_seq_bw  = 0.0f;
    float read_rnd_bw  = 0.0f;
    float write_seq_bw = 0.0f;
    float write_rnd_bw = 0.0f;

    // Every bandwidth starts at zero through the initialisers above.
    disk_props() {}
};

// Aggregated record for one device: its rank, two identifying strings, and
// the disk / CPU / memory / GPU / model sub-records.
struct device_info {
    uint32_t            rank        = 0;
    const char *        device_name = "";
    const char *        device_os   = "";

    // Each sub-record is set up by its own default constructor.
    // The `struct` keyword is required where a member shares its type's name.
    struct disk_props   disk;
    struct cpu_props    cpu_props;
    struct memory_info  memory;
    struct gpu_support  gpu_support;
    struct gpu_props    gpu_props;
    struct model_flops  model_flops;
    struct model_params model_params;
    struct model_bytes  model_bytes;

    // Scalars use the initialisers above; sub-records default-construct themselves.
    device_info() {}
};

// Selects one of three backends: CPU, Metal or CUDA.
// The numeric values are fixed explicitly (0, 1, 2).
// NOTE(review): no declaration in this header takes this enum; check the
// implementation file for where it is consumed.
enum profiler_backend_type {
    PROFILER_BACKEND_TYPE_CPU   = 0,
    PROFILER_BACKEND_TYPE_METAL = 1,
    PROFILER_BACKEND_TYPE_CUDA  = 2,
};

// Distinguishes three layer categories: input, output and "backend".
// The numeric values are fixed explicitly (0, 1, 2).
// NOTE(review): presumably BACKEND covers the repeated layers between input
// and output — confirm against the implementation.
enum profiler_layer_type {
    PROFILER_LAYER_INPUT   = 0,
    PROFILER_LAYER_OUTPUT  = 1,
    PROFILER_LAYER_BACKEND = 2,
};

// Both functions take no arguments and return a C string.
// Presumably they supply the device_name / device_os fields of device_info — confirm.
// NOTE(review): ownership and lifetime of the returned pointers are not
// visible here; check the definitions before freeing or caching them.
const char * device_name(void); 
const char * device_os(void);

// Device measurement entry points. Only the signatures are visible in this
// header; units of the return values are presumably those documented on the
// matching struct fields (GFLOPS, GB/s, ms) — confirm against the definitions.

// --- compute ---
uint32_t device_cpu_cores(void);
float device_cpu_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
float device_cuda_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);

// --- system memory ---
// NOTE(review): the uint64_t results are presumably byte counts — confirm.
uint64_t device_physical_memory(bool available);
uint64_t device_swap_memory(bool available);
uint64_t device_swappable_memory(void);

// --- disk: results are written through the two output pointers ---
void device_disk_seq_bw(float * read_seq_bw, float * write_seq_bw, int n_threads);
void device_disk_rnd_bw(float * read_rnd_bw, float * write_rnd_bw, int n_threads);

// --- memory bandwidth and copy delay ---
float device_memory_bw(int n_thread);
float device_cpu_mem_copy(struct llama_model * model, int n_threads);
float device_metal_mem_copy(struct llama_model * model);
float device_cuda_mem_copy(struct llama_model * model);
float device_metal_read_vram_bw(void);
float device_cuda_read_vram_bw(void);

// --- backend properties and reporting ---
void device_get_props(struct llama_model * model, int device, struct ggml_backend_dev_props * props);
void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);

// Backend capability queries: one function per backend, each taking no
// arguments and returning int.
// NOTE(review): presumably non-zero means "supported", mirroring the bool
// fields of gpu_support — confirm against the definitions.
int      device_has_metal  (void);
int      device_has_cuda   (void);
int      device_has_vulkan (void);
int      device_has_kompute(void);
int      device_has_gpublas(void);
int      device_has_blas   (void);
int      device_has_sycl   (void);

// Conversion between a device_info record and a flat char buffer.
//
// serialize:   reads *dev_info (const) and reports a buffer through the
//              `buffer` out-parameter, returning a size_t.
//              NOTE(review): presumably the return value is the buffer length
//              and the caller owns *buffer — confirm who frees it.
// deserialize: reads `buffer` and fills *dev_info.
//              NOTE(review): no buffer length is passed, so the callee cannot
//              bounds-check from the signature alone. Also confirm the lifetime
//              of the `const char *` strings it stores into *dev_info.
size_t   serialize  (const struct device_info * dev_info, char ** buffer);
void     deserialize(const char * buffer, struct device_info * dev_info);

#endif // PROFILER_H