add f32, f16, q4k_f32, q6k_f32 flops test and fix duplicate inp_embd in subgraphs

Repository: https://github.com/Lizonghang/prima.cpp.git (mirror, synced 2025-09-10 18:54:40 +00:00)
Commit: a5ba34169a
Parent: 7ee1423006
3 changed files with 184 additions and 70 deletions
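At the API level, the commit widens every FLOPS probe from a single dtype argument to a (src0t, src1t) operand pair, so quantized-weight x F32-activation kernels can be measured next to plain F32 and F16. A hedged usage sketch of the new call convention follows; it assumes a loaded `model` and the header declarations changed further down, and is an illustration rather than code from the commit:

    // Sketch only: the type pairs mirror the probes added in llama_profile_device().
    float cpu_f32_f32 = device_cpu_flops  (model, GGML_TYPE_F32,  GGML_TYPE_F32, n_threads);
    float cpu_f16_f16 = device_cpu_flops  (model, GGML_TYPE_F16,  GGML_TYPE_F16, n_threads);
    float cpu_q4k_f32 = device_cpu_flops  (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
    float cpu_q6k_f32 = device_cpu_flops  (model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
    float gpu_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); // or device_cuda_flops(...)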
@@ -82,7 +82,7 @@ uint32_t device_cpu_cores() {
     return core_count;
 }
 
-static float device_flops(struct llama_model * model, enum ggml_type dtype, profiler_backend_type btype, int n_threads) {
+static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, profiler_backend_type btype, int n_threads) {
     const int n_embd = llama_n_embd(model);
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
     std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);
@@ -119,8 +119,8 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof
     };
     struct ggml_context * ctx = ggml_init(params);
 
-    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd);
-    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd);
+    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd);
+    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd);
 
     ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
 
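The hunk above is where the generalization takes effect: tensor_a now gets the src0 (weight) type and tensor_b the src1 (activation) type. For orientation, here is a minimal self-contained sketch of this kind of mixed-type mat-mul benchmark, written against the public ggml backend API; it illustrates the approach, not the repository's exact device_flops() implementation (which also covers the Metal and CUDA backends via btype):

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    // Time one n x n mat-mul with src0 of type src0t (e.g. GGML_TYPE_Q4_K) and src1 of
    // type src1t (e.g. GGML_TYPE_F32); report GFLOPS assuming the usual 2*n^3 count.
    static float bench_matmul_gflops(enum ggml_type src0t, enum ggml_type src1t, int n, int n_threads) {
        ggml_time_init();

        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // tensor data lives in the backend buffer below
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, src0t, n, n); // "weights"
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, src1t, n, n); // "activations"
        struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

        ggml_backend_t backend = ggml_backend_cpu_init();
        ggml_backend_cpu_set_n_threads(backend, n_threads);
        // For a throughput measurement the tensor contents do not matter,
        // so the freshly allocated buffer is left uninitialized.
        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);

        const int64_t t0 = ggml_time_us();
        ggml_backend_graph_compute(backend, gf);
        const int64_t t1 = ggml_time_us();

        const double seconds = (t1 - t0) / 1e6;
        const double flops   = 2.0 * n * n * n;

        ggml_backend_buffer_free(buffer);
        ggml_backend_free(backend);
        ggml_free(ctx);

        return (float)(flops / seconds / 1e9); // GFLOPS
    }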
@@ -168,27 +168,29 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof
     return (float)flops;
 }
 
-float device_cpu_flops(struct llama_model * model, enum ggml_type dtype, int n_threads) {
-    return device_flops(model, dtype, PROFILER_BACKEND_TYPE_CPU, n_threads);
+float device_cpu_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads) {
+    return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CPU, n_threads);
 }
 
-float device_metal_flops(struct llama_model * model, enum ggml_type dtype) {
+float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
 #ifdef GGML_USE_METAL
-    return device_flops(model, dtype, PROFILER_BACKEND_TYPE_METAL, 4);
+    return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4);
 #endif
 
     (void)model;
-    (void)dtype;
+    (void)src0t;
+    (void)src1t;
     return 0.0f;
 }
 
-float device_cuda_flops(struct llama_model * model, enum ggml_type dtype) {
+float device_cuda_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
 #ifdef GGML_USE_CUDA
-    return device_flops(model, dtype, PROFILER_BACKEND_TYPE_CUDA, 4);
+    return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4);
 #endif
 
     (void)model;
-    (void)dtype;
+    (void)src0t;
+    (void)src1t;
     return 0.0f;
 }
 
@@ -463,18 +465,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (F32, GFLOPS) ");
+    LOG_INF("| CPU flops (F32 x F32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (F16, GFLOPS) ");
+    LOG_INF("| CPU flops (F16 x F16, GFLOPS)");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16);
     }
     LOG_INF("\n");
 
+    LOG_INF("| CPU flops (Q4K x F32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (Q6K x F32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Physical Mem Total (GB) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
@@ -577,33 +591,51 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (F32, GFLOPS) ");
+    LOG_INF("| Metal flops (F32xF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (F32, GFLOPS) ");
+    LOG_INF("| Metal flops (F16xF16, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (Q4KxF32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (Q6KxF32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (F32xF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (F16, GFLOPS) ");
+    LOG_INF("| CUDA flops (F16xF16, GFLOPS)");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q8_0, GFLOPS) ");
+    LOG_INF("| CUDA flops (Q4KxF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q8);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q4_K, GFLOPS) ");
+    LOG_INF("| CUDA flops (Q6KxF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32);
     }
     LOG_INF("\n");
 
@@ -660,10 +692,11 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
                       + gpu_description_len
                       + sizeof(float)       // disk_read_bandwidth
                       + sizeof(uint32_t)    // cpu_props.cores
-                      + sizeof(float) * 2   // cpu_props.flops_f32 and cpu_props.flops_f16
+                      + sizeof(float) * 4   // cpu_props.flops_f32, cpu_props.flops_f16, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32
                       + sizeof(struct memory_info)
                       + sizeof(struct gpu_support)
-                      + sizeof(float) * 7;  // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_flops,
+                      + sizeof(float) * 10; // gpu_props.memory_free, gpu_props.memory_total,
+                                            // gpu_props.metal_flops_f32, gpu_props.metal_flops_f16, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32,
                                             // gpu_props.cuda_flops_f32, gpu_props.cuda_flops_f16, gpu_props.cuda_flops_q8, and gpu_props.cuda_flops_q4k
 
     *buffer = (char *)malloc(total_size);
@@ -712,6 +745,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_f16, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
 
@@ -724,7 +763,16 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.metal_flops, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_f16, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32, sizeof(float));
@@ -733,10 +781,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q8, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));
 
     // no need to synchronize model flops
     return total_size;
@@ -802,6 +850,12 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_f16, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
 
@@ -814,7 +868,16 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.metal_flops, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_f16, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->gpu_props.cuda_flops_f32, ptr, sizeof(float));
@@ -823,10 +886,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.cuda_flops_f16, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.cuda_flops_q8, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.cuda_flops_q4k, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));
 
     // no need to synchronize model flops
 }
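Two bookkeeping notes on the serializer hunks above: the float counts in total_size now line up with the struct contents (4 floats for cpu_props, 10 for gpu_props: memory_free, memory_total, four Metal figures, four CUDA figures), and serialize()/deserialize() must keep appending and consuming those floats in exactly the same order. A pair of tiny cursor helpers, hypothetical and not part of this commit, is one way to keep the two sides symmetric:

    #include <string.h>

    // Hypothetical helpers: write or read one float and advance the cursor,
    // so the field order only has to be spelled out once on each side.
    static inline void put_f32(char ** p, float v) {
        memcpy(*p, &v, sizeof(float));
        *p += sizeof(float);
    }

    static inline float get_f32(const char ** p) {
        float v;
        memcpy(&v, *p, sizeof(float));
        *p += sizeof(float);
        return v;
    }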
@@ -10,9 +10,17 @@ struct cpu_props {
     uint32_t cores;
     float flops_f32; // in GFLOPS
     float flops_f16; // in GFLOPS
+    float flops_q4k_f32; // in GFLOPS
+    float flops_q6k_f32; // in GFLOPS
 
-    cpu_props()
-        : name(""), description(""), cores(0), flops_f32(0.0f), flops_f16(0.0f) {}
+    cpu_props() :
+        name(""),
+        description(""),
+        cores(0),
+        flops_f32    (0.0f),
+        flops_f16    (0.0f),
+        flops_q4k_f32(0.0f),
+        flops_q6k_f32(0.0f) {}
 };
 
 struct memory_info {
@@ -22,8 +30,12 @@ struct memory_info {
     float available_swap; // in GB
     float bandwidth; // in GB/s
 
-    memory_info()
-        : total_physical(0.0f), available_physical(0.0f), total_swap(0.0f), available_swap(0.0f), bandwidth(0.0f) {}
+    memory_info() :
+        total_physical    (0.0f),
+        available_physical(0.0f),
+        total_swap        (0.0f),
+        available_swap    (0.0f),
+        bandwidth         (0.0f) {}
 };
 
 struct gpu_support {
@@ -35,8 +47,14 @@ struct gpu_support {
     bool blas;
     bool sycl;
 
-    gpu_support()
-        : metal(false), cuda(false), vulkan(false), kompute(false), gpublas(false), blas(false), sycl(false) {}
+    gpu_support() :
+        metal  (false),
+        cuda   (false),
+        vulkan (false),
+        kompute(false),
+        gpublas(false),
+        blas   (false),
+        sycl   (false) {}
 };
 
 struct gpu_props {
@@ -44,14 +62,28 @@ struct gpu_props {
     const char * description;
     float memory_free; // in GB
     float memory_total; // in GB
-    float metal_flops; // in GFLOPS
+    float metal_flops_f32; // in GFLOPS
+    float metal_flops_f16; // in GFLOPS
+    float metal_flops_q4k_f32; // in GFLOPS
+    float metal_flops_q6k_f32; // in GFLOPS
     float cuda_flops_f32; // in GFLOPS
     float cuda_flops_f16; // in GFLOPS
-    float cuda_flops_q8; // in GFLOPS
-    float cuda_flops_q4k; // in GFLOPS
+    float cuda_flops_q4k_f32; // in GFLOPS
+    float cuda_flops_q6k_f32; // in GFLOPS
 
-    gpu_props()
-        : name(""), description(""), memory_free(0.0f), memory_total(0.0f), metal_flops(0.0f), cuda_flops_f32(0.0f), cuda_flops_f16(0.0f), cuda_flops_q8(0.0f), cuda_flops_q4k(0.0f) {}
+    gpu_props() :
+        name(""),
+        description(""),
+        memory_free        (0.0f),
+        memory_total       (0.0f),
+        metal_flops_f32    (0.0f),
+        metal_flops_f16    (0.0f),
+        metal_flops_q4k_f32(0.0f),
+        metal_flops_q6k_f32(0.0f),
+        cuda_flops_f32     (0.0f),
+        cuda_flops_f16     (0.0f),
+        cuda_flops_q4k_f32 (0.0f),
+        cuda_flops_q6k_f32 (0.0f) {}
 };
 
 struct model_flops {
@@ -65,8 +97,13 @@ struct model_flops {
     int64_t output_params;
     int64_t layer_params;
 
-    model_flops()
-        : input_flops(0), output_flops(0), layer_flops(0), input_params(0), output_params(0), layer_params(0) {}
+    model_flops() :
+        input_flops  (0),
+        output_flops (0),
+        layer_flops  (0),
+        input_params (0),
+        output_params(0),
+        layer_params (0) {}
 };
 
 struct device_info {
@@ -79,8 +116,15 @@ struct device_info {
     struct gpu_props gpu_props;
     struct model_flops model_flops;
 
-    device_info()
-        : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props(), model_flops() {}
+    device_info() :
+        rank(0),
+        device_name(""),
+        disk_read_bandwidth(0.0f),
+        cpu_props(),
+        memory(),
+        gpu_support(),
+        gpu_props(),
+        model_flops() {}
 };
 
 enum profiler_backend_type {
@@ -92,9 +136,9 @@ enum profiler_backend_type {
 const char * device_name(void);
 
 uint32_t device_cpu_cores      (void);
-float    device_cpu_flops      (struct llama_model * model, enum ggml_type dtype, int n_threads);
-float    device_metal_flops    (struct llama_model * model, enum ggml_type dtype);
-float    device_cuda_flops     (struct llama_model * model, enum ggml_type dtype);
+float    device_cpu_flops      (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
+float    device_metal_flops    (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
+float    device_cuda_flops     (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
 uint64_t device_physical_memory(bool available);
 uint64_t device_swap_memory    (bool available);
 uint64_t device_disk_read_bw   (const char * test_file, size_t buffer_size_mb);
@@ -3549,8 +3549,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
 void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
-    dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_f16 = device_cpu_flops(model, GGML_TYPE_F16, n_threads);
+    dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
+    dev_info->cpu_props.flops_f16 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F16, n_threads);
+    dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
+    dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
 
     dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100;
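The two quantized probes added here mirror how quantized checkpoints actually execute: in the ggml mat-muls these models run, the weight tensor (src0) carries the quantized type while the activations (src1) stay F32, so Q4_K x F32 and Q6_K x F32 are the operand pairs a K-quant model spends most of its time in. In ggml terms the shape of that hot path looks roughly like this (sketch only, names hypothetical):

    // src0 = quantized weight matrix, src1 = F32 activations; this is the kernel
    // whose throughput flops_q4k_f32 / flops_q6k_f32 are meant to capture.
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_K, n_embd, n_embd);
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,  n_embd, 1);
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x); // result is F32, shape [n_embd, 1]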
@@ -3580,11 +3582,14 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.description = gpu_props.description;
     dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-    dev_info->gpu_props.metal_flops = device_metal_flops(model, GGML_TYPE_F32);
-    dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops(model, GGML_TYPE_F32);
-    dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops(model, GGML_TYPE_F16);
-    dev_info->gpu_props.cuda_flops_q8 = device_cuda_flops(model, GGML_TYPE_Q8_0);
-    dev_info->gpu_props.cuda_flops_q4k = device_cuda_flops(model, GGML_TYPE_Q4_K);
+    dev_info->gpu_props.metal_flops_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32);
+    dev_info->gpu_props.metal_flops_f16 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F16);
+    dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
+    dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F16);
+    dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
 
     if (dev_info->rank == 0) {
         struct model_flops * ffo = &dev_info->model_flops;
@@ -10687,7 +10692,9 @@ struct llm_build_context {
         // build the input layer as a seperate subgraph
         ggml_build_forward_expand(sub_gf, inpL);
         sub_gfs.push_back(sub_gf);
 
         sub_gf = nullptr;
+        inpL = nullptr;
     }
 
     // inpB - contains the output embedding from other nodes
 
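On the second half of the commit title ("fix duplicate inp_embd in subgraphs"): the only change visible in this hunk is the added inpL = nullptr; next to the existing sub_gf = nullptr;. Read with the surrounding lines, the apparent intent is that once the input layer has been expanded into its own subgraph, both handles are cleared so the next subgraph starts from scratch rather than pulling the same input-embedding tensor in a second time. The annotated restatement below is an interpretation of the hunk, not additional code from the commit:

    // build the input layer as a separate subgraph
    ggml_build_forward_expand(sub_gf, inpL); // finalize the input-layer subgraph
    sub_gfs.push_back(sub_gf);               // hand it over to the list of subgraphs

    sub_gf = nullptr; // a fresh graph will be started for the following layers
    inpL   = nullptr; // drop the handle so inp_embd is not expanded into the next subgraph as well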