Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-08 04:39:04 +00:00)
add cpu flops test
parent 2bd4d03aa8
commit 5fae6ac36f
6 changed files with 146 additions and 44 deletions
Makefile (83 changes)

@@ -1,45 +1,46 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = \
-    libllava.a \
-    llama-baby-llama \
-    llama-batched \
-    llama-batched-bench \
-    llama-bench \
-    llama-cli \
-    llama-convert-llama2c-to-ggml \
-    llama-embedding \
-    llama-eval-callback \
-    llama-export-lora \
-    llama-gbnf-validator \
-    llama-gguf \
-    llama-gguf-hash \
-    llama-gguf-split \
-    llama-gritlm \
-    llama-imatrix \
-    llama-infill \
-    llama-llava-cli \
-    llama-minicpmv-cli\
-    llama-lookahead \
-    llama-lookup \
-    llama-lookup-create \
-    llama-lookup-merge \
-    llama-lookup-stats \
-    llama-parallel \
-    llama-passkey \
-    llama-perplexity \
-    llama-q8dot \
-    llama-quantize \
-    llama-quantize-stats \
-    llama-retrieval \
-    llama-save-load-state \
-    llama-server \
-    llama-simple \
-    llama-speculative \
-    llama-tokenize \
-    llama-vdot \
-    llama-cvector-generator \
-    llama-gen-docs \
-    tests/test-c.o
+BUILD_TARGETS = llama-cli
+# BUILD_TARGETS = \
+#    libllava.a \
+#    llama-baby-llama \
+#    llama-batched \
+#    llama-batched-bench \
+#    llama-bench \
+#    llama-cli \
+#    llama-convert-llama2c-to-ggml \
+#    llama-embedding \
+#    llama-eval-callback \
+#    llama-export-lora \
+#    llama-gbnf-validator \
+#    llama-gguf \
+#    llama-gguf-hash \
+#    llama-gguf-split \
+#    llama-gritlm \
+#    llama-imatrix \
+#    llama-infill \
+#    llama-llava-cli \
+#    llama-minicpmv-cli\
+#    llama-lookahead \
+#    llama-lookup \
+#    llama-lookup-create \
+#    llama-lookup-merge \
+#    llama-lookup-stats \
+#    llama-parallel \
+#    llama-passkey \
+#    llama-perplexity \
+#    llama-q8dot \
+#    llama-quantize \
+#    llama-quantize-stats \
+#    llama-retrieval \
+#    llama-save-load-state \
+#    llama-server \
+#    llama-simple \
+#    llama-speculative \
+#    llama-tokenize \
+#    llama-vdot \
+#    llama-cvector-generator \
+#    llama-gen-docs \
+#    tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -890,9 +890,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     // get device profile
+    LOG_INF("Start profiling this device, this may take some seconds ...\n");
+
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, params.model.c_str());
+    llama_profile_device(&dev_info, model, params.model.c_str(), params.cpuparams.n_threads);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
@@ -73,6 +73,69 @@ uint32_t device_cpu_cores() {
     return core_count;
 }
 
+float device_cpu_flops(struct llama_model * model, enum ggml_type dtype, int n_threads) {
+    // define matrix dimensions
+    const int n_embd = llama_n_embd(model);
+    const int n_ff_hidden = llama_n_ff_hidden(model);
+    const int rows_A = n_embd, cols_A = n_ff_hidden;
+    const int rows_B = n_embd, cols_B = n_ff_hidden;
+
+    // calculate memory size needed for ggml_context allocation
+    size_t ctx_size = 0;
+    ctx_size += rows_A * cols_A * ggml_type_size(dtype); // tensor a
+    ctx_size += rows_B * cols_B * ggml_type_size(dtype); // tensor b
+    ctx_size += rows_A * rows_B * ggml_type_size(dtype); // result
+    ctx_size += 3 * ggml_tensor_overhead(); // metadata for 3 tensors
+    ctx_size += ggml_graph_overhead(); // compute graph
+    ctx_size = (size_t)(ctx_size * 1.2); // some overhead
+
+    // allocate ggml_context
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    // create tensors and set data
+    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, cols_A, rows_A);
+    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, cols_B, rows_B);
+
+    // fill tensors with random data
+    float * matrix_A = (float *)malloc(rows_A * cols_A * sizeof(float));
+    float * matrix_B = (float *)malloc(rows_B * cols_B * sizeof(float));
+
+    for (int i = 0; i < rows_A * cols_A; i++) {
+        matrix_A[i] = (float)(rand() % 100) / 10.0f; // random float between 0.0 and 10.0
+    }
+    for (int i = 0; i < rows_B * cols_B; i++) {
+        matrix_B[i] = (float)(rand() % 100) / 10.0f;
+    }
+
+    memcpy(tensor_a->data, matrix_A, ggml_nbytes(tensor_a));
+    memcpy(tensor_b->data, matrix_B, ggml_nbytes(tensor_b));
+
+    free(matrix_A);
+    free(matrix_B);
+
+    // create ggml_cgraph for multiplication
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    struct ggml_tensor * result = ggml_mul_mat(ctx, tensor_a, tensor_b);
+    ggml_build_forward_expand(gf, result);
+
+    // run the computation
+    int64_t start_time = ggml_time_us();
+    ggml_graph_compute_with_ctx(ctx, gf, n_threads);
+    int64_t end_time = ggml_time_us();
+
+    double elapsed_seconds = (end_time - start_time) / 1e6;
+    double flops = (2.0 * rows_A * cols_A * cols_B) / elapsed_seconds / 1e9;
+
+    // free memory
+    ggml_free(ctx);
+    return (float)flops;
+}
+
 uint64_t device_physical_memory(bool available) {
     uint64_t memory = 0;
 
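For orientation: the value returned above is 2 * rows_A * cols_A * cols_B divided by the measured wall-clock time and scaled by 1e9, so the "CPU flops" figure is effectively reported in GFLOPS. A minimal standalone sketch of that arithmetic, using hypothetical LLaMA-7B-like dimensions (n_embd = 4096, n_ff_hidden = 11008) and a made-up timing that is not taken from this commit:

    #include <stdio.h>

    /* Worked example of the GFLOPS formula used by device_cpu_flops().
     * The dimensions and the 2.0 s timing are hypothetical. */
    int main(void) {
        const double rows_A = 4096;   /* n_embd      */
        const double cols_A = 11008;  /* n_ff_hidden */
        const double cols_B = 11008;  /* n_ff_hidden */
        const double elapsed_seconds = 2.0;

        double flops = (2.0 * rows_A * cols_A * cols_B) / elapsed_seconds / 1e9;
        printf("reported CPU flops: %.1f (GFLOPS)\n", flops);  /* ~496.3 */
        return 0;
    }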
@@ -344,6 +407,18 @@ void device_print_props(struct device_info * dev_info_set, int n) {
     }
     LOG_INF("\n");
 
+    LOG_INF("| CPU flops (F32) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (F16) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Physical Mem Total (GB) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
@@ -467,6 +542,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
                 + gpu_description_len
                 + sizeof(float)             // disk_read_bandwidth
                 + sizeof(uint32_t)          // cpu_props.cores
+                + sizeof(float) * 2         // cpu_props.flops_f32 and cpu_props.flops_f16
                 + sizeof(struct memory_info)
                 + sizeof(struct gpu_support)
                 + sizeof(float) * 2;        // gpu_props.memory_free and gpu_props.memory_total
@@ -511,6 +587,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_f16, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
 
@@ -579,6 +661,12 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
 
+    memcpy(&dev_info->cpu_props.flops_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_f16, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
 
@@ -1,12 +1,15 @@
 #ifndef PROFILER_H
 #define PROFILER_H
 
+#include "ggml.h"
 #include "llama.h"
 
 struct cpu_props {
     const char * name;
     const char * description;
     uint32_t cores;
+    float flops_f32;
+    float flops_f16;
 };
 
 struct memory_info {
@@ -47,6 +50,7 @@ struct device_info {
 const char * device_name(void);
 
 uint32_t device_cpu_cores      (void);
+float    device_cpu_flops      (struct llama_model * model, enum ggml_type dtype, int n_threads);
 uint64_t device_physical_memory(bool available);
 uint64_t device_swap_memory    (bool available);
 uint64_t device_disk_read_bw   (const char * test_file, size_t buffer_size_mb);
@@ -410,7 +410,7 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
-    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file);
+    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads);
     LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
 
     //optional:
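To make the signature change concrete, here is a minimal, hypothetical caller of the updated API. It is only a sketch: the model path and the thread count are made up, and it assumes the usual upstream llama.cpp loaders (llama_model_default_params, llama_load_model_from_file) plus the struct device_info definition from profiler.h.

    #include <stdio.h>
    #include "llama.h"
    #include "profiler.h"   /* assumed to provide struct device_info */

    int main(void) {
        llama_backend_init();

        /* hypothetical GGUF path, also passed to the profiler as test_file */
        const char * model_path = "model.gguf";
        struct llama_model_params mparams = llama_model_default_params();
        struct llama_model * model = llama_load_model_from_file(model_path, mparams);

        struct device_info dev_info = {0};
        dev_info.rank = 0;

        /* new: the caller now supplies the thread count used for the CPU FLOPS test */
        llama_profile_device(&dev_info, model, model_path, /*n_threads=*/4);

        printf("CPU flops (F32): %.1f\n", dev_info.cpu_props.flops_f32);
        printf("CPU flops (F16): %.1f\n", dev_info.cpu_props.flops_f16);

        llama_free_model(model);
        llama_backend_free();
        return 0;
    }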
@@ -476,6 +476,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_head     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_ff_hidden(const struct llama_model * model);
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
@@ -3546,9 +3546,11 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
-void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file) {
+void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
+    dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, n_threads);
+    dev_info->cpu_props.flops_f16 = device_cpu_flops(model, GGML_TYPE_F16, n_threads);
 
     dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100;
@@ -20429,6 +20431,10 @@ int32_t llama_n_head(const struct llama_model * model) {
     return model->hparams.n_head();
 }
 
+int32_t llama_n_ff_hidden(const struct llama_model * model) {
+    return model->hparams.n_ff_arr[0];
+}
+
 const struct llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }