diff --git a/Makefile b/Makefile
index f2fe5c24..3ff09f78 100644
--- a/Makefile
+++ b/Makefile
@@ -1,45 +1,46 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = \
-	libllava.a \
-	llama-baby-llama \
-	llama-batched \
-	llama-batched-bench \
-	llama-bench \
-	llama-cli \
-	llama-convert-llama2c-to-ggml \
-	llama-embedding \
-	llama-eval-callback \
-	llama-export-lora \
-	llama-gbnf-validator \
-	llama-gguf \
-	llama-gguf-hash \
-	llama-gguf-split \
-	llama-gritlm \
-	llama-imatrix \
-	llama-infill \
-	llama-llava-cli \
-	llama-minicpmv-cli\
-	llama-lookahead \
-	llama-lookup \
-	llama-lookup-create \
-	llama-lookup-merge \
-	llama-lookup-stats \
-	llama-parallel \
-	llama-passkey \
-	llama-perplexity \
-	llama-q8dot \
-	llama-quantize \
-	llama-quantize-stats \
-	llama-retrieval \
-	llama-save-load-state \
-	llama-server \
-	llama-simple \
-	llama-speculative \
-	llama-tokenize \
-	llama-vdot \
-	llama-cvector-generator \
-	llama-gen-docs \
-	tests/test-c.o
+BUILD_TARGETS = llama-cli
+# BUILD_TARGETS = \
+#	libllava.a \
+#	llama-baby-llama \
+#	llama-batched \
+#	llama-batched-bench \
+#	llama-bench \
+#	llama-cli \
+#	llama-convert-llama2c-to-ggml \
+#	llama-embedding \
+#	llama-eval-callback \
+#	llama-export-lora \
+#	llama-gbnf-validator \
+#	llama-gguf \
+#	llama-gguf-hash \
+#	llama-gguf-split \
+#	llama-gritlm \
+#	llama-imatrix \
+#	llama-infill \
+#	llama-llava-cli \
+#	llama-minicpmv-cli\
+#	llama-lookahead \
+#	llama-lookup \
+#	llama-lookup-create \
+#	llama-lookup-merge \
+#	llama-lookup-stats \
+#	llama-parallel \
+#	llama-passkey \
+#	llama-perplexity \
+#	llama-q8dot \
+#	llama-quantize \
+#	llama-quantize-stats \
+#	llama-retrieval \
+#	llama-save-load-state \
+#	llama-server \
+#	llama-simple \
+#	llama-speculative \
+#	llama-tokenize \
+#	llama-vdot \
+#	llama-cvector-generator \
+#	llama-gen-docs \
+#	tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
diff --git a/common/common.cpp b/common/common.cpp
index a2f7b4cc..a2690a3c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -890,9 +890,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }
 
     // get device profile
+    LOG_INF("Start profiling this device, this may take a few seconds ...\n");
+
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, params.model.c_str());
+    llama_profile_device(&dev_info, model, params.model.c_str(), params.cpuparams.n_threads);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 44abfe6a..b4505c42 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -73,6 +73,69 @@ uint32_t device_cpu_cores() {
     return core_count;
 }
 
+float device_cpu_flops(struct llama_model * model, enum ggml_type dtype, int n_threads) {
+    // define matrix dimensions
+    const int n_embd = llama_n_embd(model);
+    const int n_ff_hidden = llama_n_ff_hidden(model);
+    const int rows_A = n_embd, cols_A = n_ff_hidden;
+    const int rows_B = n_embd, cols_B = n_ff_hidden;
+
+    // calculate memory size needed for ggml_context allocation
+    size_t ctx_size = 0;
+    ctx_size += rows_A * cols_A * ggml_type_size(dtype); // tensor a
+    ctx_size += rows_B * cols_B * ggml_type_size(dtype); // tensor b
+    ctx_size += rows_A * rows_B * ggml_type_size(dtype); // result
+    ctx_size += 3 * ggml_tensor_overhead(); // metadata for 3 tensors
+    ctx_size += ggml_graph_overhead(); // compute graph
+    ctx_size  = (size_t)(ctx_size * 1.2); // some overhead
+
+    // allocate ggml_context
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    // create tensors and set data
+    struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, cols_A, rows_A);
+    struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, cols_B, rows_B);
+
+    // fill tensors with random data
+    float * matrix_A = (float *)malloc(rows_A * cols_A * sizeof(float));
+    float * matrix_B = (float *)malloc(rows_B * cols_B * sizeof(float));
+
+    for (int i = 0; i < rows_A * cols_A; i++) {
+        matrix_A[i] = (float)(rand() % 100) / 10.0f; // random float between 0.0 and 10.0
+    }
+    for (int i = 0; i < rows_B * cols_B; i++) {
+        matrix_B[i] = (float)(rand() % 100) / 10.0f;
+    }
+
+    memcpy(tensor_a->data, matrix_A, ggml_nbytes(tensor_a));
+    memcpy(tensor_b->data, matrix_B, ggml_nbytes(tensor_b));
+
+    free(matrix_A);
+    free(matrix_B);
+
+    // create ggml_cgraph for multiplication
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    struct ggml_tensor * result = ggml_mul_mat(ctx, tensor_a, tensor_b);
+    ggml_build_forward_expand(gf, result);
+
+    // run the computation
+    int64_t start_time = ggml_time_us();
+    ggml_graph_compute_with_ctx(ctx, gf, n_threads);
+    int64_t end_time = ggml_time_us();
+
+    double elapsed_seconds = (end_time - start_time) / 1e6;
+    double flops = (2.0 * rows_A * rows_B * cols_A) / elapsed_seconds / 1e9; // 2*M*N*K for the (rows_A x rows_B) result with inner dim cols_A, in GFLOPS
+
+    // free memory
+    ggml_free(ctx);
+    return (float)flops;
+}
+
 uint64_t device_physical_memory(bool available) {
     uint64_t memory = 0;
 
@@ -344,6 +407,18 @@ void device_print_props(struct device_info * dev_info_set, int n) {
     }
     LOG_INF("\n");
 
+    LOG_INF("| CPU flops (F32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (F16, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Physical Mem Total (GB) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
@@ -467,6 +542,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
         + gpu_description_len
         + sizeof(float) // disk_read_bandwidth
        + sizeof(uint32_t) // cpu_props.cores
+        + sizeof(float) * 2 // cpu_props.flops_f32 and cpu_props.flops_f16
         + sizeof(struct memory_info)
         + sizeof(struct gpu_support)
         + sizeof(float) * 2; // gpu_props.memory_free and gpu_props.memory_total
@@ -511,6 +587,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_f16, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
 
@@ -579,6 +661,12 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
 
+    memcpy(&dev_info->cpu_props.flops_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_f16, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
 
diff --git a/common/profiler.h b/common/profiler.h
index b768b7cc..fc6d1b8a 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -1,12 +1,15 @@
 #ifndef PROFILER_H
 #define PROFILER_H
 
+#include "ggml.h"
 #include "llama.h"
 
 struct cpu_props {
     const char * name;
     const char * description;
     uint32_t cores;
+    float flops_f32;
+    float flops_f16;
 };
 
 struct memory_info {
@@ -47,6 +50,7 @@ struct device_info {
 const char * device_name(void);
 
 uint32_t device_cpu_cores      (void);
+float    device_cpu_flops      (struct llama_model * model, enum ggml_type dtype, int n_threads);
 uint64_t device_physical_memory(bool available);
 uint64_t device_swap_memory    (bool available);
 uint64_t device_disk_read_bw   (const char * test_file, size_t buffer_size_mb);
diff --git a/include/llama.h b/include/llama.h
index ff7d1599..1c61f624 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -410,7 +410,7 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);
 
-    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file);
+    LLAMA_API void llama_profile_device (struct device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads);
     LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
 
     //optional:
@@ -476,6 +476,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_head     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_ff_hidden(const struct llama_model * model);
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index b77e7ca9..a68ecfbc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3546,9 +3546,11 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
     GGML_UNUSED(model);
 }
 
-void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file) {
+void llama_profile_device(device_info * dev_info, struct llama_model * model, const char * test_file, int n_threads) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
+    dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, n_threads);
+    dev_info->cpu_props.flops_f16 = device_cpu_flops(model, GGML_TYPE_F16, n_threads);
 
     dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100;
@@ -20429,6 +20431,10 @@ int32_t llama_n_head(const struct llama_model * model) {
     return model->hparams.n_head();
 }
 
+int32_t llama_n_ff_hidden(const struct llama_model * model) {
+    return model->hparams.n_ff_arr[0];
+}
+
 const struct llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }
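
Note (not part of the patch): the sketch below illustrates how the extended profiling API is intended to be used. device_cpu_flops() times a single n_embd x n_ff_hidden by n_embd x n_ff_hidden matmul (shared inner dimension n_ff_hidden), roughly 2 * n_embd * n_embd * n_ff_hidden floating-point operations, and llama_profile_device() stores the result in GFLOPS in cpu_props.flops_f32 / flops_f16, which serialize()/deserialize() then carry to other ranks. The model path, thread count, and the standalone main() are illustrative placeholders, error handling is omitted, and the loader calls are the stock llama.cpp entry points.

// Hypothetical caller, mirroring the common.cpp change above; not part of the patch.
#include "llama.h"
#include "profiler.h" // assumes common/ is on the include path, as for common.cpp

#include <cstdio>

int main() {
    llama_backend_init();

    // load any GGUF model; its n_embd / n_ff_hidden define the benchmark matrix shapes
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("/path/to/model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    struct device_info dev_info;
    dev_info.rank = 0; // rank of this node, normally taken from gpt_params as in common.cpp

    // the model path doubles as the test file for the disk read-bandwidth probe
    llama_profile_device(&dev_info, model, "/path/to/model.gguf", /*n_threads=*/4);

    printf("CPU flops (F32): %.1f GFLOPS\n", dev_info.cpu_props.flops_f32);
    printf("CPU flops (F16): %.1f GFLOPS\n", dev_info.cpu_props.flops_f16);

    llama_free_model(model);
    llama_backend_free();
    return 0;
}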