diff --git a/common/common.cpp b/common/common.cpp
index 89707fb7..c90048dc 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1588,6 +1588,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     if (n_world == 1) {
         uint32_t n_layers = llama_model_n_layers(model);
+        // assign all layers to this device
         params.n_layer_window[0] = n_layers;
         cparams.n_layer_window[0] = n_layers;
 
@@ -1596,6 +1597,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
         params.n_gpu_layers = std::min((int32_t)n_layers, params.n_gpu_layers);
+        cparams.n_gpu_layers = params.n_gpu_layers;
+        mparams.n_gpu_layers = params.n_gpu_layers;
 #endif
 
     } else {
diff --git a/common/profiler.cpp b/common/profiler.cpp
index b842071c..b54bb0be 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -48,6 +48,16 @@
 #include
 
+static int gcd_int(int a, int b) {
+    while (b != 0) {
+        int t = b;
+        b = a % b;
+        a = t;
+    }
+    return a;
+}
+
+
 static size_t get_page_size() {
     size_t page_size = 0;
 
@@ -154,8 +164,25 @@ uint32_t device_cpu_cores() {
 
 static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
     int n_repeat = 1;
-    int n_embd = std::min(llama_n_embd(model), 4096);
-    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu
+    int n_embd = std::min(llama_n_embd(model), 4096);
+
+    // simulate small tensor calculation on cpu
+    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8;
+
+    // ensure that the block sizes of the tensors are compatible
+    int bs0 = ggml_blck_size(src0t);
+    int bs1 = ggml_blck_size(src1t);
+    int gcd = gcd_int(bs0, bs1);
+    int lcm = bs0 / gcd * bs1;
+
+    if (n_embd % bs0 != 0 || n_embd % bs1 != 0) {
+        if (n_embd < lcm) {
+            n_embd = 2 * lcm;
+        } else {
+            n_embd = 2 * (n_embd / lcm) * lcm;
+        }
+    }
+
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
     std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);
 
@@ -188,9 +215,6 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     };
     struct ggml_context * ctx = ggml_init(params);
-    if(n_embd < ggml_blck_size(src0t)){
-        n_embd = 2 * ggml_blck_size(src0t);
-    }
     struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd);
     struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd);
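
Reviewer note: a minimal, runnable sketch of what the `common.cpp` hunks fix. The `*_sketch` structs below are illustrative stand-ins (the real `gpt_params` and the model/context parameter structs have many more fields), and it assumes `mparams`/`cparams` were populated from `params` before the clamp runs, which is what makes the two new back-assignments necessary:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Hypothetical minimal stand-ins for the real parameter structs.
struct gpt_params_sketch     { int32_t n_gpu_layers; };
struct model_params_sketch   { int32_t n_gpu_layers; };
struct context_params_sketch { int32_t n_gpu_layers; };

int main() {
    uint32_t n_layers = 32;                  // llama_model_n_layers(model)
    gpt_params_sketch     params  = { 999 }; // user asked for more layers than exist
    model_params_sketch   mparams = { 999 }; // assumed copied from params earlier
    context_params_sketch cparams = { 999 };

    // The pre-existing clamp only fixed the gpt_params copy:
    params.n_gpu_layers = std::min((int32_t)n_layers, params.n_gpu_layers);

    // Without the two added lines, mparams/cparams would still carry 999.
    cparams.n_gpu_layers = params.n_gpu_layers;
    mparams.n_gpu_layers = params.n_gpu_layers;

    printf("params=%d mparams=%d cparams=%d\n",
           (int)params.n_gpu_layers, (int)mparams.n_gpu_layers, (int)cparams.n_gpu_layers);
    return 0;
}
```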
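And a standalone sketch of the block-size alignment that `device_flops()` now performs. `gcd_int` and the rounding arithmetic are copied from the diff; the `align_n_embd` wrapper and the concrete block sizes in `main` are my own illustration (per ggml, `ggml_blck_size()` returns 1 for `F32`, 32 for `Q4_0`, and 256 for the k-quants):

```cpp
// Standalone check of the n_embd alignment logic added to device_flops().
// ggml quantized types store data in fixed-size blocks, so a dimension
// that is not a multiple of both operands' block sizes cannot be used
// to build the benchmark matrices.
#include <cassert>
#include <cstdio>

// copied verbatim from the diff
static int gcd_int(int a, int b) {
    while (b != 0) {
        int t = b;
        b = a % b;
        a = t;
    }
    return a;
}

// hypothetical wrapper around the in-function logic from the diff,
// extracted here so it can be exercised in isolation
static int align_n_embd(int n_embd, int bs0, int bs1) {
    int gcd = gcd_int(bs0, bs1);
    int lcm = bs0 / gcd * bs1;

    if (n_embd % bs0 != 0 || n_embd % bs1 != 0) {
        if (n_embd < lcm) {
            n_embd = 2 * lcm;
        } else {
            n_embd = 2 * (n_embd / lcm) * lcm;
        }
    }
    return n_embd;
}

int main() {
    // already aligned (e.g. F32 x Q4_0: block sizes 1 and 32) -> unchanged
    assert(align_n_embd(4096, 1, 32) == 4096);
    // the CPU path's n_embd /= 8 still yields a multiple of 32 -> unchanged
    assert(align_n_embd(512, 1, 32) == 512);
    // misaligned and above lcm(32, 256) = 256 -> rounded to an even multiple of 256
    assert(align_n_embd(300, 32, 256) == 512);
    // misaligned and below the lcm -> bumped to 2 * lcm
    assert(align_n_embd(100, 32, 256) == 512);
    printf("alignment checks passed\n");
    return 0;
}
```

Note the rounding is not a plain round-up: any adjusted size is also doubled by the `2 *` factor, so a misaligned `n_embd` can come out nearly twice as large as it went in.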