diff --git a/common/profiler.cpp b/common/profiler.cpp
index 61f6a5c1..829c8f58 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -101,7 +101,7 @@ uint32_t device_cpu_cores() {
 static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
     int n_repeat = 1;
     int n_embd = std::min(llama_n_embd(model), 4096);
-    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu
+    // if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f); 
     std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);
 
@@ -1381,6 +1381,13 @@ static uint64_t device_termux_swappable_memory() {
     return total_swappable;
 }
 
+uint64_t device_swappable_memory() {
+    // Only the Termux probe exists: forward its result when the Termux bin directory is present, otherwise report 0.
+    const bool termux_prefix_found = access("/data/data/com.termux/files/usr/bin", F_OK) == 0;
+    const uint64_t swappable = termux_prefix_found ? device_termux_swappable_memory() : 0;
+    return swappable;
+}
+
 static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
     auto n_bytes     = dev_info.model_bytes;
     int n_layers     = llama_model_n_layers(model);
@@ -1463,18 +1470,12 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
         if (getenv("TERMUX_VERSION") != NULL) {
             // termux on android: swap has higher priority than releasing mmap
             // non-app memory that can be swapped to disk
-            float used_mem_can_swap = (float)(static_cast<double>(device_termux_swappable_memory()) / 1024.0 / 1024.0 / 1024.0);
-            float swapout_gib       = std::min(
-                std::min(0.0f, total_mem_needed - dev_info.memory.available_physical),
-                std::min(used_mem_can_swap, dev_info.memory.available_swap)
+            float swapout_gib = std::min(
+                std::max(0.0f, total_mem_needed - dev_info.memory.available_physical),
+                std::min(dev_info.memory.used_can_swap, dev_info.memory.available_swap)
             );
-            float disk_write_bw     = dev_info.disk.write_seq_bw * 1e9 / 1024.0 / 1024.0 / 1024.0;
-            float swapout_delay     = swapout_gib / disk_write_bw * 1000; // ms
-
-            float mmapin_gib        = total_mem_needed - (dev_info.memory.available_physical + swapout_gib);
-            float mmapin_delay      = mmapin_gib / disk_read_bw * 1000; // ms
-            
-            return swapout_delay + mmapin_delay;
+            float mmapin_gib = total_mem_needed - (dev_info.memory.available_physical + swapout_gib);
+            return mmapin_gib / disk_read_bw * 1000; // ms
         } else {
             // if this linux not in termux env, use sequantial read bandwidth
             // POSIX_FADV_SEQUENTIAL is set on linux
@@ -1592,6 +1593,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| Used Mem Swappable (GiB)     ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.2f   ", dev_info_set[i].memory.used_can_swap);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Swap Mem Total (GiB)         ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f   ", dev_info_set[i].memory.total_swap);
diff --git a/common/profiler.h b/common/profiler.h
index 1e65e5bd..acd6f5ac 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -37,13 +37,15 @@ struct cpu_props {
 struct memory_info {
     float        total_physical;     // in GiB
     float        available_physical; // in GiB
+    float        used_can_swap;      // in GiB
     float        total_swap;         // in GiB
     float        available_swap;     // in GiB
-    float        cpu_read_ram_bw;     // in GB/s
+    float        cpu_read_ram_bw;    // in GB/s
 
     memory_info() : 
         total_physical    (0.0f), 
         available_physical(0.0f), 
+        used_can_swap     (0.0f),
         total_swap        (0.0f), 
         available_swap    (0.0f), 
         cpu_read_ram_bw   (0.0f) {}
@@ -251,6 +253,7 @@ float    device_cuda_flops        (struct llama_model * model, enum ggml_type sr
 float    device_inp_embd_delay    (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);
 uint64_t device_physical_memory   (bool available);
 uint64_t device_swap_memory       (bool available);
+uint64_t device_swappable_memory  (); // returns 0 when the Termux bin directory is absent
 void     device_disk_seq_bw       (float * read_seq_bw, float * write_seq_bw, int n_threads);
 void     device_disk_rnd_bw       (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
 float    device_memory_bw         (int n_thread);
diff --git a/src/llama.cpp b/src/llama.cpp
index 418dd2ac..901ec2f9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3583,6 +3583,7 @@ void llama_profile_device(
 
     dev_info->memory.total_physical     = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true)  / (double)(1 << 30) * 100) / 100;
+    dev_info->memory.used_can_swap      = round(device_swappable_memory()     / (double)(1 << 30) * 100) / 100;
     dev_info->memory.total_swap         = round(device_swap_memory(false)     / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_swap     = round(device_swap_memory(true)      / (double)(1 << 30) * 100) / 100;
     dev_info->memory.cpu_read_ram_bw    = device_memory_bw(n_threads);