diff --git a/common/profiler.cpp b/common/profiler.cpp index 61f6a5c1..829c8f58 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -101,7 +101,7 @@ uint32_t device_cpu_cores() { static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) { int n_repeat = 1; int n_embd = std::min(llama_n_embd(model), 4096); - if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu + // if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu std::vector matrix_A(n_embd * n_embd, 1.0f); std::vector matrix_B(n_embd * n_embd, 1.0f / n_embd); @@ -1381,6 +1381,13 @@ static uint64_t device_termux_swappable_memory() { return total_swappable; } +uint64_t device_swappable_memory() { + if (access("/data/data/com.termux/files/usr/bin", F_OK) == 0) { + return device_termux_swappable_memory(); + } + return 0; +} + static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) { auto n_bytes = dev_info.model_bytes; int n_layers = llama_model_n_layers(model); @@ -1463,18 +1470,12 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam if (getenv("TERMUX_VERSION") != NULL) { // termux on android: swap has higher priority than releasing mmap // non-app memory that can be swapped to disk - float used_mem_can_swap = (float)(static_cast(device_termux_swappable_memory()) / 1024.0 / 1024.0 / 1024.0); - float swapout_gib = std::min( - std::min(0.0f, total_mem_needed - dev_info.memory.available_physical), - std::min(used_mem_can_swap, dev_info.memory.available_swap) + float swapout_gib = std::min( + std::max(0.0f, total_mem_needed - dev_info.memory.available_physical), + std::min(dev_info.memory.used_can_swap, dev_info.memory.available_swap) ); - float disk_write_bw = dev_info.disk.write_seq_bw * 1e9 / 1024.0 / 1024.0 / 1024.0; - float swapout_delay = swapout_gib / disk_write_bw * 1000; // ms - - float mmapin_gib = total_mem_needed - (dev_info.memory.available_physical + swapout_gib); - float mmapin_delay = mmapin_gib / disk_read_bw * 1000; // ms - - return swapout_delay + mmapin_delay; + float mmapin_gib = total_mem_needed - (dev_info.memory.available_physical + swapout_gib); + return mmapin_gib / disk_read_bw * 1000; // ms } else { // if this linux not in termux env, use sequantial read bandwidth // POSIX_FADV_SEQUENTIAL is set on linux @@ -1592,6 +1593,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| Used Mem Swappable (GiB) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.used_can_swap); + } + LOG_INF("\n"); + LOG_INF("| Swap Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_swap); diff --git a/common/profiler.h b/common/profiler.h index 1e65e5bd..acd6f5ac 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -37,13 +37,15 @@ struct cpu_props { struct memory_info { float total_physical; // in GiB float available_physical; // in GiB + float used_can_swap; // in GiB float total_swap; // in GiB float available_swap; // in GiB - float cpu_read_ram_bw; // in GB/s + float cpu_read_ram_bw; // in GB/s memory_info() : total_physical (0.0f), available_physical(0.0f), + used_can_swap (0.0f), total_swap (0.0f), available_swap (0.0f), cpu_read_ram_bw (0.0f) {} @@ -251,6 +253,7 @@ float device_cuda_flops (struct llama_model * model, enum ggml_type sr float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads); uint64_t device_physical_memory (bool available); uint64_t device_swap_memory (bool available); +uint64_t device_swappable_memory (); void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw, int n_threads); void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw, int n_threads); float device_memory_bw (int n_thread); diff --git a/src/llama.cpp b/src/llama.cpp index 418dd2ac..901ec2f9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3583,6 +3583,7 @@ void llama_profile_device( dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100; dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100; + dev_info->memory.used_can_swap = round(device_swappable_memory() / (double)(1 << 30) * 100) / 100; dev_info->memory.total_swap = round(device_swap_memory(false) / (double)(1 << 30) * 100) / 100; dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100; dev_info->memory.cpu_read_ram_bw = device_memory_bw(n_threads);