diff --git a/common/common.cpp b/common/common.cpp
index 942d513b..8cc7a964 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1201,8 +1201,7 @@ static bool assign_layers_to_device(
         }
 
         if (dev_gpu[m]) {
-            float reserved_mem = 0.1f; // reserved shared memory to avoid potential OOM, set to 100 MiB by default
-            vec_z_gpu[m] = (double)((dev.gpu_props.memory_free - reserved_mem) * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
+            vec_z_gpu[m] = (double)(dev.gpu_props.memory_free * GIGABYTE - c_gpu[m]) / (double)(n_layer * b_prime);
             if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
                 vec_z_gpu[m] -= (double)bo / (double)(n_layer * b_prime);
             }
diff --git a/src/llama.cpp b/src/llama.cpp
index 5b32ce90..0c3715a0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3620,12 +3620,10 @@ void llama_profile_device(
     dev_info->gpu_props.name        = gpu_props.name;
     dev_info->gpu_props.description = gpu_props.description;
-    dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL)
-    // GPU memory limitation
+    // reserved/limit memory to avoid potential OOM, default to 200 MiB
+    dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100 - 0.2;
     dev_info->gpu_props.memory_free = std::min((float)gpu_mem, dev_info->gpu_props.memory_free);
-#endif
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
@@ -18072,9 +18070,9 @@ static int llama_decode_internal(
                 manage_graph_tensors(sub_gf, POSIX_MADV_DONTNEED);
                 int next_gf_id = (i + 1) % gf.size();
-                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, true);
+                manage_graph_tensors(gf[next_gf_id], POSIX_MADV_WILLNEED, false);
                 if (my_rank == 0 && (is_last_l || (next_gf_id == (int)gf.size() - 1))) {
-                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, true);
+                    manage_graph_tensors(gf[0], POSIX_MADV_WILLNEED, false);
                 }
             }
         }