From 4dde8458cf2e4e13dae84ae074a945f3d011dd01 Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Tue, 24 Jun 2025 19:29:10 +0400
Subject: [PATCH] fix compute buffer estimate: reserve 100 MiB VRAM to avoid
 potential OOM

---
 src/llama.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 45b15223..a97e4ae1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -22012,8 +22012,9 @@ void llama_model_compute_buf_size(
     if (backend == BACKEND_CUDA) {
         // context GPU memory usage, i.e. the initial memory cost of creating a CUDA context,
         // even before you launch any kernels or allocate your own buffers.
-        // this value may vary by GPU and CUDA version, but it's lower than 400 MiB in most cases.
-        *gpu_buf += 400 * 1024 * 1024;
+        // this value may vary by GPU and CUDA version, but it's lower than 400 MiB in most cases;
+        // another 100 MiB is reserved on top of that to prevent accidental OOM.
+        *gpu_buf += 500 * 1024 * 1024;
     }
 }
 
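
--
Note (appended after the diff, ignored by git am): a minimal caller-side sketch of how the
estimate written to *gpu_buf, which already includes the full 500 MiB reserve (about 400 MiB
CUDA context plus the 100 MiB safety margin), might be checked against free VRAM. The helper
fits_in_free_vram is a hypothetical name for illustration; only cudaMemGetInfo() is a real
CUDA runtime call.

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical helper: returns true if the estimated compute buffer size
// (already including the 500 MiB reserve) fits into the currently free VRAM.
static bool fits_in_free_vram(size_t gpu_buf_bytes) {
    size_t free_bytes  = 0;
    size_t total_bytes = 0;
    if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
        return false; // VRAM query failed; be conservative and assume it does not fit
    }
    return gpu_buf_bytes <= free_bytes;
}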