From 72701ae8724fc2bfe7e86884b26b6762281e746f Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Tue, 24 Jun 2025 20:39:49 +0400
Subject: [PATCH] fix compute buffer estimate: reserve 200 MiB VRAM to avoid
 potential OOM

---
 src/llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index a97e4ae1..de548a66 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -22013,8 +22013,8 @@ void llama_model_compute_buf_size(
         // context GPU memory usage, i.e. the initial memory cost of creating a CUDA context,
         // even before you launch any kernels or allocate your own buffers.
         // this value may vary by GPU and CUDA version, but it's lower than 400 MiB in most cases,
-        // another 100 MiB is used to prevent accidental OOM.
-        *gpu_buf += 500 * 1024 * 1024;
+        // another 200 MiB is used to prevent accidental OOM.
+        *gpu_buf += 600 * 1024 * 1024;
     }
 }
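
Note (not part of the patch): the 600 MiB constant is the sum of the ~400 MiB worst-case CUDA context cost described in the comment plus the 200 MiB safety margin this patch adds. Below is a minimal standalone sketch for measuring the actual context cost on a given GPU, by taking NVML memory readings (NVML does not itself create a CUDA context) around a cudaFree(0) call, which forces context creation. It assumes an otherwise idle device 0; the file name and build line are illustrative only.

    // ctx_cost.cu -- illustrative sketch, not part of this patch.
    // Measures the baseline VRAM consumed by creating a CUDA context,
    // i.e. the overhead this patch reserves headroom for.
    // build (example): nvcc ctx_cost.cu -lnvidia-ml -o ctx_cost
    #include <cstdio>
    #include <cuda_runtime.h>
    #include <nvml.h>

    int main() {
        nvmlInit_v2();
        nvmlDevice_t dev;
        nvmlDeviceGetHandleByIndex(0, &dev);

        nvmlMemory_t before, after;
        nvmlDeviceGetMemoryInfo(dev, &before); // NVML query; no CUDA context yet

        cudaFree(0); // forces CUDA context creation on device 0

        nvmlDeviceGetMemoryInfo(dev, &after);
        printf("CUDA context cost: %.1f MiB\n",
               (after.used - before.used) / (1024.0 * 1024.0));

        nvmlShutdown();
        return 0;
    }

On an idle GPU the printed value approximates the per-process context overhead the comment refers to; if it comes in well under 400 MiB on the target hardware, the reserved headroom is conservative by design.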