From 4dde8458cf2e4e13dae84ae074a945f3d011dd01 Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Tue, 24 Jun 2025 19:29:10 +0400
Subject: [PATCH] fix compute buffer estimate: reserve 100 MiB VRAM to avoid
 potential OOM

---
 src/llama.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 45b15223..a97e4ae1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -22012,8 +22012,9 @@ void llama_model_compute_buf_size(
     if (backend == BACKEND_CUDA) {
         // context GPU memory usage, i.e. the initial memory cost of creating a CUDA context,
         // even before you launch any kernels or allocate your own buffers.
-        // this value may vary by GPU and CUDA version, but it's lower than 400 MiB in most cases.
-        *gpu_buf += 400 * 1024 * 1024;
+        // this value may vary by GPU and CUDA version, but it's lower than 400 MiB in most cases;
+        // another 100 MiB is reserved on top of that to prevent accidental OOM.
+        *gpu_buf += 500 * 1024 * 1024;
     }
 }
 
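
--
Note (appended after the diff, ignored by git am): a minimal caller-side sketch of how the
estimate written to *gpu_buf, which already includes the full 500 MiB reserve (about 400 MiB
CUDA context plus the 100 MiB safety margin), might be checked against free VRAM. The helper
fits_in_free_vram is a hypothetical name for illustration; only cudaMemGetInfo() is a real
CUDA runtime call.

#include <cuda_runtime.h>
#include <cstddef>

// Hypothetical helper: returns true if the estimated compute buffer size
// (already including the 500 MiB reserve) fits into the currently free VRAM.
static bool fits_in_free_vram(size_t gpu_buf_bytes) {
    size_t free_bytes  = 0;
    size_t total_bytes = 0;
    if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
        return false; // VRAM query failed; be conservative and assume it does not fit
    }
    return gpu_buf_bytes <= free_bytes;
}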