fix compute buffer estimate: reserve 100 MiB VRAM to avoid potential OOM

commit 4dde8458cf (parent 90b1079d78)
Author: Li, Zonghang
Date:   2025-06-24 19:29:10 +04:00


@@ -22012,8 +22012,9 @@ void llama_model_compute_buf_size(
     if (backend == BACKEND_CUDA) {
         // context GPU memory usage, i.e. the initial memory cost of creating a CUDA context,
         // even before you launch any kernels or allocate your own buffers.
-        // this value may vary by GPU and CUDA version, but it's lower than 400 MiB in most cases.
-        *gpu_buf += 400 * 1024 * 1024;
+        // this value may vary by GPU and CUDA version, but it's lower than 400 MiB in most cases,
+        // another 100 MiB is used to prevent accidental OOM.
+        *gpu_buf += 500 * 1024 * 1024;
     }
 }
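For context, the change pads the estimated compute buffer before it is weighed against available VRAM: roughly 400 MiB for the CUDA context itself plus a 100 MiB safety margin. A minimal standalone sketch of that idea follows; the constant names and the padded_compute_buf helper are hypothetical illustrations, and only the combined 500 MiB reserve mirrors the actual change above.

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Hypothetical constants mirroring the diff: ~400 MiB for the CUDA context
// plus a 100 MiB margin to prevent accidental OOM.
static const int64_t CUDA_CTX_RESERVE  = 400ll * 1024 * 1024;
static const int64_t OOM_SAFETY_MARGIN = 100ll * 1024 * 1024;

// Hypothetical helper: pad a raw compute-buffer estimate when the
// backend is CUDA, so the caller reserves enough VRAM up front.
static int64_t padded_compute_buf(int64_t raw_estimate, bool is_cuda) {
    int64_t buf = raw_estimate;
    if (is_cuda) {
        buf += CUDA_CTX_RESERVE + OOM_SAFETY_MARGIN; // 500 MiB total
    }
    return buf;
}

int main() {
    // e.g. a 1 GiB raw estimate on a CUDA device -> ~1.5 GiB reserved
    int64_t need = padded_compute_buf(1024ll * 1024 * 1024, /*is_cuda=*/true);
    printf("reserve %" PRId64 " MiB of VRAM\n", need / (1024 * 1024));
    return 0;
}

Folding the margin into a single constant (500 MiB) keeps the estimate conservative without tracking the two costs separately at every call site.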