Merge branch 'master' into concedo_experimental

# Conflicts:
#	README.md
2025-09-13 02:19:41 +00:00 · 2023-12-27 21:43:46 +08:00 · 2023-12-27 21:43:46 +08:00 · 69ab1bf2f8
commit 69ab1bf2f8
parent 5b2d93a1f8 b47879b0dd
7 changed files with 406 additions and 593 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -9784,7 +9784,8 @@ struct llama_context * llama_new_context_with_model(
            ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
 #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
            if (model->n_gpu_layers > 0) {
-                ggml_cuda_set_scratch_size(alloc_size);
+                // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
+                ggml_cuda_set_scratch_size(alloc_size + 64);
                LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

                // calculate total VRAM usage