increase pool buffers

2026-05-30 20:33:39 +00:00 · 2024-01-20 11:52:39 +08:00 · 2024-01-20 11:52:39 +08:00 · 97693e7e97
commit 97693e7e97
parent 21f0ce2502
2 changed files with 10 additions and 10 deletions
--- a/otherarch/ggml_v2-cuda-legacy.cu
+++ b/otherarch/ggml_v2-cuda-legacy.cu
@@ -385,7 +385,7 @@ static to_fp32_cuda_t ggml_v2_get_to_fp32_cuda(ggml_v2_type type) {
 }

 // buffer pool for cuda
-#define MAX_CUDA_BUFFERS 16
+#define MAX_CUDA_BUFFERS_V2 16

 struct scoped_spin_lock {
    std::atomic_flag& lock;
@@ -406,13 +406,13 @@ struct cuda_buffer {
    size_t size = 0;
 };

-static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS_V2];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;

 static void * ggml_v2_cuda_pool_malloc(size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);

-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+    for (int i = 0; i < MAX_CUDA_BUFFERS_V2; ++i) {
        cuda_buffer& b = g_cuda_buffer_pool[i];
        if (b.size >= size && b.ptr != nullptr) {
            void * ptr = b.ptr;
@@ -431,7 +431,7 @@ static void * ggml_v2_cuda_pool_malloc(size_t size, size_t * actual_size) {
 static void ggml_v2_cuda_pool_free(void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);

-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+    for (int i = 0; i < MAX_CUDA_BUFFERS_V2; ++i) {
        cuda_buffer& b = g_cuda_buffer_pool[i];
        if (b.ptr == nullptr) {
            b.ptr = ptr;
@@ -439,7 +439,7 @@ static void ggml_v2_cuda_pool_free(void * ptr, size_t size) {
            return;
        }
    }
-    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS_V2\n");
    CUDA_CHECK(cudaFree(ptr));
 }

--- a/otherarch/ggml_v3-cuda.cu
+++ b/otherarch/ggml_v3-cuda.cu
@@ -7254,7 +7254,7 @@ static void im2col_f32_f16_cuda(const float* x, half* dst,
 }

 // buffer pool for cuda
-#define MAX_CUDA_BUFFERS 256
+#define MAX_CUDA_BUFFERS_V3 512

 struct scoped_spin_lock {
    std::atomic_flag& lock;
@@ -7278,7 +7278,7 @@ struct ggml_v3_cuda_buffer {
    size_t size = 0;
 };

-static ggml_v3_cuda_buffer g_cuda_buffer_pool[GGML_V3_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
+static ggml_v3_cuda_buffer g_cuda_buffer_pool[GGML_V3_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS_V3];
 static size_t g_cuda_pool_size[GGML_V3_CUDA_MAX_DEVICES] = {0};

 static void * ggml_v3_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
@@ -7289,7 +7289,7 @@ static void * ggml_v3_cuda_pool_malloc_leg(int device, size_t size, size_t * act
    int worst_i = -1;
    size_t worst_size = 0; //largest unused buffer seen so far

-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+    for (int i = 0; i < MAX_CUDA_BUFFERS_V3; ++i) {
        ggml_v3_cuda_buffer& b = g_cuda_buffer_pool[device][i];
        if (b.size > 0 && b.size >= size && b.size < best_size)
        {
@@ -7336,7 +7336,7 @@ static void * ggml_v3_cuda_pool_malloc_leg(int device, size_t size, size_t * act
 static void ggml_v3_cuda_pool_free_leg(int device, void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);

-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+    for (int i = 0; i < MAX_CUDA_BUFFERS_V3; ++i) {
        ggml_v3_cuda_buffer& b = g_cuda_buffer_pool[device][i];
        if (b.ptr == nullptr) {
            b.ptr = ptr;
@@ -7344,7 +7344,7 @@ static void ggml_v3_cuda_pool_free_leg(int device, void * ptr, size_t size) {
            return;
        }
    }
-    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS_V3\n");
    ggml_v3_cuda_set_device(device);
    CUDA_CHECK(cudaFree(ptr));
    g_cuda_pool_size[device] -= size;