From 3f008f2ad9b7e106599dbaa9308a759d54629d94 Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Tue, 26 Nov 2024 22:28:34 +0400
Subject: [PATCH] add op GGML_OP_READ

---
 ggml/include/ggml.h         |  1 +
 ggml/src/ggml-cuda.cu       |  4 +++
 ggml/src/ggml-cuda/read.cu  | 31 +++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/read.cuh |  7 +++++++
 4 files changed, 43 insertions(+)
 create mode 100644 ggml/src/ggml-cuda/read.cu
 create mode 100644 ggml/src/ggml-cuda/read.cuh

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 96b8b08c..962dc032 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -530,6 +530,7 @@ extern "C" {
         GGML_OP_OPT_STEP_ADAMW,
 
+        GGML_OP_READ,
         GGML_OP_COUNT,
     };
 
     enum ggml_unary_op {
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index edb61abd..791b6861 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -28,6 +28,7 @@
 #include "ggml-cuda/pad.cuh"
 #include "ggml-cuda/pool2d.cuh"
 #include "ggml-cuda/quantize.cuh"
+#include "ggml-cuda/read.cuh"
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/scale.cuh"
 #include "ggml-cuda/softmax.cuh"
@@ -2145,6 +2146,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
     }
 
     switch (dst->op) {
+        case GGML_OP_READ:
+            ggml_cuda_read(ctx, dst);
+            break;
         case GGML_OP_ARGMAX:
             ggml_cuda_argmax(ctx, dst);
             break;
diff --git a/ggml/src/ggml-cuda/read.cu b/ggml/src/ggml-cuda/read.cu
new file mode 100644
index 00000000..47ca250b
--- /dev/null
+++ b/ggml/src/ggml-cuda/read.cu
@@ -0,0 +1,31 @@
+#include "common.cuh"
+#include "read.cuh"
+
+// Touch every element of a contiguous F32 buffer so the data is actually
+// fetched from VRAM. The empty inline-asm sink consumes the loaded value,
+// preventing the compiler from eliminating the dead load.
+static __global__ void read_vram_f32(const float * __restrict__ data, const int64_t ne) {
+    const int64_t idx = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
+    if (idx >= ne) {
+        return;
+    }
+
+    volatile float value = data[idx];
+    asm volatile("" : : "f"(value) : "memory");
+}
+
+// Forces a read of all elements of dst from VRAM on the context stream.
+// Asynchronous: work is queued on ctx.stream(); the caller synchronizes.
+void ggml_cuda_read(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(dst)); // flat indexing below relies on this
+
+    const int64_t ne = ggml_nelements(dst);
+    if (ne == 0) {
+        return;
+    }
+
+    const int64_t num_blocks = (ne + CUDA_READ_BLOCK_SIZE - 1) / CUDA_READ_BLOCK_SIZE;
+    read_vram_f32<<<num_blocks, CUDA_READ_BLOCK_SIZE, 0, ctx.stream()>>>((const float *) dst->data, ne);
+    CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/read.cuh b/ggml/src/ggml-cuda/read.cuh
new file mode 100644
index 00000000..0cd27d90
--- /dev/null
+++ b/ggml/src/ggml-cuda/read.cuh
@@ -0,0 +1,7 @@
+#pragma once
+
+#include "common.cuh"
+
+#define CUDA_READ_BLOCK_SIZE 256
+
+void ggml_cuda_read(ggml_backend_cuda_context & ctx, ggml_tensor * dst);