Merge branch 'master' into concedo_experimental

# Conflicts: # Makefile # README.md
2025-09-16 03:49:42 +00:00 · 2023-12-23 00:01:21 +08:00 · 2023-12-23 00:01:21 +08:00 · b814bb217d
commit b814bb217d
parent 3bca03d26b ba66175132
3 changed files with 15 additions and 8 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -90,6 +90,13 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+// CUDA 10.2 does not have these macro definitions.
+#ifndef CUBLAS_TF32_TENSOR_OP_MATH
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif
 #endif // defined(GGML_USE_HIPBLAS)

 #include "ggml-cuda.h"
@ -5267,17 +5274,17 @@ static  __global__ void im2col_f32_f16(
    const int ky = (i - kd) / OW;
    const int ix = i % OW;

-    const int iiw = ix * s0 + kx * d0 - p0;
-    const int iih = blockIdx.y * s1 + ky * d1 - p1;
+    const int64_t iiw = ix * s0 + kx * d0 - p0;
+    const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;

-    const int offset_dst =
+    const int64_t offset_dst =
        (blockIdx.y * OW + ix) * CHW +
        (blockIdx.z * (KW * KH) + ky * KW + kx);

    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
        dst[offset_dst] = __float2half(0.0f);
    } else {
-        const int offset_src = blockIdx.z * offset_delta;
+        const int64_t offset_src = blockIdx.z * offset_delta;
        dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
    }
 }
@ -8835,7 +8842,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
        const cudaMemcpyKind dst_kind  =  dst->backend == GGML_BACKEND_CPU ?
-            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+            cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;

        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
            const struct ggml_tensor * src0_row = dst->src[row_id + 2];