From 8018e5222a14c5d7686c31802c0681bb29c00afa Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 21 Sep 2025 21:56:53 +0800
Subject: [PATCH] try fix compile issues rocm (+4 squashed commit)

Squashed commit:

[9e33a33f2] try fix compile issues rocm

[36e5eb56b] try fix compile issues rocm

[6ca18812b] try fix compile issues rocm

[b56ad2e8f] try fix compile issues rocm
---
 otherarch/ggml_v2-cuda-legacy.cu |  9 ++++++
 otherarch/ggml_v2-cuda.cu        |  9 ++++++
 otherarch/ggml_v3-cuda.cu        | 49 +++++++++++++++++++-------------
 3 files changed, 48 insertions(+), 19 deletions(-)
diff --git a/otherarch/ggml_v2-cuda-legacy.cu b/otherarch/ggml_v2-cuda-legacy.cu
index 29b0014e7..b55c71715 100644
--- a/otherarch/ggml_v2-cuda-legacy.cu
+++ b/otherarch/ggml_v2-cuda-legacy.cu
@@ -8,8 +8,17 @@
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
+
+#if HIP_VERSION >= 60500000
+#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
+#else
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#endif // HIP_VERSION >= 6050000
+
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
diff --git a/otherarch/ggml_v2-cuda.cu b/otherarch/ggml_v2-cuda.cu
index ff5632ec1..a2cef23f1 100644
--- a/otherarch/ggml_v2-cuda.cu
+++ b/otherarch/ggml_v2-cuda.cu
@@ -8,8 +8,17 @@
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
+
+#if HIP_VERSION >= 60500000
+#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
+#else
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#endif // HIP_VERSION >= 6050000
+
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
 #define CUBLAS_OP_T HIPBLAS_OP_T
diff --git a/otherarch/ggml_v3-cuda.cu b/otherarch/ggml_v3-cuda.cu
index 2cabcbff4..593509a54 100644
--- a/otherarch/ggml_v3-cuda.cu
+++ b/otherarch/ggml_v3-cuda.cu
@@ -9,6 +9,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <vector>
+#include <cstring>
 
 
 #if defined(GGML_USE_HIP)
@@ -19,9 +20,21 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+
+#if HIP_VERSION >= 60500000
+#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
+#define cublasComputeType_t hipblasComputeType_t
+#define cudaDataType_t hipDataType
+#else
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define cublasComputeType_t hipblasDatatype_t
+#define cudaDataType_t hipblasDatatype_t
+#endif // HIP_VERSION >= 6050000
+
 #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
 #define CUBLAS_OP_N HIPBLAS_OP_N
@@ -31,7 +44,6 @@
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
 #define cublasGemmEx hipblasGemmEx
 #define cublasGemmBatchedEx hipblasGemmBatchedEx
@@ -41,7 +53,6 @@
 #define cublasSetStream hipblasSetStream
 #define cublasSgemm hipblasSgemm
 #define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
 #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -7857,7 +7868,7 @@ static void ggml_v3_cuda_op_leaky_relu(
     GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32);
 
     float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
+    std::memcpy(&negative_slope, dst->op_params, sizeof(float));
 
     leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_v3_nelements(src0), negative_slope, main_stream);
 
@@ -7891,7 +7902,7 @@ static void ggml_v3_cuda_op_norm(
     const int64_t nrows = ggml_v3_nrows(src0);
 
     float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
+    std::memcpy(&eps, dst->op_params, sizeof(float));
 
     norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
@@ -7977,7 +7988,7 @@ static void ggml_v3_cuda_op_rms_norm(
     const int64_t nrows = ggml_v3_nrows(src0);
 
     float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
+    std::memcpy(&eps, dst->op_params, sizeof(float));
 
     rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
@@ -8370,12 +8381,12 @@ static void ggml_v3_cuda_op_rope(
 
     // RoPE alteration for extended context
     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    std::memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    std::memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    std::memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    std::memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    std::memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    std::memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
     const int32_t * pos = nullptr;
     if ((mode & 1) == 0) {
@@ -8444,7 +8455,7 @@ static void ggml_v3_cuda_op_alibi(
     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+    std::memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
     //GGML_V3_ASSERT(ne01 + n_past == ne00);
     GGML_V3_ASSERT(n_head == ne02);
@@ -8565,7 +8576,7 @@ static void ggml_v3_cuda_op_soft_max(
     const int64_t nrows_y = src1 ? ggml_v3_nrows(src1) : 1;
 
     float scale = 1.0f;
-    memcpy(&scale, dst->op_params, sizeof(float));
+    std::memcpy(&scale, dst->op_params, sizeof(float));
 
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
 #ifdef GGML_V3_CUDA_F16
@@ -8594,7 +8605,7 @@ static void ggml_v3_cuda_op_scale(
     GGML_V3_ASSERT( dst->type == GGML_V3_TYPE_F32);
 
     float scale;
-    memcpy(&scale, dst->op_params, sizeof(float));
+    std::memcpy(&scale, dst->op_params, sizeof(float));
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_v3_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -8613,8 +8624,8 @@ static void ggml_v3_cuda_op_clamp(
 
     float min;
     float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+    std::memcpy(&min, dst->op_params, sizeof(float));
+    std::memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
     clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_v3_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -9643,7 +9654,7 @@ static void ggml_v3_cuda_mul_mat_id(const ggml_v3_tensor * src0, const ggml_v3_t
         CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_v3_nbytes(ids), cudaMemcpyDeviceToHost, stream));
         CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
-        memcpy(ids_host.data(), ids->data, ggml_v3_nbytes(ids));
+        std::memcpy(ids_host.data(), ids->data, ggml_v3_nbytes(ids));
     }
 
     const ggml_v3_tensor_extra_gpu * src1_extra = (const ggml_v3_tensor_extra_gpu *) src1->extra;
@@ -10020,7 +10031,7 @@ static void ggml_v3_cuda_assign_buffers_impl(struct ggml_v3_tensor * tensor, boo
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device_v3];
         size_t offset = 0;
         if (tensor->op == GGML_V3_OP_VIEW) {
-            memcpy(&offset, tensor->op_params, sizeof(size_t));
+            std::memcpy(&offset, tensor->op_params, sizeof(size_t));
         }
         extra = ggml_v3_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device_v3] = src0_ddc + offset;
@@ -10076,7 +10087,7 @@ void ggml_v3_cuda_assign_scratch_offset(struct ggml_v3_tensor * tensor, size_t o
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device_v3];
         size_t view_offset = 0;
         if (tensor->op == GGML_V3_OP_VIEW) {
-            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+            std::memcpy(&view_offset, tensor->op_params, sizeof(size_t));
         }
         extra->data_device[g_main_device_v3] = src0_ddc + view_offset;
     } else {