Merge branch 'master' into concedo_experimental

# Conflicts:
#	Makefile
#	README.md
This commit is contained in:
Concedo 2023-12-23 00:01:21 +08:00
commit b814bb217d
3 changed files with 15 additions and 8 deletions

View file

@ -90,6 +90,13 @@
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
// CUDA 10.2 does not have these macro definitions.
#ifndef CUBLAS_TF32_TENSOR_OP_MATH
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif
#endif // defined(GGML_USE_HIPBLAS)
#include "ggml-cuda.h"
@ -5267,17 +5274,17 @@ static __global__ void im2col_f32_f16(
const int ky = (i - kd) / OW;
const int ix = i % OW;
const int iiw = ix * s0 + kx * d0 - p0;
const int iih = blockIdx.y * s1 + ky * d1 - p1;
const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
const int offset_dst =
const int64_t offset_dst =
(blockIdx.y * OW + ix) * CHW +
(blockIdx.z * (KW * KH) + ky * KW + kx);
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = __float2half(0.0f);
} else {
const int offset_src = blockIdx.z * offset_delta;
const int64_t offset_src = blockIdx.z * offset_delta;
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
}
}
@ -8835,7 +8842,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
const struct ggml_tensor * src0_row = dst->src[row_id + 2];