mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-16 03:49:42 +00:00
Merge branch 'master' into concedo_experimental
# Conflicts: # Makefile # README.md
This commit is contained in:
commit
b814bb217d
3 changed files with 15 additions and 8 deletions
17
ggml-cuda.cu
17
ggml-cuda.cu
|
@ -90,6 +90,13 @@
|
|||
#include <cuda_runtime.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_fp16.h>
|
||||
// CUDA 10.2 does not have these macro definitions.
|
||||
#ifndef CUBLAS_TF32_TENSOR_OP_MATH
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
|
||||
#define CUBLAS_COMPUTE_16F CUDA_R_16F
|
||||
#define CUBLAS_COMPUTE_32F CUDA_R_32F
|
||||
#define cublasComputeType_t cudaDataType_t
|
||||
#endif
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
|
||||
#include "ggml-cuda.h"
|
||||
|
@ -5267,17 +5274,17 @@ static __global__ void im2col_f32_f16(
|
|||
const int ky = (i - kd) / OW;
|
||||
const int ix = i % OW;
|
||||
|
||||
const int iiw = ix * s0 + kx * d0 - p0;
|
||||
const int iih = blockIdx.y * s1 + ky * d1 - p1;
|
||||
const int64_t iiw = ix * s0 + kx * d0 - p0;
|
||||
const int64_t iih = blockIdx.y * s1 + ky * d1 - p1;
|
||||
|
||||
const int offset_dst =
|
||||
const int64_t offset_dst =
|
||||
(blockIdx.y * OW + ix) * CHW +
|
||||
(blockIdx.z * (KW * KH) + ky * KW + kx);
|
||||
|
||||
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
|
||||
dst[offset_dst] = __float2half(0.0f);
|
||||
} else {
|
||||
const int offset_src = blockIdx.z * offset_delta;
|
||||
const int64_t offset_src = blockIdx.z * offset_delta;
|
||||
dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
|
||||
}
|
||||
}
|
||||
|
@ -8835,7 +8842,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
|||
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
|
||||
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
||||
const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
|
||||
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
|
||||
cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
|
||||
|
||||
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
|
||||
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue