Mirror of https://github.com/LostRuins/koboldcpp.git
Synced 2025-09-07 15:49:05 +00:00

Commit b50f94ae27

This commit removes the ggml_cuda_f16 targets.

Merge commit '7a6e91ad26160dd6dfb33d29ac441617422f28e7' into concedo_experimental

# Conflicts:
#   docs/build.md
#   docs/multimodal/MobileVLM.md
#   ggml/CMakeLists.txt
#   ggml/src/ggml-cuda/CMakeLists.txt
#   ggml/src/ggml-musa/CMakeLists.txt

9 changed files with 31 additions and 84 deletions
@@ -37,7 +37,6 @@ endif()
 # 3rd party libs
 option(LLAMA_CUBLAS "llama: use CUDA" ON)
-option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")

@@ -98,9 +97,6 @@ if (LLAMA_CUBLAS)
     add_compile_definitions(GGML_USE_CUDA)
     add_compile_definitions(SD_USE_CUDA)

-    if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-        add_compile_definitions(GGML_CUDA_F16)
-    endif()
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})

     # only build minimal quants required for fattn quant kv

@@ -131,17 +127,13 @@ if (LLAMA_CUBLAS)
     # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
     # 75 == int8 tensor cores
     # 80 == Ampere, asynchronous data loading, faster tensor core instructions
-    if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-        set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # needed for f16 CUDA intrinsics
-        message("CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
-        if(CUDAToolkit_VERSION VERSION_GREATER 12)
-            add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
-            set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
-        else()
+    message("CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
+    if(CUDAToolkit_VERSION VERSION_GREATER 12)
+        add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
+        set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+    else()
-            add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=750) #will cause issues with ggml_cuda_highest_compiled_arch if removed
-            set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
-        endif()
+        add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=750) #will cause issues with ggml_cuda_highest_compiled_arch if removed
+        set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
+    endif()
-    endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

Makefile (7 changes)

@@ -224,13 +224,6 @@ else
   NVCCFLAGS += -arch=native
 endif # LLAMA_PORTABLE

-ifdef LLAMA_CUDA_F16
-  NVCCFLAGS += -DGGML_CUDA_F16
-endif # LLAMA_CUDA_F16
-ifdef LLAMA_CUDA_DMMV_F16
-  NVCCFLAGS += -DGGML_CUDA_F16
-endif # LLAMA_CUDA_DMMV_F16
-
 ifdef LLAMA_CUDA_CCBIN
   NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif

@@ -210,14 +210,6 @@ static const char * cu_get_error_str(CUresult err) {
 #define GGML_CUDA_ASSUME(x)
 #endif // CUDART_VERSION >= 11010

-#ifdef GGML_CUDA_F16
-typedef half dfloat; // dequantize float
-typedef half2 dfloat2;
-#else
-typedef float dfloat; // dequantize float
-typedef float2 dfloat2;
-#endif // GGML_CUDA_F16
-
 #if (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))
 #define GGML_USE_VMM
 #endif // (!defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)) || (defined(GGML_USE_HIP) && !defined(GGML_HIP_NO_VMM))

@@ -563,7 +555,7 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
 #endif // CUDART_VERSION >= 12050
 }

-typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);
+typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);

 static __device__ __forceinline__ float get_alibi_slope(
         const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1

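Note: only the typedef above is taken from the diff; the rest of the snippet below is a minimal sketch of how a caller of the new float2-based dequantize_kernel_t looks after this change. The kernel name toy_dequantize_pair and its parameters are illustrative assumptions, not ggml code.

#include <stdint.h>

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);

template <dequantize_kernel_t dequantize_kernel>
static __global__ void toy_dequantize_pair(const void * vx, float * dst, const int64_t ib, const int iqs, const int y_offset) {
    float2 v;                          // previously dfloat2, i.e. half2 whenever GGML_CUDA_F16 was defined
    dequantize_kernel(vx, ib, iqs, v); // writes two dequantized fp32 values into v
    dst[iqs + 0]        = v.x;         // first value of the pair
    dst[iqs + y_offset] = v.y;         // second value, qk/2 apart when qr > 1
}
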
@@ -27,7 +27,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
     const int64_t y_offset = qr == 1 ? 1 : qk/2;

     // dequantize
-    dfloat2 v;
+    float2 v;
     dequantize_kernel(vx, ib, iqs, v);

     const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;

@@ -42,7 +42,7 @@ static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {

 #pragma unroll
     for (int j = 0; j < QK8_0; j += 2) {
-        dfloat2 dq;
+        float2 dq;
         dequantize_q8_0(cxi, 0, j, dq);
         *(cdstf + j) = dq.x;
         *(cdstf + j + 1) = dq.y;

@@ -55,7 +55,7 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {

 #pragma unroll
     for (int j = 0; j < qk/2; j++) {
-        dfloat2 dq;
+        float2 dq;
         dequant(cxi, 0, j, dq);
         *(cdstf + j) = dq.x;
         *(cdstf + j + qk/2) = dq.y;

@@ -1,48 +1,37 @@
 #include "common.cuh"

-static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
     const block_q4_0 * x = (const block_q4_0 *) vx;

-    const dfloat d = x[ib].d;
+    const float d = x[ib].d;

     const int vui = x[ib].qs[iqs];

     v.x = vui & 0xF;
     v.y = vui >> 4;

-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {8.0f, 8.0f});
-    v = __hmul2(v, {d, d});
-#else
     v.x = (v.x - 8.0f) * d;
     v.y = (v.y - 8.0f) * d;
-#endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, float2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;

-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
+    const float2 dm = __half22float2(x[ib].dm);

     const int vui = x[ib].qs[iqs];

     v.x = vui & 0xF;
     v.y = vui >> 4;

-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
+    v.x = (v.x * dm.x) + dm.y;
+    v.y = (v.y * dm.x) + dm.y;
 }

-static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
     const block_q5_0 * x = (const block_q5_0 *) vx;

-    const dfloat d = x[ib].d;
+    const float d = x[ib].d;

     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));

@@ -53,20 +42,14 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
     v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y = ((x[ib].qs[iqs] >> 4) | xh_1);

-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {16.0f, 16.0f});
-    v = __hmul2(v, {d, d});
-#else
     v.x = (v.x - 16.0f) * d;
     v.y = (v.y - 16.0f) * d;
-#endif // GGML_CUDA_F16
 }

-static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, float2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;

-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
+    const float2 dm = __half22float2(x[ib].dm);

     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));

@@ -77,27 +60,18 @@ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const in
     v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y = ((x[ib].qs[iqs] >> 4) | xh_1);

-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x = (v.x * d) + m;
-    v.y = (v.y * d) + m;
-#endif // GGML_CUDA_F16
+    v.x = (v.x * dm.x) + dm.y;
+    v.y = (v.y * dm.x) + dm.y;
 }

-static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, float2 & v){
     const block_q8_0 * x = (const block_q8_0 *) vx;

-    const dfloat d = x[ib].d;
+    const float d = x[ib].d;

     v.x = x[ib].qs[iqs + 0];
     v.y = x[ib].qs[iqs + 1];

-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-#else
     v.x *= d;
     v.y *= d;
-#endif // GGML_CUDA_F16
 }

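For reference, the arithmetic the Q4_1/Q5_1 branches now share can be written out in plain fp32 as below. This is a hedged sketch only: toy_block_q4_1 and toy_dequantize_q4_1 are made-up names, and the struct layout is merely illustrative rather than the real block_q4_1 definition from ggml.

#include <cuda_fp16.h>
#include <stdint.h>

struct toy_block_q4_1 {
    half2   dm;      // packed scale d and offset m
    uint8_t qs[16];  // 4-bit quants, two per byte (size chosen for illustration)
};

static __device__ __forceinline__ void toy_dequantize_q4_1(const toy_block_q4_1 * x, const int64_t ib, const int iqs, float2 & v) {
    const float2 dm  = __half22float2(x[ib].dm); // one conversion replaces the old __low2half/__high2half pair
    const int    vui = x[ib].qs[iqs];

    v.x = (vui & 0xF) * dm.x + dm.y; // low nibble:  q * d + m
    v.y = (vui >>  4) * dm.x + dm.y; // high nibble: q * d + m
}
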
@@ -32,7 +32,7 @@ static __global__ void k_get_rows(
     const int y_offset = qr == 1 ? 1 : qk/2;

     // dequantize
-    dfloat2 v;
+    float2 v;
     dequantize_kernel(src0_row, ib, iqs, v);

     dst_row[iybs + iqs + 0] = ggml_cuda_cast<dst_t>(v.x);

@@ -3686,10 +3686,6 @@ static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t
     features.push_back({ "NO_PEER_COPY", "1" });
 #endif

-#ifdef GGML_CUDA_F16
-    features.push_back({ "F16", "1" });
-#endif
-
 #ifdef GGML_CUDA_USE_GRAPHS
     features.push_back({ "USE_GRAPHS", "1" });
 #endif

@@ -87,7 +87,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
     }

-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
     const float2 tmp = __half22float2(__hmul2(dm4, ds8));
     const float d4d8 = tmp.x;
     const float m4s8 = tmp.y;

@@ -96,7 +96,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     const float2 ds8f = __half22float2(ds8);
     const float d4d8 = dm4f.x * ds8f.x;
     const float m4s8 = dm4f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE

     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));

@@ -158,7 +158,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
         sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
     }

-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
     const float2 tmp = __half22float2(__hmul2(dm5, ds8));
     const float d5d8 = tmp.x;
     const float m5s8 = tmp.y;

@@ -167,7 +167,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     const float2 ds8f = __half22float2(ds8);
     const float d5d8 = dm5f.x * ds8f.x;
     const float m5s8 = dm5f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE

     // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

@@ -201,7 +201,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
         sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
     }

-#ifdef GGML_CUDA_F16
+#ifdef FAST_FP16_AVAILABLE
     const float2 tmp = __half22float2(__hmul2(dm8, ds8));
     const float d8d8 = tmp.x;
     const float m8s8 = tmp.y;

@@ -210,7 +210,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     const float2 ds8f = __half22float2(ds8);
     const float d8d8 = dm8f.x * ds8f.x;
     const float m8s8 = dm8f.y * ds8f.y;
-#endif // GGML_CUDA_F16
+#endif // FAST_FP16_AVAILABLE

     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);

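To make the guard swap in these hunks concrete, here is a minimal sketch of the two code paths now selected by FAST_FP16_AVAILABLE instead of the removed GGML_CUDA_F16 define. toy_combine_scales is a hypothetical stand-in for the inlined code in the vec_dot_*_imp helpers, not a function from the codebase.

#include <cuda_fp16.h>

// Folds the two packed per-block scale factors into a pair of fp32 products.
static __device__ __forceinline__ float2 toy_combine_scales(const half2 dm, const half2 ds) {
#ifdef FAST_FP16_AVAILABLE
    // fast fp16 path: one packed half2 multiply, then a single conversion to fp32
    return __half22float2(__hmul2(dm, ds));
#else
    // fallback path: convert first, then two scalar fp32 multiplies
    const float2 dmf = __half22float2(dm);
    const float2 dsf = __half22float2(ds);
    return make_float2(dmf.x * dsf.x, dmf.y * dsf.y);
#endif // FAST_FP16_AVAILABLE
}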