moved volta onto tile kernels, so building for cc7.0 can be avoided

this shouldn't do anything (+2 squashed commits)

Squashed commit:

[1cdcb302a] another attempt to tip the scales, part 2

[8f647b709] another attempt to tip the scales (volta)
This commit is contained in:
Concedo 2025-12-07 18:20:47 +08:00
parent 40d3d830a1
commit cd73613136
4 changed files with 8 additions and 10 deletions

View file

@ -139,10 +139,10 @@ if (LLAMA_CUBLAS)
elseif(CUDAToolkit_VERSION VERSION_GREATER 12)
add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=800)
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
else()
add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=750) #will cause issues with ggml_cuda_highest_compiled_arch if removed
set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

View file

@ -226,7 +226,6 @@ NVCCFLAGS += -Wno-deprecated-gpu-targets \
-gencode arch=compute_35,code=compute_35 \
-gencode arch=compute_50,code=compute_50 \
-gencode arch=compute_61,code=compute_61 \
-gencode arch=compute_70,code=compute_70 \
-gencode arch=compute_75,code=compute_75 \
-DKCPP_LIMIT_CUDA_MAX_ARCH=750
@ -234,7 +233,6 @@ else ifdef LLAMA_ARCHES_CU12
NVCCFLAGS += -Wno-deprecated-gpu-targets \
-gencode arch=compute_50,code=compute_50 \
-gencode arch=compute_61,code=compute_61 \
-gencode arch=compute_70,code=compute_70 \
-gencode arch=compute_75,code=compute_75 \
-gencode arch=compute_80,code=compute_80 \
-DKCPP_LIMIT_CUDA_MAX_ARCH=800

View file

@ -223,7 +223,7 @@ static const char * cu_get_error_str(CUresult err) {
#define FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ > GGML_CUDA_CC_VOLTA
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
@ -263,7 +263,7 @@ static bool fp16_available(const int cc) {
static bool fast_fp16_available(const int cc) {
return GGML_CUDA_CC_IS_AMD(cc) ||
(GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc > 610) ||
(GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc > GGML_CUDA_CC_VOLTA) ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && fp16_available(cc));
}

View file

@ -89,10 +89,10 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
}
static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_volta(const int DKQ, const int DV, const int ncols) {
// GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 8, 64, 4, 32, 288, 256, 64, 1, false);
// GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 64, 4, 32, 288, 256, 64, 1, false);
// GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2, 32, 160, 128, 64, 1, false);
// GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1, 32, 160, 128, 64, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 8, 64, 4, 32, 288, 256, 64, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 64, 4, 32, 288, 256, 64, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2, 32, 160, 128, 64, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1, 32, 160, 128, 64, 1, false);
// TODO tune specifically for Volta
return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);