moved volta onto tile kernels, so building for cc7.0 can be avoided

this shouldn't do anything (+2 squashed commits)

Squashed commit:

[1cdcb302a] another attempt to tip the scales, part 2

[8f647b709] another attempt to tip the scales (volta)
This commit is contained in:
Concedo 2025-12-07 18:20:47 +08:00
parent 40d3d830a1
commit cd73613136
4 changed files with 8 additions and 10 deletions

View file

@ -139,10 +139,10 @@ if (LLAMA_CUBLAS)
elseif(CUDAToolkit_VERSION VERSION_GREATER 12)
add_compile_definitions(GGML_CUDA_USE_GRAPHS) #try enable cuda graphs on cu12 build
add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=800)
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;75-virtual;80-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
else()
add_compile_definitions(KCPP_LIMIT_CUDA_MAX_ARCH=750) #will cause issues with ggml_cuda_highest_compiled_arch if removed
set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;70-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
set(CMAKE_CUDA_ARCHITECTURES "35-virtual;50-virtual;61-virtual;75-virtual") # lowest CUDA 12 standard + lowest for integer intrinsics
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

View file

@ -226,7 +226,6 @@ NVCCFLAGS += -Wno-deprecated-gpu-targets \
-gencode arch=compute_35,code=compute_35 \
-gencode arch=compute_50,code=compute_50 \
-gencode arch=compute_61,code=compute_61 \
-gencode arch=compute_70,code=compute_70 \
-gencode arch=compute_75,code=compute_75 \
-DKCPP_LIMIT_CUDA_MAX_ARCH=750
@ -234,7 +233,6 @@ else ifdef LLAMA_ARCHES_CU12
NVCCFLAGS += -Wno-deprecated-gpu-targets \
-gencode arch=compute_50,code=compute_50 \
-gencode arch=compute_61,code=compute_61 \
-gencode arch=compute_70,code=compute_70 \
-gencode arch=compute_75,code=compute_75 \
-gencode arch=compute_80,code=compute_80 \
-DKCPP_LIMIT_CUDA_MAX_ARCH=800

View file

@ -223,7 +223,7 @@ static const char * cu_get_error_str(CUresult err) {
#define FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ > GGML_CUDA_CC_VOLTA
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
@ -263,7 +263,7 @@ static bool fp16_available(const int cc) {
static bool fast_fp16_available(const int cc) {
return GGML_CUDA_CC_IS_AMD(cc) ||
(GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc > 610) ||
(GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc > GGML_CUDA_CC_VOLTA) ||
(GGML_CUDA_CC_IS_MTHREADS(cc) && fp16_available(cc));
}

View file

@ -89,10 +89,10 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
}
static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_config_volta(const int DKQ, const int DV, const int ncols) {
// GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 8, 64, 4, 32, 288, 256, 64, 1, false);
// GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 64, 4, 32, 288, 256, 64, 1, false);
// GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2, 32, 160, 128, 64, 1, false);
// GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1, 32, 160, 128, 64, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 8, 64, 4, 32, 288, 256, 64, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 16, 64, 4, 32, 288, 256, 64, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 32, 128, 2, 32, 160, 128, 64, 1, false);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(576, 512, 64, 256, 1, 32, 160, 128, 64, 1, false);
// TODO tune specifically for Volta
return ggml_cuda_fattn_mma_get_config_ampere(DKQ, DV, ncols);