From f3bb947a13e5619e8def268f4bcb24f426565b83 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 31 May 2025 18:08:50 +0800
Subject: [PATCH] cuda use wmma flash attention for turing (+1 squashed
 commits)

Squashed commits:

[3c5112398] 117 (+10 squashed commit)

Squashed commit:

[4f01bb2d4] 117 graphs 80v
[7549034ea] 117 graphs
[dabf9cb99] checking if cuda 11.5.2 works
[ba7ccdb7a] another try cu11.7 only
[752cf2ae5] increase aria2c download log rate
[dc4f198fd] test send turing to wmma flash attention
[496a22e83] temp build test cu11.7.0
[ca759c424] temp build test cu11.7
[c46ada17c] test build: enable virtual80 for oldcpu
[3ccfd939a] test build: with cuda graphs for all
---
 .github/workflows/kcpp-build-release-win-full-cu12.yaml | 1 -
 CMakeLists.txt                                          | 2 +-
 ggml/src/ggml-cuda/fattn-wmma-f16.cu                    | 2 +-
 ggml/src/ggml-cuda/fattn.cu                             | 2 +-
 koboldcpp.py                                            | 4 ++--
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/kcpp-build-release-win-full-cu12.yaml b/.github/workflows/kcpp-build-release-win-full-cu12.yaml
index 0f74ae9da..1490996af 100644
--- a/.github/workflows/kcpp-build-release-win-full-cu12.yaml
+++ b/.github/workflows/kcpp-build-release-win-full-cu12.yaml
@@ -57,7 +57,6 @@ jobs:
         id: make_build
         run: |
           make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS}
-          echo "Vulkan Shaders Rebuilt"

       - uses: Jimver/cuda-toolkit@v0.2.15
         id: cuda-toolkit
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 80cd79edc..4f9e3f031 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ message(STATUS "It is ONLY for CUBLAS builds on windows visual studio. IT WILL O
 message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building CUBLAS BUILDS! NOTHING ELSE WILL BE SUPPORTED !!!")
 message(STATUS "============== ============== ==============")

-cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
+cmake_minimum_required(VERSION 3.18) # Don't bump this version for no reason
 project("llama.cpp" C CXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
index c5668adb1..8cf649d31 100644
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -56,7 +56,7 @@ static __global__ void flash_attn_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
+#if defined(FLASH_ATTN_AVAILABLE) && ((__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || __CUDA_ARCH__ == GGML_CUDA_CC_TURING) || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 6bc0096cc..8d1f1a8f6 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -337,7 +337,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }

     // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
-    if (fp16_mma_available(cc) && !new_mma_available(cc)) {
+    if (cc == GGML_CUDA_CC_TURING || (fp16_mma_available(cc) && !new_mma_available(cc))) { //kcpp: turing use wmma to fix cu11 incoherence
         ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
         return;
     }
diff --git a/koboldcpp.py b/koboldcpp.py
index 8ba267ae6..a44e79709 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -47,7 +47,7 @@ net_save_slots = 10

 # abuse prevention
 stop_token_max = 256
-ban_token_max = 512
+ban_token_max = 768
 logit_bias_max = 512
 dry_seq_break_max = 128

@@ -5912,7 +5912,7 @@ def downloader_internal(input_url, output_filename, capture_output, min_file_siz
     a2cexe = (os.path.join(basepath, "aria2c-win.exe"))
     if os.path.exists(a2cexe): #on windows try using embedded a2cexe
         rc = subprocess.run([
-        a2cexe, "-x", "16", "-s", "16", "--summary-interval=30", "--console-log-level=error", "--log-level=error",
+        a2cexe, "-x", "16", "-s", "16", "--summary-interval=20", "--console-log-level=error", "--log-level=error",
         "--download-result=default", "--allow-overwrite=true", "--file-allocation=none", "--max-tries=3",
         "-o", output_filename, input_url
         ], capture_output=capture_output, text=True, check=True, encoding='utf-8')
         dl_success = (rc.returncode == 0 and os.path.exists(output_filename) and os.path.getsize(output_filename) > min_file_size)
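
Note on the dispatch change: ggml encodes compute capability as major*100 +
minor*10, so GGML_CUDA_CC_VOLTA is 700 and GGML_CUDA_CC_TURING is 750. Upstream
falls back to the old WMMA flash-attention kernel only when FP16 MMA is
available but the new MMA path is not (in practice, Volta); this patch
additionally forces Turing onto the WMMA kernel to work around the CUDA 11
output incoherence noted in the diff comment. A minimal standalone sketch of
this style of compute-capability dispatch follows; the stub kernels and the
availability helpers are illustrative assumptions modeled on ggml's, not the
actual implementation.

    // dispatch_sketch.cu -- hypothetical sketch, not ggml source.
    // Mirrors the fattn.cu condition above: Turing is explicitly routed to
    // WMMA even though its new-MMA availability check would otherwise pass.
    #include <cstdio>
    #include <cuda_runtime.h>

    constexpr int CC_VOLTA  = 700; // ggml convention: major*100 + minor*10
    constexpr int CC_TURING = 750;

    // Stand-ins for ggml's fp16_mma_available()/new_mma_available() checks.
    static bool fp16_mma_available(int cc) { return cc >= CC_VOLTA;  }
    static bool new_mma_available (int cc) { return cc >= CC_TURING; }

    static void flash_attn_wmma() { std::puts("dispatch: old WMMA kernel"); }
    static void flash_attn_mma () { std::puts("dispatch: new MMA kernel");  }

    int main() {
        int dev = 0, major = 0, minor = 0;
        cudaGetDevice(&dev);
        cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev);
        cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev);
        const int cc = major * 100 + minor * 10;

        // Patched rule: Turing takes the WMMA path unconditionally; everything
        // else keeps the upstream "WMMA only where new MMA is missing" rule.
        if (cc == CC_TURING || (fp16_mma_available(cc) && !new_mma_available(cc))) {
            flash_attn_wmma();
        } else {
            flash_attn_mma();
        }
        return 0;
    }

Compiled with nvcc, the sketch prints the WMMA branch on a Turing card (cc 750)
even though new_mma_available(750) is true, which is exactly the override this
patch introduces; Ampere and newer still take the MMA branch.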