cuda use wmma flash attention for turing (+1 squashed commit)

Squashed commits:

[3c5112398] 117 (+10 squashed commits)

Squashed commits:

[4f01bb2d4] 117 graphs 80v

[7549034ea] 117 graphs

[dabf9cb99] checking if cuda 11.5.2 works

[ba7ccdb7a] another try cu11.7 only

[752cf2ae5] increase aria2c download log rate

[dc4f198fd] test send turing to wmma flash attention

[496a22e83] temp build test cu11.7.0

[ca759c424] temp build test cu11.7

[c46ada17c] test build: enable virtual80 for oldcpu

[3ccfd939a] test build: with cuda graphs for all
Concedo 2025-05-31 18:08:50 +08:00
parent b08dca65ed
commit f3bb947a13
5 changed files with 5 additions and 6 deletions


@@ -57,7 +57,6 @@ jobs:
         id: make_build
         run: |
           make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS}
-          echo "Vulkan Shaders Rebuilt"
       - uses: Jimver/cuda-toolkit@v0.2.15
         id: cuda-toolkit


@@ -7,7 +7,7 @@ message(STATUS "It is ONLY for CUBLAS builds on windows visual studio. IT WILL O
 message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building CUBLAS BUILDS! NOTHING ELSE WILL BE SUPPORTED !!!")
 message(STATUS "============== ============== ==============")
-cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
+cmake_minimum_required(VERSION 3.18) # Don't bump this version for no reason
 project("llama.cpp" C CXX)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)


@@ -56,7 +56,7 @@ static __global__ void flash_attn_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
+#if defined(FLASH_ATTN_AVAILABLE) && ((__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || __CUDA_ARCH__ == GGML_CUDA_CC_TURING) || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
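
For context on the guard above: __CUDA_ARCH__ is only defined during nvcc's device compilation passes, once per target architecture, so a preprocessor check like this compiles the real kernel body only into the binaries for the listed architectures (Volta, and with this patch Turing) and leaves a stub everywhere else. Below is a minimal sketch of the same gating pattern, using illustrative SKETCH_* constants rather than ggml's actual macros:

// Minimal sketch of per-architecture kernel gating (illustrative constants,
// not ggml's real macros). nvcc defines __CUDA_ARCH__ only while compiling
// device code, once per target architecture.
#define SKETCH_CC_VOLTA  700  // compute capability 7.0
#define SKETCH_CC_TURING 750  // compute capability 7.5

__global__ void gated_kernel(float *out) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == SKETCH_CC_VOLTA || __CUDA_ARCH__ == SKETCH_CC_TURING)
    // Real kernel body: emitted only into the Volta and Turing device binaries.
    out[threadIdx.x] = 1.0f;
#else
    // Stub for every other architecture (ggml expands NO_DEVICE_CODE here);
    // skipping unused variants keeps compile time and binary size down.
    (void)out;
#endif
}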


@@ -337,7 +337,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }
     // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
-    if (fp16_mma_available(cc) && !new_mma_available(cc)) {
+    if (cc == GGML_CUDA_CC_TURING || (fp16_mma_available(cc) && !new_mma_available(cc))) { //kcpp: turing use wmma to fix cu11 incoherence
         ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
         return;
     }
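
This host-side dispatch picks a kernel family from the device's compute capability; the patched condition forces Turing onto the older WMMA kernel even though its MMA path exists, to work around the CUDA 11 incoherence noted in the inline comment. The following is a self-contained sketch of that style of dispatch; the availability predicates and the cc encoding (750 for Turing, matching ggml's convention) are assumptions standing in for ggml's helpers:

#include <cuda_runtime.h>
#include <cstdio>

// Hypothetical entry points standing in for the two ggml kernel families.
static void run_wmma_path() { std::puts("using wmma flash attention"); }
static void run_mma_path()  { std::puts("using mma flash attention"); }

int main() {
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) {
        return 1;
    }
    const int cc = prop.major * 100 + prop.minor * 10; // e.g. 750 = Turing

    // Assumptions mirroring upstream ggml: fp16 MMA hardware exists from
    // Volta (7.0) up, the newer MMA instructions from Turing (7.5) up.
    const bool fp16_mma_available = cc >= 700;
    const bool new_mma_available  = cc >= 750;

    // The patched check: send Turing to the old WMMA kernel explicitly, in
    // addition to anything with fp16 MMA but no new MMA path (i.e. Volta).
    if (cc == 750 || (fp16_mma_available && !new_mma_available)) {
        run_wmma_path();
    } else {
        run_mma_path();
    }
    return 0;
}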


@@ -47,7 +47,7 @@ net_save_slots = 10
 # abuse prevention
 stop_token_max = 256
-ban_token_max = 512
+ban_token_max = 768
 logit_bias_max = 512
 dry_seq_break_max = 128
@@ -5912,7 +5912,7 @@ def downloader_internal(input_url, output_filename, capture_output, min_file_siz
     a2cexe = (os.path.join(basepath, "aria2c-win.exe"))
     if os.path.exists(a2cexe): #on windows try using embedded a2cexe
         rc = subprocess.run([
-            a2cexe, "-x", "16", "-s", "16", "--summary-interval=30", "--console-log-level=error", "--log-level=error",
+            a2cexe, "-x", "16", "-s", "16", "--summary-interval=20", "--console-log-level=error", "--log-level=error",
             "--download-result=default", "--allow-overwrite=true", "--file-allocation=none", "--max-tries=3", "-o", output_filename, input_url
         ], capture_output=capture_output, text=True, check=True, encoding='utf-8')
         dl_success = (rc.returncode == 0 and os.path.exists(output_filename) and os.path.getsize(output_filename) > min_file_size)
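
The downloader shells out to aria2c and then sanity-checks the exit code and resulting file size; aria2c's --summary-interval is the number of seconds between progress summaries, so lowering it from 30 to 20 makes progress print more often (the "increase aria2c download log rate" sub-commit). For illustration only, the same invoke-and-verify pattern as a C++ sketch with a hypothetical helper; the real implementation is the subprocess.run call above:

// Sketch of "run aria2c, then verify the result" (hypothetical helper,
// illustrative paths). Same flags as the patched call: 16 connections,
// a summary every 20 seconds, quiet logging, overwrite allowed, 3 retries.
#include <cstdint>
#include <cstdlib>
#include <filesystem>
#include <string>

namespace fs = std::filesystem;

static bool download_with_aria2c(const std::string &url,
                                 const std::string &out,
                                 std::uintmax_t min_file_size) {
    const std::string cmd =
        "aria2c -x 16 -s 16 --summary-interval=20 --console-log-level=error "
        "--log-level=error --download-result=default --allow-overwrite=true "
        "--file-allocation=none --max-tries=3 -o " + out + " " + url;
    // std::system goes through a shell, so unlike the Python argv-list form
    // this naive string build would need quoting for arbitrary filenames.
    const int rc = std::system(cmd.c_str());
    // Approximate success check, mirroring dl_success above: clean exit and
    // a downloaded file that is plausibly large enough.
    return rc == 0 && fs::exists(out) && fs::file_size(out) > min_file_size;
}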