Mirror of https://github.com/LostRuins/koboldcpp.git, synced 2025-09-10 17:14:36 +00:00
cuda use wmma flash attention for turing (+1 squashed commits)
Squashed commits:
[3c5112398] 117 (+10 squashed commit)
Squashed commit:
[4f01bb2d4] 117 graphs 80v
[7549034ea] 117 graphs
[dabf9cb99] checking if cuda 11.5.2 works
[ba7ccdb7a] another try cu11.7 only
[752cf2ae5] increase aria2c download log rate
[dc4f198fd] test send turing to wmma flash attention
[496a22e83] temp build test cu11.7.0
[ca759c424] temp build test cu11.7
[c46ada17c] test build: enable virtual80 for oldcpu
[3ccfd939a] test build: with cuda graphs for all
This commit is contained in:
parent b08dca65ed
commit f3bb947a13

5 changed files with 5 additions and 6 deletions
@@ -57,7 +57,6 @@ jobs:
         id: make_build
         run: |
           make LLAMA_CLBLAST=1 LLAMA_VULKAN=1 LLAMA_PORTABLE=1 -j ${env:NUMBER_OF_PROCESSORS}
-          echo "Vulkan Shaders Rebuilt"
 
       - uses: Jimver/cuda-toolkit@v0.2.15
         id: cuda-toolkit
@@ -7,7 +7,7 @@ message(STATUS "It is ONLY for CUBLAS builds on windows visual studio. IT WILL O
 message(STATUS "IF YOU ARE SEEING THIS, you MUST ONLY be building CUBLAS BUILDS! NOTHING ELSE WILL BE SUPPORTED !!!")
 message(STATUS "============== ============== ==============")
 
-cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
+cmake_minimum_required(VERSION 3.18) # Don't bump this version for no reason
 project("llama.cpp" C CXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -56,7 +56,7 @@ static __global__ void flash_attn_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#if defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
+#if defined(FLASH_ATTN_AVAILABLE) && ((__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || __CUDA_ARCH__ == GGML_CUDA_CC_TURING) || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
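For orientation, a minimal standalone C++ sketch of what the widened architecture guard amounts to: after this change the f16 WMMA kernel body is compiled for Volta and Turing, while other architectures keep the NO_DEVICE_CODE stub. The constant values and the helper below are assumptions for illustration, not code taken from the repository.

// Hypothetical sketch of the (CUDA-only) part of the #if guard above.
// Assumed ggml-style compute-capability values: Volta = 700, Turing = 750.
#include <cstdio>

constexpr int GGML_CUDA_CC_VOLTA  = 700;
constexpr int GGML_CUDA_CC_TURING = 750;

// Returns true when a real WMMA f16 kernel body would be compiled for this arch.
static bool wmma_f16_body_compiled(int cuda_arch) {
    return cuda_arch == GGML_CUDA_CC_VOLTA || cuda_arch == GGML_CUDA_CC_TURING;
}

int main() {
    for (int arch : {610, 700, 750, 800}) {
        printf("__CUDA_ARCH__=%d -> %s\n", arch,
               wmma_f16_body_compiled(arch) ? "real kernel body" : "NO_DEVICE_CODE stub");
    }
    return 0;
}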
@@ -337,7 +337,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }
 
     // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
-    if (fp16_mma_available(cc) && !new_mma_available(cc)) {
+    if (cc == GGML_CUDA_CC_TURING || (fp16_mma_available(cc) && !new_mma_available(cc))) { //kcpp: turing use wmma to fix cu11 incoherence
         ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
         return;
     }
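For reference, a simplified, hypothetical sketch of the routing decision changed above: a Turing card (cc == GGML_CUDA_CC_TURING) is now sent to the old WMMA flash-attention path even though the newer MMA kernels are available for it. The feature-check stand-ins, constant values, and the helper name are assumptions for this sketch; the real dispatcher in ggml_cuda_flash_attn_ext handles many more cases.

// Hypothetical, simplified model of the WMMA-vs-MMA routing in fattn.cu.
#include <cstdio>

// Assumed ggml-style compute-capability values.
constexpr int GGML_CUDA_CC_VOLTA  = 700;
constexpr int GGML_CUDA_CC_TURING = 750;

// Stand-ins for ggml's runtime feature checks, reduced to compute capability only.
static bool fp16_mma_available(int cc) { return cc >= GGML_CUDA_CC_VOLTA; }
static bool new_mma_available(int cc)  { return cc >= GGML_CUDA_CC_TURING; }

// Returns true when flash attention should take the old WMMA path.
static bool use_wmma_fattn(int cc) {
    // kcpp change: Turing goes to WMMA unconditionally to avoid the
    // CUDA 11 incoherence seen with the MMA path on that arch.
    return cc == GGML_CUDA_CC_TURING || (fp16_mma_available(cc) && !new_mma_available(cc));
}

int main() {
    for (int cc : {610, 700, 750, 860}) {
        printf("cc=%d -> %s\n", cc, use_wmma_fattn(cc) ? "WMMA" : "MMA/other");
    }
    return 0;
}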
@@ -47,7 +47,7 @@ net_save_slots = 10
 
 # abuse prevention
 stop_token_max = 256
-ban_token_max = 512
+ban_token_max = 768
 logit_bias_max = 512
 dry_seq_break_max = 128
 
@@ -5912,7 +5912,7 @@ def downloader_internal(input_url, output_filename, capture_output, min_file_siz
         a2cexe = (os.path.join(basepath, "aria2c-win.exe"))
         if os.path.exists(a2cexe): #on windows try using embedded a2cexe
             rc = subprocess.run([
-                a2cexe, "-x", "16", "-s", "16", "--summary-interval=30", "--console-log-level=error", "--log-level=error",
+                a2cexe, "-x", "16", "-s", "16", "--summary-interval=20", "--console-log-level=error", "--log-level=error",
                 "--download-result=default", "--allow-overwrite=true", "--file-allocation=none", "--max-tries=3", "-o", output_filename, input_url
             ], capture_output=capture_output, text=True, check=True, encoding='utf-8')
             dl_success = (rc.returncode == 0 and os.path.exists(output_filename) and os.path.getsize(output_filename) > min_file_size)