cuda use wmma flash attention for turing (+1 squashed commit)

Squashed commits:

[3c5112398] 117 (+10 squashed commits)

Squashed commits:

[4f01bb2d4] 117 graphs 80v

[7549034ea] 117 graphs

[dabf9cb99] checking if cuda 11.5.2 works

[ba7ccdb7a] another try cu11.7 only

[752cf2ae5] increase aria2c download log rate

[dc4f198fd] test send turing to wmma flash attention

[496a22e83] temp build test cu11.7.0

[ca759c424] temp build test cu11.7

[c46ada17c] test build: enable virtual80 for oldcpu

[3ccfd939a] test build: with cuda graphs for all
Concedo 2025-05-31 18:08:50 +08:00
parent b08dca65ed
commit f3bb947a13
5 changed files with 5 additions and 6 deletions


@@ -47,7 +47,7 @@ net_save_slots = 10
 # abuse prevention
 stop_token_max = 256
-ban_token_max = 512
+ban_token_max = 768
 logit_bias_max = 512
 dry_seq_break_max = 128
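
For context, ban_token_max is one of the abuse-prevention caps in this config block; the hunk above raises it from 512 to 768. A hypothetical illustration of how such a cap is typically applied (this is not the project's actual enforcement code, and clamp_banned_tokens is a made-up name):

    # Hypothetical sketch: clamp a request's banned-token list to the
    # configured cap so one request cannot ban an unbounded number of tokens.
    ban_token_max = 768  # raised from 512 in this commit

    def clamp_banned_tokens(requested_bans):
        # keep at most ban_token_max entries; drop the rest
        return requested_bans[:ban_token_max]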
@@ -5912,7 +5912,7 @@ def downloader_internal(input_url, output_filename, capture_output, min_file_siz
 a2cexe = (os.path.join(basepath, "aria2c-win.exe"))
 if os.path.exists(a2cexe): #on windows try using embedded a2cexe
     rc = subprocess.run([
-        a2cexe, "-x", "16", "-s", "16", "--summary-interval=30", "--console-log-level=error", "--log-level=error",
+        a2cexe, "-x", "16", "-s", "16", "--summary-interval=20", "--console-log-level=error", "--log-level=error",
         "--download-result=default", "--allow-overwrite=true", "--file-allocation=none", "--max-tries=3", "-o", output_filename, input_url
     ], capture_output=capture_output, text=True, check=True, encoding='utf-8')
     dl_success = (rc.returncode == 0 and os.path.exists(output_filename) and os.path.getsize(output_filename) > min_file_size)
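
The only change in this hunk is --summary-interval dropping from 30 to 20 seconds, so aria2c prints its progress summary more often (the "increase aria2c download log rate" commit above). A minimal standalone sketch of the same invocation, outside the project's downloader_internal; the wrapper name fetch_with_aria2c and its defaults are made up for illustration:

    import os
    import subprocess

    def fetch_with_aria2c(a2cexe, input_url, output_filename, min_file_size=0):
        # Mirror of the call in the diff: 16 connections/splits, errors-only
        # logging, and a progress summary every 20 seconds (was 30).
        rc = subprocess.run([
            a2cexe, "-x", "16", "-s", "16", "--summary-interval=20",
            "--console-log-level=error", "--log-level=error",
            "--download-result=default", "--allow-overwrite=true",
            "--file-allocation=none", "--max-tries=3",
            "-o", output_filename, input_url,
        ], text=True, check=True, encoding='utf-8')
        # Same success check as the original: exit code 0 and a file larger
        # than the minimum expected size.
        return (rc.returncode == 0
                and os.path.exists(output_filename)
                and os.path.getsize(output_filename) > min_file_size)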