From 5f38594dc0cae8ed12eb5211c0130a7376987bda Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Fri, 6 Jun 2025 14:08:57 +0800 Subject: [PATCH] remove debug prints --- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 1 - ggml/src/ggml-cuda/fattn.cu | 9 ++------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 22f9b9c5b..e230f6d49 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1246,7 +1246,6 @@ static __global__ void flash_attn_ext_f16( } #if __CUDA_ARCH__ == GGML_CUDA_CC_TURING if (ncols1*ncols2 > 32) { - printf("\nCUDA_ARCH:%d, DKQ:%d, DV:%d, ncols1:%d, ncols2:%d, nwarps:%d, ntiles:%d, ne00:%d, ne01:%d\n",__CUDA_ARCH__,DKQ, DV, ncols1, ncols2, nwarps, ntiles,ne00,ne01); NO_DEVICE_CODE; return; } diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 22834c03b..862eaabd7 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -15,26 +15,21 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_con if constexpr (ncols2 <= 8) { if (Q->ne[1] <= 8/ncols2) { - printf("\nCase B: %d %d %d %d %d\n",DKQ,DV,8/ncols2,ncols2,Q->ne[1]); ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); return; } } if (Q->ne[1] <= 16/ncols2) { - printf("\nCase C: %d %d %d %d %d\n",DKQ,DV,16/ncols2,ncols2,Q->ne[1]); ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); return; } if (ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING || Q->ne[1] <= 32/ncols2) { - printf("\nCase D: %d %d %d %d %d\n",DKQ,DV,32/ncols2,ncols2,Q->ne[1]); ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); return; } - printf("\nDBG: %d %d %d\n",ggml_cuda_highest_compiled_arch(cc),cc,GGML_CUDA_CC_TURING); - printf("\nCase E: %d %d %d %d %d\n",DKQ,DV,64/ncols2,ncols2,Q->ne[1]); ggml_cuda_flash_attn_ext_mma_f16_case(ctx, dst); } @@ -52,7 +47,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con GGML_ASSERT(Q->ne[2] % K->ne[2] == 0); const int gqa_ratio = Q->ne[2] / K->ne[2]; - printf("\ngqa_ratio is %d\n",gqa_ratio); + if (use_gqa_opt && gqa_ratio % 8 == 0) { ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ctx, dst); return; @@ -77,7 +72,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg const ggml_tensor * K = dst->src[1]; const ggml_tensor * V = dst->src[2]; const ggml_tensor * mask = dst->src[3]; - printf("\nQ->ne[0] is %d\n",Q->ne[0]); + switch (Q->ne[0]) { case 64: GGML_ASSERT(V->ne[0] == 64);