diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 4928f3902..7e61ffa17 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -306,7 +306,10 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const //kcpp: use wmma to fix cu11 incoherence if (ggml_cuda_should_use_wmma_fattn(cc) && (ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING || cc == GGML_CUDA_CC_TURING)) { - return BEST_FATTN_KERNEL_WMMA_F16; + if(Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 576) //kcpp: these sizes not supported in wmma + { + return BEST_FATTN_KERNEL_WMMA_F16; + } } return BEST_FATTN_KERNEL_MMA_F16; @@ -330,7 +333,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const } } //kcpp: patch from previous version for my sanity. it worked before, idk it should work now. - if (Q->ne[1] <= 8 || Q->ne[0] == 256) { + if ((Q->ne[1] <= 8 || Q->ne[0] == 256) && can_use_vector_kernel) { return BEST_FATTN_KERNEL_VEC; }