diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f9dab5e50..bfa76f9d2 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2061,6 +2061,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
     bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
 
+    if(ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING)
+    {
+        //kcpp: https://github.com/ggml-org/llama.cpp/pull/14361 broke oldpc mode without this.
+        use_batched_cublas_bf16 = false;
+        use_batched_cublas_f32 = false;
+    }
+
     if (!split && use_mul_mat_vec_f) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
diff --git a/koboldcpp.py b/koboldcpp.py
index 95e7642d0..0fa896e65 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -63,7 +63,7 @@ dry_seq_break_max = 128
 extra_images_max = 4
 
 # global vars
-KcppVersion = "1.97.3"
+KcppVersion = "1.97.4"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""}
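
For context, a minimal standalone sketch of the gating logic in the first hunk, assuming GGML_CUDA_CC_TURING is 750 (its value in ggml's compute-capability numbering) and stubbing out ggml_cuda_highest_compiled_arch(); in ggml both live in ggml-cuda/common.cuh, and the real helper depends on the arch list the binary was compiled with. This is only an illustration of the guard, not the actual ggml implementation.

// sketch.cpp: compile with e.g. g++ -o sketch sketch.cpp
#include <cstdio>

// Assumption: Turing's value in ggml's compute-capability numbering,
// copied here so the sketch is self-contained.
#define GGML_CUDA_CC_TURING 750

// Hypothetical stand-in for ggml's helper, which returns the highest arch
// the binary was compiled for that a device of compute capability cc can run.
static int ggml_cuda_highest_compiled_arch(const int cc) {
    return cc; // a real build would clamp cc to its compiled arch list
}

int main() {
    const int cc = 610; // e.g. a Pascal card, the "oldpc mode" target

    // In ggml_cuda_mul_mat these flags also depend on src0->type and
    // bf16_mma_hardware_available(cc); assume both checks passed here.
    bool use_batched_cublas_bf16 = true;
    bool use_batched_cublas_f32  = true;

    // The fix from the diff: on builds whose highest compiled arch is Turing
    // or older, skip the batched cuBLAS paths added by llama.cpp PR #14361.
    if (ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING) {
        use_batched_cublas_bf16 = false;
        use_batched_cublas_f32  = false;
    }

    std::printf("bf16=%d f32=%d\n", use_batched_cublas_bf16, use_batched_cublas_f32);
    return 0;
}

With both flags forced off, ggml_cuda_mul_mat falls back to the pre-#14361 paths on such builds; the second hunk only bumps KcppVersion to match.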