From 487d509b44ea2df38eaa3a07a6a11d8a58394ae4 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 10 Aug 2025 00:09:16 +0800
Subject: [PATCH] try fix oldpc cuda broken without flash attn since upstream
 pr14361 between 1.94 and 1.95 (+1 squashed commits)

Squashed commits:

[940f0c639] try fix oldpc cuda broken without flash attn since upstream
pr14361 between 1.94 and 1.95
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 7 +++++++
 koboldcpp.py                    | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f9dab5e50..bfa76f9d2 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2061,6 +2061,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc);
     bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32;
 
+    if(ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING)
+    {
+        //kcpp: https://github.com/ggml-org/llama.cpp/pull/14361 broke oldpc mode without this.
+        use_batched_cublas_bf16 = false;
+        use_batched_cublas_f32 = false;
+    }
+
     if (!split && use_mul_mat_vec_f) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
diff --git a/koboldcpp.py b/koboldcpp.py
index 95e7642d0..0fa896e65 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -63,7 +63,7 @@ dry_seq_break_max = 128
 extra_images_max = 4
 
 # global vars
-KcppVersion = "1.97.3"
+KcppVersion = "1.97.4"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""}