From 1a360b8458bb0f3efa84064ea1b2bb387eb5779c Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 12 Oct 2025 17:34:27 +0800
Subject: [PATCH] sdcpp: optimize the handling of the FeedForward precision fix
 (+1 squashed commits)

Squashed commits:

[621ff6392] sdcpp: optimize the handling of the FeedForward precision fix (+1 squashed commits)

Squashed commits:

[05b16906c] sdcpp: optimize the handling of the FeedForward precision fix
---
 koboldcpp.py                    |  2 +-
 otherarch/sdcpp/common.hpp      | 22 +++++++++++-----------
 otherarch/sdcpp/ggml_extend.hpp | 18 ++++++++++++++----
 otherarch/sdcpp/qwen_image.hpp  |  5 ++++-
 4 files changed, 30 insertions(+), 17 deletions(-)
diff --git a/koboldcpp.py b/koboldcpp.py
index 0714dc41f..1e6c946b3 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -64,7 +64,7 @@ dry_seq_break_max = 128
 extra_images_max = 4
 
 # global vars
-KcppVersion = "1.100"
+KcppVersion = "1.100.1"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""}
diff --git a/otherarch/sdcpp/common.hpp b/otherarch/sdcpp/common.hpp
index 3ae599819..82c44ce41 100644
--- a/otherarch/sdcpp/common.hpp
+++ b/otherarch/sdcpp/common.hpp
@@ -243,9 +243,8 @@ public:
                 int64_t dim_out,
                 int64_t mult          = 4,
                 Activation activation = Activation::GEGLU,
-                bool force_prec_f32 = false) {
+                bool precision_fix    = false) {
         int64_t inner_dim = dim * mult;
-        SD_UNUSED(force_prec_f32);
         if (activation == Activation::GELU) {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
         } else {
@@ -253,7 +252,14 @@ public:
         }
 
         // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
+        float scale = 1.f;
+        if (precision_fix) {
+            scale = 1.f / 128.f;
+        }
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -264,13 +270,7 @@ public:
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
 
         x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
-        // The purpose of the scale here is to prevent NaN issues in certain situations.
-        // For example, when using Vulkan without enabling force_prec_f32,
-        // or when using CUDA but the weights are k-quants.
-        float scale = 1.f / 128.f;
-        x           = ggml_scale(ctx, x, scale);
-        x           = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
-        x           = ggml_scale(ctx, x, 1.f / scale);
+        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
         return x;
     }
 };
@@ -563,4 +563,4 @@ public:
     }
 };
 
-#endif  // __COMMON_HPP__
+#endif  // __COMMON_HPP__
\ No newline at end of file
diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp
index d34563708..bb964b299 100644
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@@ -953,11 +953,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
                                                      struct ggml_tensor* x,
                                                      struct ggml_tensor* w,
                                                      struct ggml_tensor* b,
-                                                     bool force_prec_f32 = false) {
+                                                     bool force_prec_f32 = false,
+                                                     float scale         = 1.f) {
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, scale);
+    }
     x = ggml_mul_mat(ctx, w, x);
     if (force_prec_f32) {
         ggml_mul_mat_set_prec(x, GGML_PREC_F32);
     }
+    if (scale != 1.f) {
+        x = ggml_scale(ctx, x, 1.f / scale);
+    }
     if (b != NULL) {
         x = ggml_add_inplace(ctx, x, b);
     }
@@ -1971,6 +1978,7 @@ protected:
     bool bias;
     bool force_f32;
     bool force_prec_f32;
+    float scale;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
@@ -1989,12 +1997,14 @@ public:
            int64_t out_features,
            bool bias           = true,
            bool force_f32      = false,
-           bool force_prec_f32 = false)
+           bool force_prec_f32 = false,
+           float scale         = 1.f)
         : in_features(in_features),
           out_features(out_features),
           bias(bias),
           force_f32(force_f32),
-          force_prec_f32(force_prec_f32) {}
+          force_prec_f32(force_prec_f32),
+          scale(scale) {}
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
@@ -2002,7 +2012,7 @@ public:
         if (bias) {
             b = params["bias"];
         }
-        return ggml_nn_linear(ctx, x, w, b, force_prec_f32);
+        return ggml_nn_linear(ctx, x, w, b, force_prec_f32, scale);
     }
 };
 
diff --git a/otherarch/sdcpp/qwen_image.hpp b/otherarch/sdcpp/qwen_image.hpp
index 726d24dd9..3ac32dedd 100644
--- a/otherarch/sdcpp/qwen_image.hpp
+++ b/otherarch/sdcpp/qwen_image.hpp
@@ -97,7 +97,10 @@ namespace Qwen {
             blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias));
             // to_out.1 is nn.Dropout
 
-            blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias));
+            float scale = 1.f / 32.f;
+            // The purpose of the scale here is to prevent NaN issues in certain situations.
+            // For example, when using CUDA with k-quant weights (this does not occur with every prompt).
+            blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
         }
 
         std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,