From 1a360b8458bb0f3efa84064ea1b2bb387eb5779c Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 12 Oct 2025 17:34:27 +0800 Subject: [PATCH] sdcpp: optimize the handling of the FeedForward precision fix (+1 squashed commits) Squashed commits: [621ff6392] sdcpp: optimize the handling of the FeedForward precision fix (+1 squashed commits) Squashed commits: [05b16906c] sdcpp: optimize the handling of the FeedForward precision fix --- koboldcpp.py | 2 +- otherarch/sdcpp/common.hpp | 22 +++++++++++----------- otherarch/sdcpp/ggml_extend.hpp | 18 ++++++++++++++---- otherarch/sdcpp/qwen_image.hpp | 5 ++++- 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 0714dc41f..1e6c946b3 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -64,7 +64,7 @@ dry_seq_break_max = 128 extra_images_max = 4 # global vars -KcppVersion = "1.100" +KcppVersion = "1.100.1" showdebug = True kcpp_instance = None #global running instance global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""} diff --git a/otherarch/sdcpp/common.hpp b/otherarch/sdcpp/common.hpp index 3ae599819..82c44ce41 100644 --- a/otherarch/sdcpp/common.hpp +++ b/otherarch/sdcpp/common.hpp @@ -243,9 +243,8 @@ public: int64_t dim_out, int64_t mult = 4, Activation activation = Activation::GEGLU, - bool force_prec_f32 = false) { + bool precision_fix = false) { int64_t inner_dim = dim * mult; - SD_UNUSED(force_prec_f32); if (activation == Activation::GELU) { blocks["net.0"] = std::shared_ptr(new GELU(dim, inner_dim)); } else { @@ -253,7 +252,14 @@ public: } // net_1 is nn.Dropout(), skip for inference - blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out)); + float scale = 1.f; + if (precision_fix) { + scale = 1.f / 128.f; + } + // The purpose of the scale here is to prevent NaN issues in certain situations. + // For example, when using Vulkan without enabling force_prec_f32, + // or when using CUDA but the weights are k-quants. + blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out, true, false, false, scale)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -264,13 +270,7 @@ public: auto net_2 = std::dynamic_pointer_cast(blocks["net.2"]); x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim] - // The purpose of the scale here is to prevent NaN issues in certain situations. - // For example, when using Vulkan without enabling force_prec_f32, - // or when using CUDA but the weights are k-quants. - float scale = 1.f / 128.f; - x = ggml_scale(ctx, x, scale); - x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out] - x = ggml_scale(ctx, x, 1.f / scale); + x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out] return x; } }; @@ -563,4 +563,4 @@ public: } }; -#endif // __COMMON_HPP__ +#endif // __COMMON_HPP__ \ No newline at end of file diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp index d34563708..bb964b299 100644 --- a/otherarch/sdcpp/ggml_extend.hpp +++ b/otherarch/sdcpp/ggml_extend.hpp @@ -953,11 +953,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* w, struct ggml_tensor* b, - bool force_prec_f32 = false) { + bool force_prec_f32 = false, + float scale = 1.f) { + if (scale != 1.f) { + x = ggml_scale(ctx, x, scale); + } x = ggml_mul_mat(ctx, w, x); if (force_prec_f32) { ggml_mul_mat_set_prec(x, GGML_PREC_F32); } + if (scale != 1.f) { + x = ggml_scale(ctx, x, 1.f / scale); + } if (b != NULL) { x = ggml_add_inplace(ctx, x, b); } @@ -1971,6 +1978,7 @@ protected: bool bias; bool force_f32; bool force_prec_f32; + float scale; void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32); @@ -1989,12 +1997,14 @@ public: int64_t out_features, bool bias = true, bool force_f32 = false, - bool force_prec_f32 = false) + bool force_prec_f32 = false, + float scale = 1.f) : in_features(in_features), out_features(out_features), bias(bias), force_f32(force_f32), - force_prec_f32(force_prec_f32) {} + force_prec_f32(force_prec_f32), + scale(scale) {} struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; @@ -2002,7 +2012,7 @@ public: if (bias) { b = params["bias"]; } - return ggml_nn_linear(ctx, x, w, b, force_prec_f32); + return ggml_nn_linear(ctx, x, w, b, force_prec_f32, scale); } }; diff --git a/otherarch/sdcpp/qwen_image.hpp b/otherarch/sdcpp/qwen_image.hpp index 726d24dd9..3ac32dedd 100644 --- a/otherarch/sdcpp/qwen_image.hpp +++ b/otherarch/sdcpp/qwen_image.hpp @@ -97,7 +97,10 @@ namespace Qwen { blocks["to_out.0"] = std::shared_ptr(new Linear(inner_dim, out_dim, out_bias)); // to_out.1 is nn.Dropout - blocks["to_add_out"] = std::shared_ptr(new Linear(inner_dim, out_context_dim, out_bias)); + float scale = 1.f / 32.f; + // The purpose of the scale here is to prevent NaN issues in certain situations. + // For example when using CUDA but the weights are k-quants (not all prompts). + blocks["to_add_out"] = std::shared_ptr(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale)); } std::pair forward(struct ggml_context* ctx,