From 54db35cd7ad7e3066aecee90dfe59785a43a87dc Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 12 Oct 2025 20:35:46 +0800
Subject: [PATCH] fix t5 scale as well

---
 otherarch/sdcpp/t5.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/otherarch/sdcpp/t5.hpp b/otherarch/sdcpp/t5.hpp
index 2d75c999c..11f8ba11e 100644
--- a/otherarch/sdcpp/t5.hpp
+++ b/otherarch/sdcpp/t5.hpp
@@ -504,7 +504,9 @@ public:
     T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) {
         blocks["wi_0"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
         blocks["wi_1"] = std::shared_ptr<GGMLBlock>(new Linear(model_dim, ff_dim, false));
-        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false));
+        float scale = 1.f / 32.f;
+        // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...).
+        blocks["wo"] = std::shared_ptr<GGMLBlock>(new Linear(ff_dim, model_dim, false, false, false, scale));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {