mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-19 08:00:25 +00:00
sdcpp: optimize the handling of the FeedForward precision fix (+1 squashed commits)
Squashed commits: [621ff6392] sdcpp: optimize the handling of the FeedForward precision fix (+1 squashed commits) Squashed commits: [05b16906c] sdcpp: optimize the handling of the FeedForward precision fix
This commit is contained in:
parent
9503547ca1
commit
1a360b8458
4 changed files with 30 additions and 17 deletions
|
|
@ -64,7 +64,7 @@ dry_seq_break_max = 128
|
|||
extra_images_max = 4
|
||||
|
||||
# global vars
|
||||
KcppVersion = "1.100"
|
||||
KcppVersion = "1.100.1"
|
||||
showdebug = True
|
||||
kcpp_instance = None #global running instance
|
||||
global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""}
|
||||
|
|
|
|||
|
|
@ -243,9 +243,8 @@ public:
|
|||
int64_t dim_out,
|
||||
int64_t mult = 4,
|
||||
Activation activation = Activation::GEGLU,
|
||||
bool force_prec_f32 = false) {
|
||||
bool precision_fix = false) {
|
||||
int64_t inner_dim = dim * mult;
|
||||
SD_UNUSED(force_prec_f32);
|
||||
if (activation == Activation::GELU) {
|
||||
blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
|
||||
} else {
|
||||
|
|
@ -253,7 +252,14 @@ public:
|
|||
}
|
||||
|
||||
// net_1 is nn.Dropout(), skip for inference
|
||||
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
|
||||
float scale = 1.f;
|
||||
if (precision_fix) {
|
||||
scale = 1.f / 128.f;
|
||||
}
|
||||
// The purpose of the scale here is to prevent NaN issues in certain situations.
|
||||
// For example, when using Vulkan without enabling force_prec_f32,
|
||||
// or when using CUDA but the weights are k-quants.
|
||||
blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, false, scale));
|
||||
}
|
||||
|
||||
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||
|
|
@ -264,13 +270,7 @@ public:
|
|||
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
|
||||
|
||||
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
|
||||
// The purpose of the scale here is to prevent NaN issues in certain situations.
|
||||
// For example, when using Vulkan without enabling force_prec_f32,
|
||||
// or when using CUDA but the weights are k-quants.
|
||||
float scale = 1.f / 128.f;
|
||||
x = ggml_scale(ctx, x, scale);
|
||||
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
|
||||
x = ggml_scale(ctx, x, 1.f / scale);
|
||||
x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out]
|
||||
return x;
|
||||
}
|
||||
};
|
||||
|
|
@ -563,4 +563,4 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
#endif // __COMMON_HPP__
|
||||
#endif // __COMMON_HPP__
|
||||
|
|
@ -953,11 +953,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
|
|||
struct ggml_tensor* x,
|
||||
struct ggml_tensor* w,
|
||||
struct ggml_tensor* b,
|
||||
bool force_prec_f32 = false) {
|
||||
bool force_prec_f32 = false,
|
||||
float scale = 1.f) {
|
||||
if (scale != 1.f) {
|
||||
x = ggml_scale(ctx, x, scale);
|
||||
}
|
||||
x = ggml_mul_mat(ctx, w, x);
|
||||
if (force_prec_f32) {
|
||||
ggml_mul_mat_set_prec(x, GGML_PREC_F32);
|
||||
}
|
||||
if (scale != 1.f) {
|
||||
x = ggml_scale(ctx, x, 1.f / scale);
|
||||
}
|
||||
if (b != NULL) {
|
||||
x = ggml_add_inplace(ctx, x, b);
|
||||
}
|
||||
|
|
@ -1971,6 +1978,7 @@ protected:
|
|||
bool bias;
|
||||
bool force_f32;
|
||||
bool force_prec_f32;
|
||||
float scale;
|
||||
|
||||
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
|
||||
enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
|
||||
|
|
@ -1989,12 +1997,14 @@ public:
|
|||
int64_t out_features,
|
||||
bool bias = true,
|
||||
bool force_f32 = false,
|
||||
bool force_prec_f32 = false)
|
||||
bool force_prec_f32 = false,
|
||||
float scale = 1.f)
|
||||
: in_features(in_features),
|
||||
out_features(out_features),
|
||||
bias(bias),
|
||||
force_f32(force_f32),
|
||||
force_prec_f32(force_prec_f32) {}
|
||||
force_prec_f32(force_prec_f32),
|
||||
scale(scale) {}
|
||||
|
||||
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
|
||||
struct ggml_tensor* w = params["weight"];
|
||||
|
|
@ -2002,7 +2012,7 @@ public:
|
|||
if (bias) {
|
||||
b = params["bias"];
|
||||
}
|
||||
return ggml_nn_linear(ctx, x, w, b, force_prec_f32);
|
||||
return ggml_nn_linear(ctx, x, w, b, force_prec_f32, scale);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -97,7 +97,10 @@ namespace Qwen {
|
|||
blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_dim, out_bias));
|
||||
// to_out.1 is nn.Dropout
|
||||
|
||||
blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias));
|
||||
float scale = 1.f / 32.f;
|
||||
// The purpose of the scale here is to prevent NaN issues in certain situations.
|
||||
// For example when using CUDA but the weights are k-quants (not all prompts).
|
||||
blocks["to_add_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, out_context_dim, out_bias, false, false, scale));
|
||||
}
|
||||
|
||||
std::pair<ggml_tensor*, ggml_tensor*> forward(struct ggml_context* ctx,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue