Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	README.md
#	docs/build.md
#	docs/development/HOWTO-add-model.md
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
This commit is contained in:
Concedo 2025-01-10 17:57:38 +08:00
commit b154bd3671
50 changed files with 189817 additions and 187510 deletions

View file

@ -620,7 +620,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// sanity checks
// sanity checks for models that have attention layers
if (qs.n_attention_wv != 0)
{
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
// attention layers have a non-zero number of kv heads
@ -758,6 +759,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
quantize &= name.find("time_mix_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
// do not quantize relative position bias (T5)
quantize &= name.find("attn_rel_b.weight") == std::string::npos;