Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile
#	README.md
#	build.zig
#	tests/test-backend-ops.cpp
Concedo 2024-02-21 18:59:13 +08:00
commit 1a490e87c8
17 changed files with 734 additions and 110 deletions

llama.cpp

@@ -2560,6 +2560,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
             case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
             case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
+            case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
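
The hunk above teaches the model loader to map files whose dominant tensor type is GGML_TYPE_IQ4_NL to the new LLAMA_FTYPE_MOSTLY_IQ4_NL file type. A minimal sketch of how that derived type becomes visible to API users follows; it is not part of this commit, the file path is a placeholder, and the llama.h entry points should be checked against this revision.

// Sketch only: print the description (including the detected file type) of a GGUF model.
// "model-iq4_nl.gguf" is a placeholder; real code should also call llama_backend_init() first.
#include <cstdio>
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model-iq4_nl.gguf", mparams);
    if (model == nullptr) return 1;

    char desc[128];
    llama_model_desc(model, desc, sizeof(desc)); // e.g. "llama 7B IQ4_NL - 4.5 bpw"
    printf("%s\n", desc);

    llama_free_model(model);
    return 0;
}
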
@@ -2923,6 +2924,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
         default: return "unknown, may not work";
     }
@@ -10677,6 +10679,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
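
This change keeps attn_v.weight at Q5_K for IQ4_NL quantizations of models with a grouped-query-attention factor of at least 4. A small standalone illustration of that condition follows; the helper and the head counts (Llama-2-70B style: 64 heads, 8 KV heads) are examples, not code from this diff.

// Illustration only: n_gqa() in llama.cpp is essentially n_head / n_head_kv.
// The numbers below are the well-known Llama-2-70B settings, used as an example.
#include <cstdint>
#include <cstdio>

static uint32_t n_gqa(uint32_t n_head, uint32_t n_head_kv) {
    return n_head_kv == 0 ? 0 : n_head / n_head_kv;
}

int main() {
    const uint32_t gqa = n_gqa(/*n_head=*/64, /*n_head_kv=*/8); // -> 8
    // per the new rule, an IQ4_NL quantization of such a model keeps attn_v.weight at Q5_K
    printf("n_gqa = %u -> attn_v.weight stays at %s\n", gqa, gqa >= 4 ? "Q5_K" : "IQ4_NL");
    return 0;
}
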
@@ -10729,6 +10734,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                 if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
+            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
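
This rule gives the early ffn_down layers extra precision when no importance matrix is supplied: the first n_layer/8 layers are promoted to Q5_K. A tiny worked example of that cutoff, assuming a 32-layer model, is sketched below; it only mirrors the condition and is not the surrounding llama.cpp code.

// Worked example only: with n_layer = 32 and no imatrix, ffn_down in layers 0..3
// is promoted to Q5_K; the remaining layers stay at IQ4_NL.
#include <cstdio>

int main() {
    const int n_layer = 32; // e.g. a 7B-class model
    for (int i_layer = 0; i_layer < n_layer; ++i_layer) {
        const char * type = (i_layer < n_layer/8) ? "Q5_K" : "IQ4_NL";
        printf("ffn_down layer %2d -> %s\n", i_layer, type);
    }
    return 0;
}
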
@@ -10745,7 +10753,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                     new_type = GGML_TYPE_Q5_K;
                 }
@@ -10812,8 +10820,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             case GGML_TYPE_IQ2_XS:
             case GGML_TYPE_IQ3_XXS:
             case GGML_TYPE_IQ1_S:
-            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
-            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
             case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
             case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
             case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
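
This switch is the fallback path taken when a tensor's row length is not divisible by the block size of the chosen quant: Q2_K and Q3_K (256-wide super-blocks) now fall back to IQ4_NL, whose blocks are 32 wide, instead of the legacy Q4_0/Q4_1. A simplified sketch of the divisibility check is below; it uses the public ggml_blck_size() helper but is not the literal llama.cpp logic.

// Simplified sketch: check whether a row length ne0 is compatible with a quant's block size.
// ggml_blck_size(GGML_TYPE_Q3_K) is 256, ggml_blck_size(GGML_TYPE_IQ4_NL) is 32.
#include "ggml.h"

static bool fits(enum ggml_type type, int64_t ne0) {
    return ne0 % ggml_blck_size(type) == 0;
}

int main() {
    const int64_t ne0 = 8000; // hypothetical row length: not a multiple of 256, but a multiple of 32
    // this is exactly the situation where the new IQ4_NL fallback applies
    return (!fits(GGML_TYPE_Q3_K, ne0) && fits(GGML_TYPE_IQ4_NL, ne0)) ? 0 : 1;
}
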
@@ -10854,7 +10862,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
-        case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S ; break;
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  quantized_type = GGML_TYPE_IQ4_NL; break;
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
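
With the ftype-to-tensor-type mapping in place, LLAMA_FTYPE_MOSTLY_IQ4_NL can be requested through the normal quantization API. A hedged usage sketch follows; file names are placeholders and the parameter struct fields should be checked against llama.h at this revision.

// Usage sketch only: quantize an F16 GGUF to IQ4_NL via the public API.
// Input/output paths are placeholders.
#include "llama.h"

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ4_NL; // the file type added by this commit
    params.nthread = 0;                         // 0 = let llama.cpp pick the thread count

    // returns 0 on success
    return llama_model_quantize("model-f16.gguf", "model-iq4_nl.gguf", &params);
}
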
@@ -12930,7 +12939,7 @@ LLAMA_API int32_t llama_chat_apply_template(
         // load template from model
         std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
         std::string template_key = "tokenizer.chat_template";
-        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size());
+        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
         if (res < 0) {
             // worst case: there is no information about template, we will use chatml by default
             curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
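
The last hunk fixes the buffer size passed to llama_model_meta_val_str: it previously used curr_tmpl.size() (the short default string) instead of the 2048-byte model_template buffer, so long built-in templates could be truncated. A hedged usage sketch of the code path this feeds, llama_chat_apply_template with the model's own template, is below; the messages are placeholders and the signature should be checked against llama.h at this revision.

// Usage sketch only: format a short conversation with the template stored in the
// model's tokenizer.chat_template metadata (tmpl == nullptr selects it).
#include <cstdio>
#include <vector>
#include "llama.h"

int format_chat(const llama_model * model) {
    const llama_chat_message msgs[] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(model, /*tmpl=*/nullptr, msgs, 2,
                                                /*add_ass=*/true, buf.data(), buf.size());
    if (n < 0) return n;              // no usable template; caller can fall back to chatml
    // if n exceeded buf.size() the output was truncated; real code should resize and retry
    printf("%.*s\n", n, buf.data());
    return 0;
}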