Merge branch 'master' into concedo_experimental

# Conflicts:
#	Makefile
#	README.md
#	common/log.h
Concedo 2023-09-02 11:24:28 +08:00
commit eed651494e
26 changed files with 1143 additions and 658 deletions

llama.cpp

@@ -3610,7 +3610,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {

     if (stack.empty()) {
-        new_stacks.push_back(stack);
+        new_stacks.emplace_back(stack);
         return;
     }
@@ -3647,7 +3647,7 @@ static void llama_grammar_advance_stack(
             }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.push_back(stack);
+            new_stacks.emplace_back(stack);
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
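Worth noting: for an existing lvalue such as stack, push_back(stack) and emplace_back(stack) both copy-construct the new element, so these two hunks are behavior-neutral; emplace_back only pays off when the element is built in place from constructor arguments. A minimal standalone sketch:

    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> v;
        std::string s = "existing lvalue";

        v.push_back(s);      // copy-constructs the element from s
        v.emplace_back(s);   // identical here: also copy-constructs from s

        v.emplace_back(5, 'x');           // constructs "xxxxx" in place, no temporary
        v.push_back(std::string(5, 'x')); // builds a temporary, then moves it
    }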
@@ -4406,7 +4406,7 @@ struct llama_logit_info {
         }
         return min_heap;
     }
-    float probability_from_logit(float logit) {
+    float probability_from_logit(float logit) const {
         return normalizer * std::exp(logit - max_l);
     }
 };
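Adding const lets the method be called on a const llama_logit_info. The body is the numerically stable softmax: with max_l the largest logit and normalizer = 1 / sum_j exp(l_j - max_l), subtracting max_l before exponentiating avoids float overflow. A self-contained sketch of the same pattern (names are illustrative, not taken from the file):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Stable softmax: p_i = exp(l_i - max_l) / sum_j exp(l_j - max_l)
    std::vector<float> softmax(const std::vector<float> & logits) {
        const float max_l = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (float l : logits) {
            sum += std::exp(l - max_l);
        }
        const float normalizer = 1.0f / sum; // plays the role of the cached normalizer above
        std::vector<float> probs;
        probs.reserve(logits.size());
        for (float l : logits) {
            probs.push_back(normalizer * std::exp(l - max_l));
        }
        return probs;
    }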
@@ -4696,6 +4696,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);

+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
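only_copy is the new option this merge pulls in: when it is set, the output file inherits the input model's ftype, so the metadata stays consistent with the tensors actually written. A sketch of how a caller might request a pure copy through the public API (file paths are placeholders):

    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.only_copy = true; // copy tensors verbatim; ftype is inherited from the input

        // Placeholder paths; returns 0 on success.
        return llama_model_quantize("input.gguf", "output.gguf", &params);
    }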
@@ -4782,18 +4786,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= quantized_type != tensor->type;
+        quantize &= !params->only_copy;

         enum ggml_type new_type;
         void * new_data;
         size_t new_size;

-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4892,7 +4891,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);

             float * f32_data;
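Taken together, the last two hunks move the same-type check: quantize no longer compares against quantized_type up front, because the k-quant logic above may still override new_type per tensor. Only after those overrides does quantize = tensor->type != new_type decide whether anything is left to do, and a tensor that already has the target type falls into the pass-through branch. A condensed sketch of the resulting control flow (paraphrased, with a hypothetical pick_target_type standing in for the override logic):

    // Hypothetical condensation of the per-tensor loop above.
    if (quantize) {
        new_type = pick_target_type(tensor, quantized_type); // base type + per-tensor overrides
        quantize = (tensor->type != new_type);               // same type? nothing to do
    }
    if (!quantize) {
        new_type = tensor->type;    // pass the tensor through unchanged
        new_data = tensor->data;
        new_size = ggml_nbytes(tensor);
    } else {
        // dequantize to f32 if needed, then quantize into new_type
    }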
@@ -5323,6 +5331,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                     =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize          =*/ false,
         /*.quantize_output_tensor    =*/ true,
+        /*.only_copy                 =*/ false,
     };

     return result;
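The /*.only_copy =*/ comment mimics a designated initializer, which the project's C++ standard at the time predates; with plain aggregate initialization the new field must be appended in the exact order it is declared in llama.h. Under C++20 the same defaults could be spelled explicitly (a sketch listing only the fields visible in this hunk; omitted members value-initialize):

    llama_model_quantize_params result = {
        .ftype                  = LLAMA_FTYPE_MOSTLY_Q5_1,
        .allow_requantize       = false,
        .quantize_output_tensor = true,
        .only_copy              = false,
    };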