mirror of https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge branch 'master' into concedo_experimental
# Conflicts:
#	Makefile
#	README.md
#	common/log.h
commit eed651494e
26 changed files with 1143 additions and 658 deletions
llama.cpp (31 changes)
@@ -3610,7 +3610,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.push_back(stack);
+        new_stacks.emplace_back(stack);
         return;
     }
 
@@ -3647,7 +3647,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.push_back(stack);
+            new_stacks.emplace_back(stack);
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
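The first two hunks swap push_back for emplace_back when copying a grammar stack. For an lvalue argument of the element type the two calls are equivalent (both copy-construct the new element in place), so the change is behavior-neutral here. A minimal standalone sketch of the distinction, unrelated to the llama.cpp sources:

#include <vector>

int main() {
    std::vector<std::vector<int>> stacks;
    std::vector<int> stack = {1, 2, 3};

    stacks.push_back(stack);     // copy-constructs a new element from the lvalue
    stacks.emplace_back(stack);  // an lvalue picks the same copy constructor

    // emplace_back only wins when passing constructor arguments instead of a
    // ready-made element: this builds {0, 0, 0} directly inside the vector,
    // where push_back would need a temporary std::vector<int> first.
    stacks.emplace_back(3, 0);
    return 0;
}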
@@ -4406,7 +4406,7 @@ struct llama_logit_info {
         }
         return min_heap;
     }
-    float probability_from_logit(float logit) {
+    float probability_from_logit(float logit) const {
         return normalizer * std::exp(logit - max_l);
     }
 };
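The added const qualifier lets probability_from_logit be called through const references to llama_logit_info; the body is unchanged. Judging from the visible return statement, normalizer is the reciprocal of the max-shifted partition sum, i.e. the method evaluates one term of a numerically stable softmax. A standalone sketch of that arithmetic (inferred from the shown body, not copied from the full struct):

#include <algorithm>
#include <cmath>
#include <vector>

// One softmax term, computed the way probability_from_logit appears to:
// shifting by the maximum logit keeps std::exp in range, and the final
// normalizer makes all terms sum to 1.
float softmax_term(const std::vector<float> & logits, float logit) {
    const float max_l = *std::max_element(logits.begin(), logits.end());

    float sum = 0.0f;
    for (float l : logits) {
        sum += std::exp(l - max_l);
    }
    const float normalizer = 1.0f / sum;

    return normalizer * std::exp(logit - max_l);
}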
@@ -4696,6 +4696,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);
 
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
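This early override makes a pure copy advertise the correct file type: when nothing is re-encoded, the output metadata should record the source model's ftype rather than the requested target type. A tiny sketch of the intent (names simplified, not the real structs):

// Sketch: pick the file type recorded in the output's metadata.
enum class FType { MOSTLY_F16, MOSTLY_Q4_0, MOSTLY_Q5_1 };

FType output_ftype(bool only_copy, FType requested, FType source_model) {
    // Copy mode ignores the requested target type entirely.
    return only_copy ? source_model : requested;
}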
@@ -4782,18 +4786,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= quantized_type != tensor->type;
+        quantize &= !params->only_copy;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4892,7 +4891,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
         }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
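Taken together, the quantize hunks change the per-tensor decision: only_copy now forces the raw-copy path up front, and the old quantized_type != tensor->type guard moves below the k-quant overrides, so a tensor whose chosen new_type already matches its current type is copied rather than re-encoded. A condensed standalone sketch of the resulting flow; the enum and structs below are simplified stand-ins, not the real ggml/llama types:

#include <string>

// Simplified stand-ins for the ggml/llama types involved.
enum class Type { F32, F16, Q4_0, Q5_1 };

struct Params { bool quantize_output_tensor; bool only_copy; };
struct Tensor { std::string name; int n_dims; Type type; };

// Returns the type the tensor will have in the output file.
Type decide_new_type(const Params & params, const Tensor & tensor, Type quantized_type) {
    bool quantize = tensor.n_dims == 2;  // quantize only 2D tensors
    quantize = quantize && (params.quantize_output_tensor || tensor.name != "output.weight");
    quantize = quantize && !params.only_copy;  // new: copy mode short-circuits everything

    Type new_type = tensor.type;
    if (quantize) {
        new_type = quantized_type;
        // (the real code may override new_type per tensor under GGML_USE_K_QUANTS)
        quantize = tensor.type != new_type;  // moved check: same type, nothing to do
    }
    if (!quantize) {
        new_type = tensor.type;  // raw copy path: keep data and size as-is
    }
    return new_type;
}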
@@ -5323,6 +5331,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                     =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize          =*/ false,
         /*.quantize_output_tensor    =*/ true,
+        /*.only_copy                 =*/ false,
     };
 
     return result;
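For callers, the new field slots into the existing quantize API. A hedged usage sketch (file names are placeholders, assuming the llama_model_quantize entry point of the same era):

#include "llama.h"

int main() {
    // Start from the defaults shown above, then opt into copy-only mode:
    // tensor data is written unchanged, and the output ftype follows the input model.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.only_copy = true;

    return llama_model_quantize("model-in.gguf", "model-out.gguf", &params);
}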