Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/full-cuda.Dockerfile
#	.devops/full.Dockerfile
#	.devops/main-cuda.Dockerfile
#	.devops/main-rocm.Dockerfile
#	.devops/main-vulkan.Dockerfile
#	.devops/main.Dockerfile
#	.devops/server-cuda.Dockerfile
#	.devops/server.Dockerfile
#	README.md
#	common/CMakeLists.txt
#	grammars/README.md
#	tests/test-grammar-integration.cpp
#	tests/test-grammar-parser.cpp
#	tests/test-json-schema-to-grammar.cpp
Concedo 2024-06-09 17:30:05 +08:00
commit 562d980140
25 changed files with 881 additions and 676 deletions

llama.cpp

@@ -728,6 +728,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
             { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
             { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
             { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
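The new LLM_TENSOR_ATTN_NORM_2 entry registers a per-layer name pattern for the second attention norm used by the Jina BERT v2 code models. As a rough, illustrative sketch only (this helper is not llama.cpp's actual tn() API), a "%d" pattern like this is expanded into a concrete per-layer tensor name, with a "weight"/"bias" suffix appended:

```cpp
// Illustrative only: not llama.cpp's tn() helper, just the general idea of
// expanding a per-layer pattern such as "blk.%d.attn_norm_2" into a full name.
#include <cstdio>
#include <string>

static std::string tensor_name(const char * fmt, int layer, const char * suffix) {
    char base[128];
    std::snprintf(base, sizeof(base), fmt, layer);  // e.g. "blk.7.attn_norm_2"
    std::string name(base);
    if (suffix != nullptr && *suffix != '\0') {
        name += ".";
        name += suffix;                             // e.g. "blk.7.attn_norm_2.weight"
    }
    return name;
}
```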
@@ -4715,8 +4716,7 @@ static void llm_load_vocab(
             LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
             LLAMA_LOG_WARN("%s: \n", __func__);
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (
-                tokenizer_pre == "default") {
+        } else if (tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
                 tokenizer_pre == "llama3" ||
@@ -4743,7 +4743,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -5593,7 +5594,7 @@ static bool llm_load_tensors(
             layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
         } else {
-            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
         }
         layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5634,6 +5635,9 @@ static bool llm_load_tensors(
         layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
         layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
+        layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
         layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
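Both attn_norm_2 tensors are created with llama_model_loader::TENSOR_NOT_REQUIRED, so GGUF files that do not ship them still load and the pointers simply stay null. A minimal, self-contained sketch of that optional-tensor pattern (simplified types and names, not the real loader API):

```cpp
// Optional-tensor lookup sketch: only a missing *required* tensor is an error;
// an absent optional tensor yields nullptr and callers must null-check it.
#include <map>
#include <stdexcept>
#include <string>

struct tensor_stub { /* weights elided */ };

static tensor_stub * find_tensor(std::map<std::string, tensor_stub> & tensors,
                                 const std::string & name, bool required) {
    auto it = tensors.find(name);
    if (it == tensors.end()) {
        if (required) {
            throw std::runtime_error("missing tensor: " + name);
        }
        return nullptr;  // optional tensor absent: the graph builder skips it
    }
    return &it->second;
}
```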
@@ -8597,6 +8601,11 @@ struct llm_build_context {
             // attention layer norm
             cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
+            if (model.layers[il].attn_norm_2 != nullptr) {
+                cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+            }
             struct ggml_tensor * ffn_inp = cur;
             cb(ffn_inp, "ffn_inp", il);
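In the graph builder, the optional second norm is applied only when the tensor exists: the layer input is added back in as a residual and a LayerNorm is run on top of the sum. A hedged numeric sketch of what that branch computes, using plain std::vector math rather than ggml:

```cpp
// Conceptual sketch of the new branch: residual add followed by LayerNorm.
// Plain vectors stand in for ggml tensors; sizes are assumed to match.
#include <cmath>
#include <vector>

static void layer_norm(std::vector<float> & x, const std::vector<float> & w,
                       const std::vector<float> & b, float eps = 1e-5f) {
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= x.size();
    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();
    const float inv = 1.0f / std::sqrt(var + eps);
    for (size_t i = 0; i < x.size(); ++i) {
        x[i] = (x[i] - mean) * inv * w[i] + b[i];
    }
}

static void attn_norm_2_branch(std::vector<float> & cur, const std::vector<float> & inpL,
                               const std::vector<float> * norm2_w,
                               const std::vector<float> * norm2_b) {
    if (norm2_w != nullptr && norm2_b != nullptr) {  // mirrors `attn_norm_2 != nullptr`
        for (size_t i = 0; i < cur.size(); ++i) {
            cur[i] += inpL[i];                       // re-add the layer input
        }
        layer_norm(cur, *norm2_w, *norm2_b);         // second attention norm
    }
}
```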
@@ -13940,7 +13949,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
         const uint32_t chr) {
     bool found = false;
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
@@ -13949,6 +13958,10 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
             // inclusive range, e.g. [a-z]
             found = found || (pos->value <= chr && chr <= pos[1].value);
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            found = true;
+            pos += 1;
         } else {
             // exact char match, e.g. [a] or "a"
             found = found || pos->value == chr;
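LLAMA_GRETYPE_CHAR_ANY is the grammar element behind the "." terminal: it counts as a positive match, accepts every character, and advances by a single element. A self-contained sketch of that matching logic with simplified types (not the real llama_grammar_element):

```cpp
// Simplified stand-in for llama_grammar_match_char: an ANY element matches
// everything, a (lower, upper) pair matches an inclusive range, and a plain
// element matches exactly one code point.
#include <cstdint>
#include <vector>

enum elem_type { ELEM_CHAR, ELEM_CHAR_RNG_UPPER, ELEM_CHAR_ANY };
struct elem { elem_type type; uint32_t value; };

static bool matches(const std::vector<elem> & alt, uint32_t chr) {
    for (size_t i = 0; i < alt.size(); ) {
        if (i + 1 < alt.size() && alt[i + 1].type == ELEM_CHAR_RNG_UPPER) {
            // inclusive range, e.g. [a-z]
            if (alt[i].value <= chr && chr <= alt[i + 1].value) return true;
            i += 2;
        } else if (alt[i].type == ELEM_CHAR_ANY) {
            // "." accepts any character
            return true;
        } else {
            // exact character, e.g. "a"
            if (alt[i].value == chr) return true;
            i += 1;
        }
    }
    return false;
}
```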
@@ -13966,7 +13979,7 @@ static bool llama_grammar_match_partial_char(
         const llama_grammar_element * pos,
         const llama_partial_utf8 partial_utf8) {
-    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
+    bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY;
     GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
     uint32_t partial_value = partial_utf8.value;
@@ -13996,6 +14009,9 @@ static bool llama_grammar_match_partial_char(
                 return is_positive_char;
             }
             pos += 2;
+        } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) {
+            // Any character matches "."
+            return true;
         } else {
             // exact char match, e.g. [a] or "a"
             if (low <= pos->value && pos->value <= high) {
@@ -14056,6 +14072,7 @@ static void llama_grammar_advance_stack(
             }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
+        case LLAMA_GRETYPE_CHAR_ANY:
            if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) {
                 // only add the stack if it's not a duplicate of one we already have
                 new_stacks.emplace_back(stack);
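In llama_grammar_advance_stack the new element is treated like the other character terminals, so a stack ending in "." simply waits for the next input character. A hypothetical usage sketch of a GBNF rule relying on the new terminal (grammar text only, not parsed here; the test grammars added in this commit may differ):

```cpp
// Hypothetical GBNF example: "." is the any-character terminal that the parser
// lowers to LLAMA_GRETYPE_CHAR_ANY. This rule accepts "ab", then any single
// character, then "cd".
const char * grammar_with_any = "root ::= \"ab\" . \"cd\"\n";
```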
@@ -15543,6 +15560,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
             qs.has_imatrix = true;
+            // check imatrix for nans or infs
+            for (const auto & kv : *imatrix_data) {
+                for (float f : kv.second) {
+                    if (!std::isfinite(f)) {
+                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
+                    }
+                }
+            }
         }
     }
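The quantization path now rejects importance-matrix data containing NaN or Inf up front, instead of letting it silently skew the quantization statistics. The same validation as a stand-alone sketch over a simplified container (not the actual imatrix type):

```cpp
// Reject importance-matrix data with non-finite entries before quantization.
#include <cmath>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

static void validate_imatrix(const std::map<std::string, std::vector<float>> & imatrix_data) {
    for (const auto & kv : imatrix_data) {
        for (float f : kv.second) {
            if (!std::isfinite(f)) {  // catches NaN, +Inf and -Inf
                throw std::runtime_error("imatrix entry '" + kv.first + "' contains a non-finite value");
            }
        }
    }
}
```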