mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 09:34:37 +00:00)

Merge commit '751fcfc6c3' into concedo_experimental
# Conflicts:
#    .github/workflows/build.yml
#    CONTRIBUTING.md
#    README.md
#    flake.lock
#    tests/CMakeLists.txt
#    tests/test-backend-ops.cpp
commit c81d1623b4
41 changed files with 102934 additions and 92608 deletions
@@ -119,7 +119,7 @@
 // bump if necessary
 #define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_LAYERS 256
+#define LLAMA_MAX_LAYERS 512
 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2

 //
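The per-layer hyperparameters in llama.cpp are stored in fixed-capacity arrays sized by LLAMA_MAX_LAYERS, so the previous cap of 256 made deeper models unloadable; the bump to 512 raises that ceiling. A minimal sketch of the pattern, assuming an illustrative hparams_t rather than the real struct:

#include <algorithm>
#include <array>
#include <cstdint>
#include <stdexcept>
#include <string>

// Illustrative stand-in for the fixed-capacity per-layer storage in llama.cpp.
constexpr uint32_t MAX_LAYERS = 512; // was 256 before this merge

struct hparams_t {
    uint32_t n_layer = 0;
    std::array<uint32_t, MAX_LAYERS> n_head_arr{}; // one entry per layer
};

void set_layer_count(hparams_t & hp, uint32_t n_layer) {
    // A model with more layers than the compile-time cap cannot be represented,
    // so the loader has to reject it up front.
    if (n_layer > MAX_LAYERS) {
        throw std::runtime_error("model has " + std::to_string(n_layer) +
                                 " layers, but this build supports at most " +
                                 std::to_string(MAX_LAYERS));
    }
    hp.n_layer = n_layer;
}

int main() {
    hparams_t hp;
    set_layer_count(hp, 300); // fails with a cap of 256, fine with 512
    return 0;
}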
@@ -3763,7 +3763,7 @@ struct llama_model_loader {
         }

         if (param_overrides_p != nullptr) {
-            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+            for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
                 kv_overrides.insert({std::string(p->key), *p});
             }
         }
@@ -3932,7 +3932,7 @@ struct llama_model_loader {
         ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);

         {
-            const int kid = gguf_find_key(meta, "general.file_type");
+            const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
             if (kid >= 0) {
                 ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
             }
@@ -4064,7 +4064,9 @@ struct llama_model_loader {
             throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
         }

-        GGML_ASSERT(arr_info.length <= N_MAX);
+        if (arr_info.length > N_MAX) {
+            throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
+        }

         std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
@@ -4100,8 +4102,6 @@ struct llama_model_loader {
     // get array of n <= N_MAX elements, or a single element repeated n times
     template<typename T, size_t N_MAX>
     bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
-        GGML_ASSERT(n <= N_MAX);
-
         const int kid = gguf_find_key(meta, key.c_str());

         if (kid < 0) {
@@ -4111,6 +4111,10 @@ struct llama_model_loader {
             return false;
         }

+        if (n > N_MAX) {
+            throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
+        }
+
         if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
             struct GGUFMeta::ArrayInfo arr_info =
                 GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
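Both loader hunks above replace a hard GGML_ASSERT with a thrown std::runtime_error, so an oversized metadata array fails the load with a readable message instead of aborting the process. A standalone sketch of the difference, using an illustrative load_array helper rather than the real loader code:

#include <algorithm>
#include <array>
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

// Illustrative helper: copy up to N_MAX values into a fixed-size array,
// throwing (instead of asserting) when the input is too long.
template <typename T, size_t N_MAX>
void load_array(const std::vector<T> & src, std::array<T, N_MAX> & dst) {
    if (src.size() > N_MAX) {
        throw std::runtime_error("array length " + std::to_string(src.size()) +
                                 " exceeds max " + std::to_string(N_MAX));
    }
    std::copy(src.begin(), src.end(), dst.begin());
}

int main() {
    std::array<int, 4> dst{};
    try {
        load_array(std::vector<int>{1, 2, 3, 4, 5}, dst); // too long on purpose
    } catch (const std::runtime_error & err) {
        // The caller can report the problem and keep running,
        // which an assert-triggered abort would not allow.
        std::fprintf(stderr, "load failed: %s\n", err.what());
    }
    return 0;
}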
@@ -5073,7 +5077,7 @@ static void llm_load_hparams(
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
-                    case 42: model.type = e_model::MODEL_SMALL; break;
+                    case 42: model.type = e_model::MODEL_7B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -5436,6 +5440,7 @@ static void llm_load_vocab(
         if (merges_keyidx == -1) {
             throw std::runtime_error("cannot find tokenizer merges in model file\n");
         }

         const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
@@ -5483,16 +5488,6 @@ static void llm_load_vocab(
         vocab.special_cls_id = -1;
         vocab.special_mask_id = -1;

-        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        } // The default value of add_space_prefix is true.
-
-        const int remove_extra_whitespaces_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS).c_str());
-        if (remove_extra_whitespaces_keyidx != -1) {
-            vocab.tokenizer_remove_extra_whitespaces = gguf_get_val_bool(ctx, remove_extra_whitespaces_keyidx);
-        } // The default value of remove_extra_whitespaces is false.
-
         const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
             size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
@@ -5600,6 +5595,19 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "jais") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                tokenizer_pre == "tekken") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                vocab.tokenizer_clean_spaces = false;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
+            } else if (
+                tokenizer_pre == "smollm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "codeshell") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5623,10 +5631,8 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }

-        const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
-        if (add_space_prefix_keyidx != -1) {
-            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
-        }
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
     }

     const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
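The two vocab hunks above drop the hand-rolled gguf_find_key / gguf_get_val_bool lookups in favour of the loader's get_key helper with required = false, which simply leaves the destination at its default when the key is absent. A self-contained sketch of that optional-key pattern over a plain map standing in for the GGUF metadata (the get_key below is illustrative, not the llama_model_loader method, and the key strings merely follow the usual tokenizer.ggml.* naming):

#include <cstdio>
#include <map>
#include <string>

// Illustrative metadata store standing in for GGUF key/value pairs.
using metadata = std::map<std::string, bool>;

// Optional-key lookup: when the key is missing and required == false,
// keep the caller's default instead of failing.
bool get_key(const metadata & meta, const std::string & key, bool & out, bool required = true) {
    auto it = meta.find(key);
    if (it == meta.end()) {
        if (required) {
            std::fprintf(stderr, "missing required key: %s\n", key.c_str());
        }
        return false;
    }
    out = it->second;
    return true;
}

int main() {
    metadata meta = { { "tokenizer.ggml.add_space_prefix", false } };

    bool add_space_prefix         = true;  // default when the key is absent
    bool remove_extra_whitespaces = false; // default when the key is absent

    get_key(meta, "tokenizer.ggml.add_space_prefix", add_space_prefix, false);
    get_key(meta, "tokenizer.ggml.remove_extra_whitespaces", remove_extra_whitespaces, false);

    std::printf("add_space_prefix=%d remove_extra_whitespaces=%d\n",
                add_space_prefix, remove_extra_whitespaces);
    return 0;
}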
@@ -6225,10 +6231,10 @@ static bool llm_load_tensors(

                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
-                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
-                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
-                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

                     // optional bias tensors
                     layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
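The new shapes derive the attention projections from per-head sizes instead of assuming n_embd on both sides: Q becomes n_embd x (n_embd_head_k * n_head), while K and V use n_embd_k_gqa / n_embd_v_gqa, i.e. the head size times the (possibly smaller) number of KV heads. A small sketch of the arithmetic, with example numbers that are illustrative rather than taken from a specific model:

#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative hyperparameters (not from a specific model):
    const uint32_t n_embd        = 4096; // model width
    const uint32_t n_head        = 32;   // query heads
    const uint32_t n_head_kv     = 8;    // KV heads (grouped-query attention)
    const uint32_t n_embd_head_k = 128;  // per-head key dimension
    const uint32_t n_embd_head_v = 128;  // per-head value dimension

    // Derived sizes used by the tensor shapes in the hunk above.
    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv; // 1024
    const uint32_t n_embd_v_gqa = n_embd_head_v * n_head_kv; // 1024

    std::printf("wq: %u x %u\n", n_embd, n_embd_head_k * n_head); // 4096 x 4096
    std::printf("wk: %u x %u\n", n_embd, n_embd_k_gqa);           // 4096 x 1024
    std::printf("wv: %u x %u\n", n_embd, n_embd_v_gqa);           // 4096 x 1024
    std::printf("wo: %u x %u\n", n_embd_head_k * n_head, n_embd); // 4096 x 4096
    return 0;
}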
@@ -15863,6 +15869,8 @@ struct llm_tokenizer_bpe {
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
             case LLAMA_VOCAB_PRE_TYPE_REFACT:
             case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
+            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -15900,6 +15908,13 @@ struct llm_tokenizer_bpe {
                     "\\p{N}",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
+                // original regex from tokenizer.json
+                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                regex_exprs = {
+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -18632,8 +18647,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     // copy the KV pairs from the input file
     gguf_set_kv     (ctx_out, ml.meta);
-    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
-    gguf_set_val_u32(ctx_out, "general.file_type", ftype);
+    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
+    gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV

     // Remove split metadata
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
     gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
@@ -19782,7 +19798,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
         case LLM_ARCH_PLAMO:
-        case LLM_ARCH_CODESHELL:
         case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
@@ -19812,6 +19827,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_STARCODER2:
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
             return LLAMA_ROPE_TYPE_NEOX;

         // all model arches should be listed explicitly here
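These two hunks move LLM_ARCH_CODESHELL from the LLAMA_ROPE_TYPE_NORM group to LLAMA_ROPE_TYPE_NEOX. The practical difference is which elements get paired for rotation: the NORM style rotates adjacent pairs (2i, 2i+1), while the NEOX style pairs element i with element i + n_rot/2. A minimal sketch of the two pairings on a single head; the head size is illustrative and a single fixed angle stands in for the position-dependent RoPE angles:

#include <cmath>
#include <cstdio>
#include <vector>

// Rotate one (a, b) pair by angle theta.
static void rotate(float & a, float & b, float theta) {
    const float c = std::cos(theta), s = std::sin(theta);
    const float a0 = a, b0 = b;
    a = a0 * c - b0 * s;
    b = a0 * s + b0 * c;
}

int main() {
    const int n_rot = 8;               // illustrative head size
    std::vector<float> norm(n_rot), neox(n_rot);
    for (int i = 0; i < n_rot; ++i) {
        norm[i] = neox[i] = (float) i; // same input for both layouts
    }

    const float theta = 0.5f;          // single fixed angle for the sketch

    // "NORM" layout: adjacent pairs (0,1), (2,3), ...
    for (int i = 0; i < n_rot; i += 2) {
        rotate(norm[i], norm[i + 1], theta);
    }

    // "NEOX" layout: split-half pairs (0,4), (1,5), ...
    for (int i = 0; i < n_rot / 2; ++i) {
        rotate(neox[i], neox[i + n_rot / 2], theta);
    }

    for (int i = 0; i < n_rot; ++i) {
        std::printf("i=%d norm=%.3f neox=%.3f\n", i, norm[i], neox[i]);
    }
    return 0;
}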
@@ -20267,7 +20283,7 @@ size_t llama_state_get_size(const struct llama_context * ctx) {
     );

     // on session change it is very likely that the state size has changed - so we need to update this function
-    static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");
+    static_assert(LLAMA_SESSION_VERSION == 7, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?");

     return s_total;
 }
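The session format version moves from 6 to 7; the static_assert exists purely to force whoever bumps the constant to revisit llama_state_get_size. The other side of such a bump is that session files written with the old version get rejected at load time. A small sketch of that version-gating pattern, with made-up constants and names rather than the real llama.cpp session magic and API:

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>

// Illustrative constants (not the real llama.cpp values or magic).
constexpr uint32_t SESSION_MAGIC   = 0x6767534e;
constexpr uint32_t SESSION_VERSION = 7;

struct session_header {
    uint32_t magic;
    uint32_t version;
};

// Reject files written by a different (older or newer) format version.
void check_session_header(const session_header & h) {
    if (h.magic != SESSION_MAGIC) {
        throw std::runtime_error("not a session file");
    }
    if (h.version != SESSION_VERSION) {
        throw std::runtime_error("unsupported session version " + std::to_string(h.version) +
                                 ", expected " + std::to_string(SESSION_VERSION));
    }
}

int main() {
    session_header old_file = { SESSION_MAGIC, 6 }; // written before the bump
    try {
        check_session_header(old_file);
    } catch (const std::runtime_error & err) {
        std::fprintf(stderr, "%s\n", err.what()); // caller re-creates the session instead
    }
    return 0;
}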
@@ -21971,7 +21987,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == "chaglm4" || tmpl_contains("[gMASK]<sop>")) {
+    } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
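The only change here fixes the template name string from the misspelled "chaglm4" to "chatglm4"; the [gMASK]<sop> substring detection was already correct, so only callers passing the template name explicitly were affected. A standalone sketch of that two-way detection (registered name or marker substring), with a simplified message type and illustrative output formatting:

#include <cstdio>
#include <string>
#include <vector>

struct chat_message {
    std::string role;
    std::string content;
};

// Detect the ChatGLM-4 template either by its registered name or by a
// characteristic substring of the Jinja template, as the hunk above does.
static bool is_chatglm4(const std::string & tmpl) {
    return tmpl == "chatglm4" || tmpl.find("[gMASK]<sop>") != std::string::npos;
}

int main() {
    const std::vector<chat_message> chat = {
        { "user", "hello" },
    };

    const std::string tmpl = "chatglm4"; // would have been missed as "chaglm4"
    if (is_chatglm4(tmpl)) {
        std::string ss = "[gMASK]<sop>";
        for (const auto & message : chat) {
            ss += "<|" + message.role + "|>\n" + message.content; // illustrative formatting
        }
        ss += "<|assistant|>";
        std::printf("%s\n", ss.c_str());
    }
    return 0;
}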