mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-12 22:32:18 +00:00
Merge commit '017cc5f446' into concedo_experimental
# Conflicts: # .github/ISSUE_TEMPLATE/010-bug-compilation.yml # .github/ISSUE_TEMPLATE/019-bug-misc.yml # CODEOWNERS # examples/batched-bench/batched-bench.cpp # examples/batched/batched.cpp # examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp # examples/gritlm/gritlm.cpp # examples/llama-bench/llama-bench.cpp # examples/passkey/passkey.cpp # examples/quantize-stats/quantize-stats.cpp # examples/run/run.cpp # examples/simple-chat/simple-chat.cpp # examples/simple/simple.cpp # examples/tokenize/tokenize.cpp # ggml/CMakeLists.txt # ggml/src/ggml-metal/CMakeLists.txt # ggml/src/ggml-vulkan/CMakeLists.txt # scripts/sync-ggml.last # src/llama.cpp # tests/test-autorelease.cpp # tests/test-model-load-cancel.cpp # tests/test-tokenizer-0.cpp # tests/test-tokenizer-1-bpe.cpp # tests/test-tokenizer-1-spm.cpp
This commit is contained in:
commit
dcfa1eca4e
45 changed files with 806 additions and 229 deletions
|
|
@ -39,6 +39,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_MAMBA, "mamba" },
|
||||
{ LLM_ARCH_XVERSE, "xverse" },
|
||||
{ LLM_ARCH_COMMAND_R, "command-r" },
|
||||
{ LLM_ARCH_COHERE2, "cohere2" },
|
||||
{ LLM_ARCH_DBRX, "dbrx" },
|
||||
{ LLM_ARCH_OLMO, "olmo" },
|
||||
{ LLM_ARCH_OLMO2, "olmo2" },
|
||||
|
|
@ -91,6 +92,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
|||
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
||||
{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
|
||||
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
|
||||
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
|
||||
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
|
||||
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
|
||||
|
|
@ -807,6 +810,21 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_COHERE2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_DBRX,
|
||||
{
|
||||
|
|
@ -968,6 +986,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
|||
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
|
||||
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
|
||||
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
|
||||
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
|
||||
},
|
||||
},
|
||||
{
|
||||
|
|
@ -1350,6 +1369,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
|||
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
|
||||
|
|
|
|||
|
|
@ -43,6 +43,7 @@ enum llm_arch {
|
|||
LLM_ARCH_MAMBA,
|
||||
LLM_ARCH_XVERSE,
|
||||
LLM_ARCH_COMMAND_R,
|
||||
LLM_ARCH_COHERE2,
|
||||
LLM_ARCH_DBRX,
|
||||
LLM_ARCH_OLMO,
|
||||
LLM_ARCH_OLMO2,
|
||||
|
|
@ -95,6 +96,8 @@ enum llm_kv {
|
|||
LLM_KV_EXPERT_USED_COUNT,
|
||||
LLM_KV_EXPERT_SHARED_COUNT,
|
||||
LLM_KV_EXPERT_WEIGHTS_SCALE,
|
||||
LLM_KV_EXPERT_WEIGHTS_NORM,
|
||||
LLM_KV_EXPERT_GATING_FUNC,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
LLM_KV_LOGIT_SCALE,
|
||||
LLM_KV_DECODER_START_TOKEN_ID,
|
||||
|
|
@ -230,6 +233,7 @@ enum llm_tensor {
|
|||
LLM_TENSOR_FFN_DOWN_SHEXP,
|
||||
LLM_TENSOR_FFN_GATE_SHEXP,
|
||||
LLM_TENSOR_FFN_UP_SHEXP,
|
||||
LLM_TENSOR_FFN_EXP_PROBS_B,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
LLM_TENSOR_LAYER_OUT_NORM,
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
|||
{ "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
|
||||
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
|
||||
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
|
||||
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
|
||||
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
|
||||
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
|
||||
|
|
@ -148,6 +149,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
|||
return LLM_CHAT_TEMPLATE_MINICPM;
|
||||
} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
|
||||
return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
|
||||
} else if (tmpl_contains(LU8("'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'"))) {
|
||||
return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
|
||||
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
|
||||
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
||||
// EXAONE-3.0-7.8B-Instruct
|
||||
|
|
@ -453,6 +456,21 @@ int32_t llm_chat_apply_template(
|
|||
if (add_ass) {
|
||||
ss << "Assistant:";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
|
||||
// DeepSeek-V3
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "system") {
|
||||
ss << message->content << "\n\n";
|
||||
} else if (role == "user") {
|
||||
ss << LU8("<|User|>") << message->content;
|
||||
} else if (role == "assistant") {
|
||||
ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
|
||||
}
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << LU8("<|Assistant|>");
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
|
||||
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
||||
// EXAONE-3.0-7.8B-Instruct
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ enum llm_chat_template {
|
|||
LLM_CHAT_TEMPLATE_VICUNA_ORCA,
|
||||
LLM_CHAT_TEMPLATE_DEEPSEEK,
|
||||
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
|
||||
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
|
||||
LLM_CHAT_TEMPLATE_COMMAND_R,
|
||||
LLM_CHAT_TEMPLATE_LLAMA_3,
|
||||
LLM_CHAT_TEMPLATE_CHATGML_3,
|
||||
|
|
|
|||
|
|
@ -6,7 +6,13 @@
|
|||
|
||||
// bump if necessary
|
||||
#define LLAMA_MAX_LAYERS 512
|
||||
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
|
||||
#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
|
||||
|
||||
enum llama_expert_gating_func_type {
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
|
||||
};
|
||||
|
||||
struct llama_hparams_posnet {
|
||||
uint32_t n_embd;
|
||||
|
|
@ -54,7 +60,9 @@ struct llama_hparams {
|
|||
uint32_t n_expert_shared = 0;
|
||||
uint32_t n_norm_groups = 0;
|
||||
|
||||
float expert_weights_scale = 0.0;
|
||||
float expert_weights_scale = 0.0;
|
||||
bool expert_weights_norm = false;
|
||||
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
|
|
|
|||
|
|
@ -119,10 +119,10 @@ bool llama_kv_cache_init(
|
|||
|
||||
struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
||||
struct llama_kv_cache & cache,
|
||||
const struct llama_ubatch & batch) {
|
||||
const uint32_t n_tokens = batch.n_tokens;
|
||||
const uint32_t n_seqs = batch.n_seqs;
|
||||
const uint32_t n_seq_tokens = batch.n_seq_tokens;
|
||||
const struct llama_ubatch & ubatch) {
|
||||
const uint32_t n_tokens = ubatch.n_tokens;
|
||||
const uint32_t n_seqs = ubatch.n_seqs;
|
||||
const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
if (cache.recurrent) {
|
||||
// For recurrent state architectures (like Mamba or RWKV),
|
||||
|
|
@ -130,16 +130,16 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
|||
// A slot should be always be contiguous.
|
||||
|
||||
// can only process batches with an equal number of new tokens in each sequence
|
||||
GGML_ASSERT(batch.equal_seqs);
|
||||
GGML_ASSERT(ubatch.equal_seqs);
|
||||
|
||||
int32_t min = cache.size - 1;
|
||||
int32_t max = 0;
|
||||
|
||||
// everything should fit if all seq_ids are smaller than the max
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const uint32_t n_seq_id = batch.n_seq_id[s];
|
||||
const uint32_t n_seq_id = ubatch.n_seq_id[s];
|
||||
for (uint32_t j = 0; j < n_seq_id; ++j) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][j];
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][j];
|
||||
|
||||
if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
|
||||
// too big seq_id
|
||||
|
|
@ -198,7 +198,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
|||
|
||||
// find usable cell range
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][0];
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][0];
|
||||
llama_kv_cell & seq_meta = cache.cells[seq_id];
|
||||
bool has_cell = false;
|
||||
if (seq_meta.tail >= 0) {
|
||||
|
|
@ -237,7 +237,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
|||
// gather and re-order
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
int32_t dst_id = s + min;
|
||||
int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
|
||||
int32_t src_id = cache.cells[ubatch.seq_id[s][0]].tail;
|
||||
if (dst_id != src_id) {
|
||||
llama_kv_cell & dst_cell = cache.cells[dst_id];
|
||||
llama_kv_cell & src_cell = cache.cells[src_id];
|
||||
|
|
@ -258,7 +258,7 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
|||
|
||||
// update the pos of the used seqs
|
||||
for (uint32_t s = 0; s < n_seqs; ++s) {
|
||||
const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
|
||||
const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
|
||||
int32_t cell_id = s + min;
|
||||
llama_kv_cell & cell = cache.cells[cell_id];
|
||||
|
||||
|
|
@ -266,12 +266,12 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
|||
// What should happen when the pos backtracks or skips a value?
|
||||
// Clearing the state mid-batch would require special-casing which isn't done.
|
||||
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
|
||||
__func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
|
||||
__func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
|
||||
}
|
||||
cell.pos = last_pos;
|
||||
cell.seq_id.clear();
|
||||
for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
|
||||
const llama_seq_id seq_id = batch.seq_id[s][j];
|
||||
for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
|
||||
const llama_seq_id seq_id = ubatch.seq_id[s][j];
|
||||
cell.seq_id.insert(seq_id);
|
||||
cache.cells[seq_id].tail = cell_id;
|
||||
}
|
||||
|
|
@ -325,10 +325,10 @@ struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
|
|||
for (uint32_t s = 0; s < n_seqs; s++) {
|
||||
for (uint32_t i = 0; i < n_seq_tokens; ++i) {
|
||||
uint32_t k = s*n_seq_tokens + i;
|
||||
cache.cells[cache.head + k].pos = batch.pos[k];
|
||||
cache.cells[cache.head + k].pos = ubatch.pos[k];
|
||||
|
||||
for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
|
||||
cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
|
||||
for (int32_t j = 0; j < ubatch.n_seq_id[s]; j++) {
|
||||
cache.cells[cache.head + k].seq_id.insert(ubatch.seq_id[s][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -241,12 +241,16 @@ llama_file::~llama_file() = default;
|
|||
size_t llama_file::tell() const { return pimpl->tell(); }
|
||||
size_t llama_file::size() const { return pimpl->size; }
|
||||
|
||||
int llama_file::fileno() const {
|
||||
int llama_file::file_id() const {
|
||||
#ifdef _WIN32
|
||||
return _fileno(pimpl->fp);
|
||||
#else
|
||||
#if defined(fileno)
|
||||
return fileno(pimpl->fp);
|
||||
#else
|
||||
return ::fileno(pimpl->fp);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
|
||||
|
|
@ -265,7 +269,7 @@ struct llama_mmap::impl {
|
|||
|
||||
impl(struct llama_file * file, size_t prefetch, bool numa) {
|
||||
size = file->size();
|
||||
int fd = file->fileno();
|
||||
int fd = file->file_id();
|
||||
int flags = MAP_SHARED;
|
||||
if (numa) { prefetch = 0; }
|
||||
#ifdef __linux__
|
||||
|
|
@ -357,7 +361,7 @@ struct llama_mmap::impl {
|
|||
|
||||
size = file->size();
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(file->fileno());
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(file->file_id());
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ struct llama_file {
|
|||
size_t tell() const;
|
||||
size_t size() const;
|
||||
|
||||
int fileno() const;
|
||||
int file_id() const; // fileno overload
|
||||
|
||||
void seek(size_t offset, int whence) const;
|
||||
|
||||
|
|
|
|||
|
|
@ -70,6 +70,7 @@ const char * llm_type_name(llm_type type) {
|
|||
case MODEL_70B: return "70B";
|
||||
case MODEL_236B: return "236B";
|
||||
case MODEL_314B: return "314B";
|
||||
case MODEL_671B: return "671B";
|
||||
case MODEL_SMALL: return "0.1B";
|
||||
case MODEL_MEDIUM: return "0.4B";
|
||||
case MODEL_LARGE: return "0.8B";
|
||||
|
|
@ -129,6 +130,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
}
|
||||
}
|
||||
|
||||
static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
|
||||
switch (type) {
|
||||
case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
|
||||
case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
std::string llama_model_arch_name (const llama_model & model) {
|
||||
return llm_arch_name(model.arch);
|
||||
}
|
||||
|
|
@ -793,6 +802,16 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
|
|||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_COHERE2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 32: model.type = e_model::MODEL_8B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_DBRX:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
|
|
@ -930,11 +949,19 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
|
|||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
|
||||
// for compatibility with existing DeepSeek V2 and V2.5 GGUFs
|
||||
// that have no expert_gating_func model parameter set
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
|
||||
}
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 27: model.type = e_model::MODEL_16B; break;
|
||||
case 60: model.type = e_model::MODEL_236B; break;
|
||||
case 61: model.type = e_model::MODEL_671B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
|
@ -1266,6 +1293,10 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
|
|||
tokenizer_pre == "deepseek-coder") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "deepseek-v3") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "falcon") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
|
||||
|
|
@ -1912,24 +1943,24 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
||||
|
||||
// special tokens
|
||||
if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
||||
if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
||||
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
||||
if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
|
||||
if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
||||
if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
||||
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
||||
if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
|
||||
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
|
||||
if (vocab.special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
|
||||
if (vocab.special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
|
||||
if (vocab.special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
||||
if (vocab.special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
|
||||
if (vocab.special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
|
||||
if (vocab.special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
|
||||
if (vocab.special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
||||
if (vocab.special_cls_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
|
||||
if (vocab.special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
|
||||
|
||||
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
||||
if (vocab.linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
||||
|
||||
if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
|
||||
if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
|
||||
if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
|
||||
if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
|
||||
if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
|
||||
if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
|
||||
if (vocab.special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
|
||||
if (vocab.special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
|
||||
if (vocab.special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
|
||||
if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
|
||||
if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
|
||||
if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
|
||||
|
||||
for (const auto & id : vocab.special_eog_ids) {
|
||||
LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
|
||||
|
|
@ -1951,6 +1982,8 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
||||
}
|
||||
|
||||
|
|
@ -1996,6 +2029,10 @@ struct llama_model_params llama_model_default_params() {
|
|||
}
|
||||
|
||||
void llama_free_model(struct llama_model * model) {
|
||||
llama_model_free(model);
|
||||
}
|
||||
|
||||
void llama_model_free(struct llama_model * model) {
|
||||
delete model;
|
||||
}
|
||||
|
||||
|
|
@ -2051,6 +2088,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_XVERSE:
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
case LLM_ARCH_COHERE2:
|
||||
case LLM_ARCH_OLMO:
|
||||
case LLM_ARCH_ARCTIC:
|
||||
case LLM_ARCH_DEEPSEEK:
|
||||
|
|
|
|||
|
|
@ -63,6 +63,7 @@ enum llm_type {
|
|||
MODEL_70B,
|
||||
MODEL_236B,
|
||||
MODEL_314B,
|
||||
MODEL_671B,
|
||||
MODEL_SMALL,
|
||||
MODEL_MEDIUM,
|
||||
MODEL_LARGE,
|
||||
|
|
@ -213,6 +214,7 @@ struct llama_layer {
|
|||
struct ggml_tensor * ffn_down_b = nullptr; // b2
|
||||
struct ggml_tensor * ffn_up_b = nullptr; // b3
|
||||
struct ggml_tensor * ffn_act = nullptr;
|
||||
struct ggml_tensor * ffn_exp_probs_b = nullptr;
|
||||
|
||||
// mamba proj
|
||||
struct ggml_tensor * ssm_in = nullptr;
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ static void zeros(std::ofstream & file, size_t n) {
|
|||
}
|
||||
}
|
||||
|
||||
struct quantize_state_internal {
|
||||
struct quantize_state_impl {
|
||||
const llama_model & model;
|
||||
const llama_model_quantize_params * params;
|
||||
|
||||
|
|
@ -43,13 +43,13 @@ struct quantize_state_internal {
|
|||
// used to figure out if a model shares tok_embd with the output weight
|
||||
bool has_output = false;
|
||||
|
||||
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
|
||||
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
|
||||
: model(model)
|
||||
, params(params)
|
||||
{}
|
||||
};
|
||||
|
||||
static void llama_tensor_dequantize_internal(
|
||||
static void llama_tensor_dequantize_impl(
|
||||
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
|
||||
const size_t nelements, const int nthread
|
||||
) {
|
||||
|
|
@ -121,7 +121,7 @@ static void llama_tensor_dequantize_internal(
|
|||
workers.clear();
|
||||
}
|
||||
|
||||
static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
|
||||
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
|
||||
const std::string name = ggml_get_name(tensor);
|
||||
|
||||
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
||||
|
|
@ -410,7 +410,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
|||
return new_type;
|
||||
}
|
||||
|
||||
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
if (nthread < 2) {
|
||||
// single-thread
|
||||
size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
|
||||
|
|
@ -464,7 +464,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
|
|||
return new_size;
|
||||
}
|
||||
|
||||
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
|
||||
ggml_type default_type;
|
||||
llama_ftype ftype = params->ftype;
|
||||
|
||||
|
|
@ -534,7 +534,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
llm_load_hparams(ml, model);
|
||||
llm_load_stats (ml, model);
|
||||
|
||||
struct quantize_state_internal qs(model, params);
|
||||
struct quantize_state_impl qs(model, params);
|
||||
|
||||
if (params->only_copy) {
|
||||
ftype = model.ftype;
|
||||
|
|
@ -837,7 +837,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
|
||||
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
|
||||
} else {
|
||||
llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
|
||||
f32_data = (float *) f32_conv_buf.data();
|
||||
}
|
||||
|
||||
|
|
@ -866,7 +866,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||
void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
|
||||
const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
|
||||
|
||||
new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
||||
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
|
||||
}
|
||||
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||
}
|
||||
|
|
@ -919,7 +919,7 @@ uint32_t llama_model_quantize(
|
|||
const char * fname_out,
|
||||
const llama_model_quantize_params * params) {
|
||||
try {
|
||||
llama_model_quantize_internal(fname_inp, fname_out, params);
|
||||
llama_model_quantize_impl(fname_inp, fname_out, params);
|
||||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
|
||||
return 1;
|
||||
|
|
|
|||
|
|
@ -257,7 +257,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
|
|||
for (int i = 0; i < (int)cur_p->size; ++i) {
|
||||
const float val = cur_p->data[i].logit;
|
||||
int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
|
||||
ib = std::max(0, std::min(nbuckets-1, ib));
|
||||
ib = std::max(0, std::min(nbuckets - 1, ib));
|
||||
bucket_idx[i] = ib;
|
||||
++histo[ib];
|
||||
}
|
||||
|
|
@ -280,13 +280,13 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
|
|||
for (int i = 0; i < (int)cur_p->size; ++i) {
|
||||
int j = bucket_idx[i];
|
||||
if (j >= ib) {
|
||||
*bucket_ptrs[nbuckets-1-j]++ = cur_p->data[i];
|
||||
*bucket_ptrs[nbuckets - 1 - j]++ = cur_p->data[i];
|
||||
}
|
||||
}
|
||||
|
||||
ptr = tmp_tokens.data();
|
||||
int ndone = 0;
|
||||
for (int j = nbuckets-1; j > ib; --j) {
|
||||
for (int j = nbuckets - 1; j > ib; --j) {
|
||||
std::sort(ptr, ptr + histo[j], comp);
|
||||
ptr += histo[j];
|
||||
ndone += histo[j];
|
||||
|
|
@ -1832,7 +1832,7 @@ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_dat
|
|||
ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
|
||||
if (n > 0) {
|
||||
lt = k;
|
||||
rt = k+n-1;
|
||||
rt = k + n - 1;
|
||||
}
|
||||
} else {
|
||||
// If k is inside the current Z-box, consider two cases.
|
||||
|
|
|
|||
|
|
@ -609,6 +609,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|||
"\\p{N}+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
|
||||
regex_exprs = {
|
||||
"\\p{N}{1,3}",
|
||||
"[一-龥-ゟ゠-ヿ]+",
|
||||
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
||||
regex_exprs = {
|
||||
"[\r\n]",
|
||||
|
|
@ -717,7 +724,7 @@ struct llm_tokenizer_bpe_session {
|
|||
|
||||
bool append_bos(std::vector<llama_vocab::id> & output) const {
|
||||
if (vocab.tokenizer_add_bos) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
return true;
|
||||
}
|
||||
|
|
@ -726,7 +733,7 @@ struct llm_tokenizer_bpe_session {
|
|||
|
||||
bool append_eos(std::vector<llama_vocab::id> & output) const {
|
||||
if (vocab.tokenizer_add_eos) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
return true;
|
||||
}
|
||||
|
|
@ -1623,7 +1630,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|||
if (source == 0) {
|
||||
buffer.erase_after(buffer.before_begin());
|
||||
} else {
|
||||
buffer.erase_after(std::next(buffer.begin(), (source-1)));
|
||||
buffer.erase_after(std::next(buffer.begin(), (source - 1)));
|
||||
}
|
||||
|
||||
// repeat for the right side
|
||||
|
|
@ -1637,7 +1644,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|||
if (source == 0) {
|
||||
buffer.erase_after(buffer.before_begin());
|
||||
} else {
|
||||
buffer.erase_after(std::next(buffer.begin(), (source-1)));
|
||||
buffer.erase_after(std::next(buffer.begin(), (source - 1)));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
@ -1675,7 +1682,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
bool is_prev_special = true; // prefix with space if first token
|
||||
|
||||
if (add_special && vocab.tokenizer_add_bos) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
is_prev_special = true;
|
||||
}
|
||||
|
|
@ -1710,7 +1717,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
// }
|
||||
|
||||
if (add_special && vocab.tokenizer_add_eos) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
}
|
||||
} break;
|
||||
|
|
@ -1769,7 +1776,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
case LLAMA_VOCAB_TYPE_WPM:
|
||||
{
|
||||
if (add_special) {
|
||||
GGML_ASSERT(vocab.special_cls_id != -1);
|
||||
GGML_ASSERT(vocab.special_cls_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_cls_id);
|
||||
}
|
||||
|
||||
|
|
@ -1789,14 +1796,14 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
}
|
||||
|
||||
if (add_special) {
|
||||
GGML_ASSERT(vocab.special_sep_id != -1);
|
||||
GGML_ASSERT(vocab.special_sep_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_sep_id);
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_UGM:
|
||||
{
|
||||
if (add_special && vocab.tokenizer_add_bos) {
|
||||
GGML_ASSERT(vocab.special_bos_id != -1);
|
||||
GGML_ASSERT(vocab.special_bos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_bos_id);
|
||||
}
|
||||
llm_tokenizer_ugm_session session(vocab);
|
||||
|
|
@ -1821,7 +1828,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(
|
|||
//}
|
||||
|
||||
if (add_special && vocab.tokenizer_add_eos) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
GGML_ASSERT(vocab.special_eos_id != LLAMA_TOKEN_NULL);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
}
|
||||
} break;
|
||||
|
|
@ -1890,7 +1897,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
|
|||
}
|
||||
|
||||
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
|
||||
return token != -1 && vocab.special_eog_ids.count(token) > 0;
|
||||
return token != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(token) > 0;
|
||||
}
|
||||
|
||||
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
|
||||
|
|
@ -2134,7 +2141,7 @@ int32_t llama_detokenize_impl(
|
|||
}
|
||||
|
||||
if (remove_special && vocab.tokenizer_add_eos) {
|
||||
if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) {
|
||||
if (n_tokens > 0 && tokens[n_tokens - 1] == vocab.special_eos_id) {
|
||||
n_tokens--;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
275
src/llama.cpp
275
src/llama.cpp
|
|
@ -23,12 +23,8 @@
|
|||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#include <cfloat>
|
||||
#include <cinttypes>
|
||||
#include <climits>
|
||||
#include <cmath>
|
||||
#include <cstdarg>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
|
|
@ -36,7 +32,6 @@
|
|||
#include <ctime>
|
||||
#include <functional>
|
||||
#include <initializer_list>
|
||||
#include <locale>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
|
@ -1655,6 +1650,32 @@ static bool llm_load_tensors(
|
|||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_COHERE2:
|
||||
{
|
||||
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
||||
|
||||
// output
|
||||
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
||||
// init output from the input tok embed
|
||||
model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
|
||||
llama_model_loader::TENSOR_DUPLICATED);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
|
||||
{
|
||||
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
|
@ -1934,6 +1955,7 @@ static bool llm_load_tensors(
|
|||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
} else {
|
||||
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
||||
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
|
||||
if (n_expert == 0) {
|
||||
throw std::runtime_error("n_expert must be > 0");
|
||||
|
|
@ -2617,21 +2639,21 @@ static struct ggml_tensor * llm_build_inp_embd(
|
|||
struct ggml_context * ctx,
|
||||
struct llama_context & lctx,
|
||||
const llama_hparams & hparams,
|
||||
const llama_ubatch & batch,
|
||||
const llama_ubatch & ubatch,
|
||||
struct ggml_tensor * tok_embd,
|
||||
const llm_build_cb & cb) {
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
if (batch.token) {
|
||||
lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
|
||||
if (ubatch.token) {
|
||||
lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ubatch.n_tokens);
|
||||
cb(lctx.inp_tokens, "inp_tokens", -1);
|
||||
ggml_set_input(lctx.inp_tokens);
|
||||
|
||||
inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
|
||||
} else {
|
||||
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
|
||||
lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
|
||||
inpL = lctx.inp_embd;
|
||||
ggml_set_input(lctx.inp_embd);
|
||||
}
|
||||
|
|
@ -2915,12 +2937,14 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|||
struct ggml_tensor * up_exps,
|
||||
struct ggml_tensor * gate_exps,
|
||||
struct ggml_tensor * down_exps,
|
||||
struct ggml_tensor * exp_probs_b,
|
||||
int64_t n_expert,
|
||||
int64_t n_expert_used,
|
||||
llm_ffn_op_type type_op,
|
||||
bool norm_w,
|
||||
bool scale_w,
|
||||
float w_scale,
|
||||
llama_expert_gating_func_type gating_op,
|
||||
const llm_build_cb & cb,
|
||||
int il) {
|
||||
int64_t n_embd = cur->ne[0];
|
||||
|
|
@ -2929,11 +2953,31 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|||
ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
|
||||
cb(logits, "ffn_moe_logits", il);
|
||||
|
||||
ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
|
||||
ggml_tensor * probs = nullptr;
|
||||
switch (gating_op) {
|
||||
case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
|
||||
{
|
||||
probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
|
||||
} break;
|
||||
case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
|
||||
{
|
||||
probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
cb(probs, "ffn_moe_probs", il);
|
||||
|
||||
// add experts selection bias - introduced in DeepSeek V3
|
||||
// leave probs unbiased as it's later used to get expert weights
|
||||
ggml_tensor * selection_probs = probs;
|
||||
if (exp_probs_b != nullptr) {
|
||||
selection_probs = ggml_add(ctx, probs, exp_probs_b);
|
||||
cb(selection_probs, "ffn_moe_probs_biased", il);
|
||||
}
|
||||
|
||||
// select experts
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
|
||||
ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
|
||||
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
||||
cb(selected_experts, "ffn_moe_topk", il);
|
||||
|
||||
|
|
@ -3218,7 +3262,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
|
|||
static struct ggml_tensor * llm_build_mamba(
|
||||
struct ggml_context * ctx,
|
||||
struct llama_context & lctx,
|
||||
const llama_ubatch & batch,
|
||||
const llama_ubatch & ubatch,
|
||||
struct ggml_cgraph * graph,
|
||||
struct ggml_tensor * cur,
|
||||
struct ggml_tensor * state_copy,
|
||||
|
|
@ -3234,17 +3278,17 @@ static struct ggml_tensor * llm_build_mamba(
|
|||
const int64_t d_inner = hparams.ssm_d_inner;
|
||||
const int64_t d_state = hparams.ssm_d_state;
|
||||
const int64_t dt_rank = hparams.ssm_dt_rank;
|
||||
const int64_t n_seqs = batch.n_seqs;
|
||||
const int64_t n_seqs = ubatch.n_seqs;
|
||||
// Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
|
||||
const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
|
||||
// Use the same RMS norm as the final layer norm
|
||||
const float norm_rms_eps = hparams.f_norm_rms_eps;
|
||||
|
||||
const int64_t n_seq_tokens = batch.n_seq_tokens;
|
||||
const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
||||
|
||||
GGML_ASSERT(n_seqs != 0);
|
||||
GGML_ASSERT(batch.equal_seqs);
|
||||
GGML_ASSERT(batch.n_tokens == n_seq_tokens * n_seqs);
|
||||
GGML_ASSERT(ubatch.equal_seqs);
|
||||
GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
|
||||
|
||||
struct ggml_tensor * conv_states_all = kv.k_l[il];
|
||||
struct ggml_tensor * ssm_states_all = kv.v_l[il];
|
||||
|
|
@ -4072,9 +4116,11 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
cb, il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
}
|
||||
|
|
@ -4724,9 +4770,11 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_GELU, true,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
cb, il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
||||
|
|
@ -4865,9 +4913,11 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
cb, il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
||||
|
|
@ -6113,9 +6163,11 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
cb, il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
||||
|
|
@ -7755,6 +7807,137 @@ struct llm_build_context {
|
|||
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_cohere2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const float f_logit_scale = hparams.f_logit_scale;
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
// cohere2 requires different mask for layers using sliding window (SWA)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
|
||||
|
||||
// sliding window switch pattern
|
||||
const int32_t sliding_window_pattern = 4;
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
// three layers sliding window attention (window size 4096) and ROPE
|
||||
// fourth layer uses global attention without positional embeddings
|
||||
const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
|
||||
struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
struct ggml_tensor * ffn_inp = cur;
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for 128k context
|
||||
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
if (is_sliding) {
|
||||
Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
|
||||
beta_fast, beta_slow);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
|
||||
attn_factor, beta_fast, beta_slow);
|
||||
cb(Kcur, "Kcur", il);
|
||||
} else {
|
||||
// For non-sliding layers, just reshape without applying RoPE
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
|
||||
KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
||||
}
|
||||
|
||||
struct ggml_tensor * attn_out = cur;
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
|
||||
NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
|
||||
cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
// add together residual + FFN + self-attention
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cur = ggml_add(ctx0, cur, attn_out);
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
|
||||
if (f_logit_scale) {
|
||||
cur = ggml_scale(ctx0, cur, f_logit_scale);
|
||||
}
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
// ref: https://allenai.org/olmo
|
||||
// based on the original build_llama() function, changes:
|
||||
// * non-parametric layer norm
|
||||
|
|
@ -8107,9 +8290,11 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
cb, il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
||||
|
|
@ -8504,9 +8689,11 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
false, 0.0,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
cb, il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
||||
|
|
@ -8645,9 +8832,11 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
false, hparams.expert_weights_scale,
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
||||
cb, il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
|
|
@ -8874,9 +9063,11 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
model.layers[il].ffn_exp_probs_b,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, false,
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
true, hparams.expert_weights_scale,
|
||||
(enum llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
cb, il);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
|
|
@ -10506,6 +10697,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
{
|
||||
result = llm.build_command_r();
|
||||
} break;
|
||||
case LLM_ARCH_COHERE2:
|
||||
{
|
||||
result = llm.build_cohere2();
|
||||
} break;
|
||||
case LLM_ARCH_DBRX:
|
||||
{
|
||||
result = llm.build_dbrx();
|
||||
|
|
@ -10639,7 +10834,7 @@ static enum ggml_status llama_graph_compute(
|
|||
// return positive int on warning
|
||||
// return negative int on error
|
||||
//
|
||||
static int llama_decode_internal(
|
||||
static int llama_decode_impl(
|
||||
llama_context & lctx,
|
||||
llama_batch inp_batch) {
|
||||
|
||||
|
|
@ -10974,7 +11169,7 @@ static int llama_decode_internal(
|
|||
// return positive int on warning
|
||||
// return negative int on error
|
||||
//
|
||||
static int llama_encode_internal(
|
||||
static int llama_encode_impl(
|
||||
llama_context & lctx,
|
||||
llama_batch inp_batch) {
|
||||
|
||||
|
|
@ -11156,7 +11351,7 @@ static int llama_encode_internal(
|
|||
}
|
||||
|
||||
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
||||
static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
||||
static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
|
||||
auto & kv_self = lctx.kv_self;
|
||||
|
||||
const auto & hparams = lctx.model.hparams;
|
||||
|
|
@ -11376,7 +11571,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|||
//LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
|
||||
}
|
||||
|
||||
static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
||||
static void llama_kv_cache_update_impl(struct llama_context & lctx) {
|
||||
bool need_reserve = false;
|
||||
|
||||
if (lctx.kv_self.has_shift) {
|
||||
|
|
@ -11412,7 +11607,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
|||
|
||||
// defragment the KV cache if needed
|
||||
if (lctx.kv_self.do_defrag) {
|
||||
llama_kv_cache_defrag_internal(lctx);
|
||||
llama_kv_cache_defrag_impl(lctx);
|
||||
|
||||
need_reserve = true;
|
||||
|
||||
|
|
@ -11441,13 +11636,7 @@ int32_t llama_lora_adapter_set(
|
|||
struct llama_context * ctx,
|
||||
struct llama_lora_adapter * adapter,
|
||||
float scale) {
|
||||
if (ctx->cparams.flash_attn) {
|
||||
LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
ctx->lora_adapters[adapter] = scale;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -11582,6 +11771,12 @@ int64_t llama_time_us(void) {
|
|||
struct llama_model * llama_load_model_from_file(
|
||||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
return llama_model_load_from_file(path_model, params);
|
||||
}
|
||||
|
||||
struct llama_model * llama_model_load_from_file(
|
||||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
ggml_time_init();
|
||||
|
||||
llama_model * model = new llama_model;
|
||||
|
|
@ -11620,7 +11815,7 @@ struct llama_model * llama_load_model_from_file(
|
|||
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
|
||||
if (!rpc_reg) {
|
||||
LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
|
@ -11628,7 +11823,7 @@ struct llama_model * llama_load_model_from_file(
|
|||
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
|
||||
if (!ggml_backend_rpc_add_device_fn) {
|
||||
LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
|
@ -11638,7 +11833,7 @@ struct llama_model * llama_load_model_from_file(
|
|||
model->devices.push_back(dev);
|
||||
} else {
|
||||
LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
|
@ -11670,7 +11865,7 @@ struct llama_model * llama_load_model_from_file(
|
|||
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
|
||||
if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
|
||||
LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
return nullptr;
|
||||
}
|
||||
ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
|
||||
|
|
@ -11693,7 +11888,7 @@ struct llama_model * llama_load_model_from_file(
|
|||
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
|
||||
}
|
||||
|
||||
llama_free_model(model);
|
||||
llama_model_free(model);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
|
@ -12117,7 +12312,7 @@ void llama_kv_cache_defrag(struct llama_context * ctx) {
|
|||
}
|
||||
|
||||
void llama_kv_cache_update(struct llama_context * ctx) {
|
||||
llama_kv_cache_update_internal(*ctx);
|
||||
llama_kv_cache_update_impl(*ctx);
|
||||
}
|
||||
|
||||
bool llama_kv_cache_can_shift(struct llama_context * ctx) {
|
||||
|
|
@ -12129,7 +12324,7 @@ bool llama_kv_cache_can_shift(struct llama_context * ctx) {
|
|||
int32_t llama_encode(
|
||||
struct llama_context * ctx,
|
||||
struct llama_batch batch) {
|
||||
const int ret = llama_encode_internal(*ctx, batch);
|
||||
const int ret = llama_encode_impl(*ctx, batch);
|
||||
if (ret != 0) {
|
||||
LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
|
||||
}
|
||||
|
|
@ -12140,7 +12335,7 @@ int32_t llama_encode(
|
|||
int32_t llama_decode(
|
||||
struct llama_context * ctx,
|
||||
struct llama_batch batch) {
|
||||
const int ret = llama_decode_internal(*ctx, batch);
|
||||
const int ret = llama_decode_impl(*ctx, batch);
|
||||
if (ret != 0) {
|
||||
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
||||
}
|
||||
|
|
@ -12360,16 +12555,16 @@ int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix,
|
|||
return 0;
|
||||
}
|
||||
|
||||
int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
|
||||
int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count) {
|
||||
std::string str_split_path(split_path);
|
||||
char postfix[32];
|
||||
snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
|
||||
std::string str_postfix(postfix);
|
||||
|
||||
// check if dest ends with postfix
|
||||
// check if split_prefix ends with postfix
|
||||
int size_prefix = str_split_path.size() - str_postfix.size();
|
||||
if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
|
||||
snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
|
||||
snprintf(split_prefix, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
|
||||
return size_prefix;
|
||||
}
|
||||
|
||||
|
|
@ -12378,6 +12573,8 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
|
|||
|
||||
const char * llama_print_system_info(void) {
|
||||
static std::string s;
|
||||
s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls.
|
||||
|
||||
|
||||
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||
auto * reg = ggml_backend_reg_get(i);
|
||||
|
|
|
|||
|
|
@ -675,18 +675,24 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
|
|||
{ "\\p{N}", unicode_cpt_flags::NUMBER },
|
||||
{ "\\p{L}", unicode_cpt_flags::LETTER },
|
||||
{ "\\p{P}", unicode_cpt_flags::PUNCTUATION },
|
||||
{ "\\p{M}", unicode_cpt_flags::ACCENT_MARK },
|
||||
{ "\\p{S}", unicode_cpt_flags::SYMBOL },
|
||||
};
|
||||
|
||||
static const std::map<int, int> k_ucat_cpt = {
|
||||
{ unicode_cpt_flags::NUMBER, 0xD1 },
|
||||
{ unicode_cpt_flags::LETTER, 0xD2 },
|
||||
{ unicode_cpt_flags::PUNCTUATION, 0xD3 },
|
||||
{ unicode_cpt_flags::ACCENT_MARK, 0xD4 },
|
||||
{ unicode_cpt_flags::SYMBOL, 0xD5 },
|
||||
};
|
||||
|
||||
static const std::map<int, std::string> k_ucat_map = {
|
||||
{ unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9
|
||||
{ unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
|
||||
{ unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
|
||||
{ unicode_cpt_flags::ACCENT_MARK, "" }, // no sub-128 codepoints
|
||||
{ unicode_cpt_flags::SYMBOL, "\\\x24\\\x2B\x3C-\x3E\x5E\x60\\\x7C" }, // $+<=>^`|
|
||||
};
|
||||
|
||||
// compute collapsed codepoints only if needed by at least one regex
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue