mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 09:34:37 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/nix/apps.nix # .devops/tools.sh # Makefile # README.md # docs/backend/SYCL.md # docs/build.md # examples/CMakeLists.txt # ggml/include/ggml.h # src/llama-vocab.cpp # tests/test-backend-ops.cpp # tests/test-chat-template.cpp # tests/test-sampling.cpp
This commit is contained in:
commit
ba5babb876
61 changed files with 1389 additions and 5170 deletions
205
src/llama.cpp
205
src/llama.cpp
|
@ -106,7 +106,6 @@
|
|||
#endif
|
||||
|
||||
// bump if necessary
|
||||
#define LLAMA_MAX_NODES 8192
|
||||
#define LLAMA_MAX_LAYERS 512
|
||||
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
|
||||
|
||||
|
@ -2257,8 +2256,7 @@ struct llama_hparams {
|
|||
return n_head_arr[il];
|
||||
}
|
||||
|
||||
GGML_ASSERT(false);
|
||||
return 0;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t n_head_kv(uint32_t il = 0) const {
|
||||
|
@ -2266,8 +2264,7 @@ struct llama_hparams {
|
|||
return n_head_kv_arr[il];
|
||||
}
|
||||
|
||||
GGML_ASSERT(false);
|
||||
return 0;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t n_ff(uint32_t il = 0) const {
|
||||
|
@ -2275,8 +2272,7 @@ struct llama_hparams {
|
|||
return n_ff_arr[il];
|
||||
}
|
||||
|
||||
GGML_ASSERT(false);
|
||||
return 0;
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
uint32_t n_gqa(uint32_t il = 0) const {
|
||||
|
@ -2453,6 +2449,7 @@ struct llama_layer {
|
|||
// long rope factors
|
||||
struct ggml_tensor * rope_long = nullptr;
|
||||
struct ggml_tensor * rope_short = nullptr;
|
||||
struct ggml_tensor * rope_freqs = nullptr;
|
||||
|
||||
// bitnet scale
|
||||
struct ggml_tensor * wq_scale;
|
||||
|
@ -2655,7 +2652,6 @@ struct llama_context {
|
|||
llama_context(const llama_model & model)
|
||||
: model(model)
|
||||
, sampling(llama_n_vocab(&model))
|
||||
, grammar()
|
||||
, t_start_us(model.t_start_us)
|
||||
, t_load_us(model.t_load_us) {}
|
||||
|
||||
|
@ -2673,7 +2669,6 @@ struct llama_context {
|
|||
|
||||
struct llama_cparams cparams;
|
||||
struct llama_sampling sampling;
|
||||
struct llama_grammar grammar;
|
||||
struct llama_kv_cache kv_self;
|
||||
struct llama_control_vector cvec;
|
||||
|
||||
|
@ -2907,7 +2902,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|||
#elif defined(GGML_USE_CANN)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_cann_get_device_memory(device, &total, &free);
|
||||
ggml_backend_cann_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#else
|
||||
return 1;
|
||||
|
@ -3576,6 +3571,15 @@ namespace GGUFMeta {
|
|||
|
||||
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
|
||||
|
||||
// TODO: update when needed or think of some clever automatic way to do this
|
||||
static size_t llama_model_max_nodes(const llama_model & /*model*/) {
|
||||
//if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
|
||||
// return 32768;
|
||||
//}
|
||||
|
||||
return 8192;
|
||||
}
|
||||
|
||||
struct llama_model_loader {
|
||||
int n_kv = 0;
|
||||
int n_tensors = 0;
|
||||
|
@ -4907,6 +4911,7 @@ static void llm_load_hparams(
|
|||
} break;
|
||||
case LLM_ARCH_PHI3:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
|
@ -6100,6 +6105,8 @@ static bool llm_load_tensors(
|
|||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
|
||||
if (n_expert == 0) {
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
|
@ -8118,7 +8125,7 @@ static struct ggml_tensor * llm_build_moe_ffn(
|
|||
cb(gate, "ffn_moe_gelu", il);
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
|
||||
|
@ -8445,7 +8452,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_k_shift() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
GGML_ASSERT(kv_self.size == n_ctx);
|
||||
|
||||
|
@ -8476,7 +8483,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_s_copy() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
GGML_ASSERT(kv_self.recurrent);
|
||||
|
||||
|
@ -8499,7 +8506,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
for (uint32_t i = 0; i < ids.size(); ++i) {
|
||||
const uint32_t id = ids[i];
|
||||
|
@ -8577,6 +8584,10 @@ struct llm_build_context {
|
|||
// choose long/short freq factors based on the context size
|
||||
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
|
||||
|
||||
if (model.layers[il].rope_freqs != nullptr) {
|
||||
return model.layers[il].rope_freqs;
|
||||
}
|
||||
|
||||
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
|
||||
return model.layers[il].rope_long;
|
||||
}
|
||||
|
@ -8681,8 +8692,8 @@ struct llm_build_context {
|
|||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false && "unknown pooling type");
|
||||
} break;
|
||||
GGML_ABORT("unknown pooling type");
|
||||
}
|
||||
}
|
||||
|
||||
cb(cur, "result_embd_pooled", -1);
|
||||
|
@ -8740,7 +8751,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_llama() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
@ -8771,6 +8782,9 @@ struct llm_build_context {
|
|||
|
||||
// self-attention
|
||||
{
|
||||
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
||||
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
||||
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
@ -8794,14 +8808,14 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
|
@ -8883,7 +8897,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_baichuan() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -8937,7 +8951,7 @@ struct llm_build_context {
|
|||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
@ -8998,7 +9012,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_xverse() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -9101,7 +9115,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_falcon() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -9221,7 +9235,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_grok() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
@ -9378,7 +9392,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_dbrx() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
@ -9504,7 +9518,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_starcoder() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -9608,7 +9622,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_refact() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -9702,7 +9716,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_bert() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -9896,7 +9910,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_bloom() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -9997,7 +10011,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_mpt() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -10287,7 +10301,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_qwen() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -10399,7 +10413,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_qwen2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -10511,7 +10525,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_qwen2moe() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
@ -10657,7 +10671,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_phi2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -10778,7 +10792,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_phi3() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -10793,7 +10807,7 @@ struct llm_build_context {
|
|||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto residual = inpL;
|
||||
|
@ -10851,7 +10865,7 @@ struct llm_build_context {
|
|||
|
||||
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||
Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
|
@ -11010,7 +11024,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_gpt2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -11115,7 +11129,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_codeshell() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -11226,7 +11240,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_orion() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -11344,7 +11358,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_internlm2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -11465,7 +11479,7 @@ struct llm_build_context {
|
|||
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
|
||||
// based on the original build_llama() function
|
||||
struct ggml_cgraph * build_minicpm() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -11609,7 +11623,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_gemma() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
|
||||
|
@ -11717,7 +11731,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_gemma2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
|
||||
|
@ -11769,7 +11783,7 @@ struct llm_build_context {
|
|||
switch (model.type) {
|
||||
case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
|
||||
case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
|
||||
default: GGML_ASSERT(false);
|
||||
default: GGML_ABORT("fatal error");
|
||||
};
|
||||
cb(Qcur, "Qcur_scaled", il);
|
||||
|
||||
|
@ -11852,7 +11866,7 @@ struct llm_build_context {
|
|||
|
||||
|
||||
struct ggml_cgraph * build_starcoder2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -11971,7 +11985,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_mamba() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t d_model = n_embd;
|
||||
const int64_t d_conv = hparams.ssm_d_conv;
|
||||
|
@ -12120,7 +12134,7 @@ struct llm_build_context {
|
|||
|
||||
struct ggml_cgraph * build_command_r() {
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -12274,7 +12288,7 @@ struct llm_build_context {
|
|||
// * removed bias
|
||||
// * removed MoE
|
||||
struct ggml_cgraph * build_olmo() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
@ -12398,7 +12412,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_openelm() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -12523,7 +12537,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_gptneox() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -12665,7 +12679,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_arctic() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
@ -12797,7 +12811,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_deepseek2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
@ -13025,7 +13039,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_bitnet() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
@ -13165,7 +13179,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_t5() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
@ -13482,7 +13496,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_jais() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -13574,7 +13588,7 @@ struct llm_build_context {
|
|||
}
|
||||
|
||||
struct ggml_cgraph * build_chatglm() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
|
@ -13934,7 +13948,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
result = llm.build_jais();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
// add on pooling layer
|
||||
|
@ -14058,18 +14072,23 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|||
"causal attention is not supported by this model"
|
||||
);
|
||||
|
||||
if (lctx.inp_KQ_mask) {
|
||||
if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
|
||||
// NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
|
||||
if (cparams.causal_attn && !lctx.is_encoding) {
|
||||
const int64_t n_kv = kv_self.n;
|
||||
const int64_t n_tokens = batch.n_tokens;
|
||||
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
||||
|
||||
float * data = (float *) lctx.inp_KQ_mask->data;
|
||||
float * data = nullptr;
|
||||
float * data_swa = nullptr;
|
||||
|
||||
if (lctx.inp_KQ_mask) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
|
||||
data = (float *) lctx.inp_KQ_mask->data;
|
||||
}
|
||||
|
||||
if (lctx.inp_KQ_mask_swa) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
|
||||
data_swa = (float *) lctx.inp_KQ_mask_swa->data;
|
||||
}
|
||||
|
||||
|
@ -14087,12 +14106,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|||
f = -INFINITY;
|
||||
} else {
|
||||
if (hparams.use_alibi) {
|
||||
f = -fabs(lctx.kv_self.cells[i].pos - pos);
|
||||
f = -std::abs(lctx.kv_self.cells[i].pos - pos);
|
||||
} else {
|
||||
f = 0.0f;
|
||||
}
|
||||
}
|
||||
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
||||
|
||||
if (data) {
|
||||
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
||||
}
|
||||
|
||||
// may need to cut off old tokens for sliding window
|
||||
if (data_swa) {
|
||||
|
@ -14104,9 +14126,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|||
}
|
||||
}
|
||||
|
||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
||||
for (int j = 0; j < n_kv; ++j) {
|
||||
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
||||
if (data) {
|
||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
||||
for (int j = 0; j < n_kv; ++j) {
|
||||
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (data_swa) {
|
||||
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
||||
for (int j = 0; j < n_kv; ++j) {
|
||||
data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -14128,7 +14160,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|||
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
||||
if (batch.seq_id[i][s] == seq_id) {
|
||||
if (hparams.use_alibi) {
|
||||
f = -fabs(batch.pos[i] - batch.pos[j]);
|
||||
f = -std::abs(batch.pos[i] - batch.pos[j]);
|
||||
} else {
|
||||
f = 0.0f;
|
||||
}
|
||||
|
@ -14715,8 +14747,8 @@ static int llama_decode_internal(
|
|||
} break;
|
||||
case LLAMA_POOLING_TYPE_UNSPECIFIED:
|
||||
{
|
||||
GGML_ASSERT(false && "unknown pooling type");
|
||||
} break;
|
||||
GGML_ABORT("unknown pooling type");
|
||||
}
|
||||
}
|
||||
}
|
||||
n_outputs_prev += lctx.n_outputs;
|
||||
|
@ -14901,9 +14933,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|||
// each move requires 6*n_layer tensors (see build_defrag)
|
||||
// - source view, destination view, copy operation
|
||||
// - x2 for keys and values
|
||||
//const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
|
||||
//const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
|
||||
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
|
||||
const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
|
||||
const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
|
||||
|
||||
// determine which KV cells to move where
|
||||
//
|
||||
|
@ -15107,7 +15139,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
|||
// apply K-shift if needed
|
||||
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
||||
if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
|
||||
GGML_ASSERT(false && "Deepseek2 does not support K-shift");
|
||||
GGML_ABORT("Deepseek2 does not support K-shift");
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -15246,7 +15278,7 @@ static void llama_tensor_dequantize_internal(
|
|||
} else if (ggml_is_quantized(tensor->type)) {
|
||||
qtype.to_float(tensor->data, f32_output, nelements);
|
||||
} else {
|
||||
GGML_ASSERT(false); // unreachable
|
||||
GGML_ABORT("fatal error"); // unreachable
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -16655,9 +16687,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
||||
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
||||
if (backend == nullptr) {
|
||||
int id_list[GGML_SYCL_MAX_DEVICES];
|
||||
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -16781,8 +16811,10 @@ struct llama_context * llama_new_context_with_model(
|
|||
}
|
||||
}
|
||||
|
||||
const size_t max_nodes = llama_model_max_nodes(*model);
|
||||
|
||||
// buffer used to store the computation graph and the tensor meta data
|
||||
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
||||
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
|
||||
|
||||
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
||||
bool pipeline_parallel =
|
||||
|
@ -16795,7 +16827,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
// currently this is only implemented in the CUDA backend
|
||||
pipeline_parallel = false;
|
||||
#endif
|
||||
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
|
||||
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
|
||||
|
||||
if (pipeline_parallel) {
|
||||
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
|
||||
|
@ -16847,10 +16879,6 @@ const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
|
|||
return &ctx->model.vocab;
|
||||
}
|
||||
|
||||
struct llama_grammar * llama_get_grammar(struct llama_context * ctx) {
|
||||
return &ctx->grammar;
|
||||
}
|
||||
|
||||
uint32_t llama_n_ctx(const struct llama_context * ctx) {
|
||||
return ctx->cparams.n_ctx;
|
||||
}
|
||||
|
@ -16924,8 +16952,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|||
|
||||
// all model arches should be listed explicitly here
|
||||
case LLM_ARCH_UNKNOWN:
|
||||
GGML_ASSERT(false && "unknown architecture");
|
||||
break;
|
||||
GGML_ABORT("unknown architecture");
|
||||
}
|
||||
|
||||
return LLAMA_ROPE_TYPE_NONE;
|
||||
|
@ -18500,7 +18527,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
|||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -18545,7 +18572,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
|
|||
} catch (const std::exception & err) {
|
||||
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
|
||||
#ifndef NDEBUG
|
||||
GGML_ASSERT(false);
|
||||
GGML_ABORT("fatal error");
|
||||
#endif
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -19169,11 +19196,7 @@ const char * llama_print_system_info(void) {
|
|||
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
||||
#ifdef GGML_USE_LLAMAFILE
|
||||
s += "LLAMAFILE = 1 | ";
|
||||
#else
|
||||
s += "LLAMAFILE = 0 | ";
|
||||
#endif
|
||||
s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue