Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/nix/apps.nix
#	.devops/tools.sh
#	Makefile
#	README.md
#	docs/backend/SYCL.md
#	docs/build.md
#	examples/CMakeLists.txt
#	ggml/include/ggml.h
#	src/llama-vocab.cpp
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
#	tests/test-sampling.cpp
This commit is contained in:
Concedo 2024-07-27 23:15:54 +08:00
commit ba5babb876
61 changed files with 1389 additions and 5170 deletions

View file

@ -106,7 +106,6 @@
#endif
// bump if necessary
#define LLAMA_MAX_NODES 8192
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
@ -2257,8 +2256,7 @@ struct llama_hparams {
return n_head_arr[il];
}
GGML_ASSERT(false);
return 0;
GGML_ABORT("fatal error");
}
uint32_t n_head_kv(uint32_t il = 0) const {
@ -2266,8 +2264,7 @@ struct llama_hparams {
return n_head_kv_arr[il];
}
GGML_ASSERT(false);
return 0;
GGML_ABORT("fatal error");
}
uint32_t n_ff(uint32_t il = 0) const {
@ -2275,8 +2272,7 @@ struct llama_hparams {
return n_ff_arr[il];
}
GGML_ASSERT(false);
return 0;
GGML_ABORT("fatal error");
}
uint32_t n_gqa(uint32_t il = 0) const {
@ -2453,6 +2449,7 @@ struct llama_layer {
// long rope factors
struct ggml_tensor * rope_long = nullptr;
struct ggml_tensor * rope_short = nullptr;
struct ggml_tensor * rope_freqs = nullptr;
// bitnet scale
struct ggml_tensor * wq_scale;
@ -2655,7 +2652,6 @@ struct llama_context {
llama_context(const llama_model & model)
: model(model)
, sampling(llama_n_vocab(&model))
, grammar()
, t_start_us(model.t_start_us)
, t_load_us(model.t_load_us) {}
@ -2673,7 +2669,6 @@ struct llama_context {
struct llama_cparams cparams;
struct llama_sampling sampling;
struct llama_grammar grammar;
struct llama_kv_cache kv_self;
struct llama_control_vector cvec;
@ -2907,7 +2902,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
#elif defined(GGML_USE_CANN)
size_t total;
size_t free;
ggml_backend_cann_get_device_memory(device, &total, &free);
ggml_backend_cann_get_device_memory(device, &free, &total);
return free;
#else
return 1;
@ -3576,6 +3571,15 @@ namespace GGUFMeta {
using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
// TODO: update when needed or think of some clever automatic way to do this
static size_t llama_model_max_nodes(const llama_model & /*model*/) {
//if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
// return 32768;
//}
return 8192;
}
struct llama_model_loader {
int n_kv = 0;
int n_tensors = 0;
@ -4907,6 +4911,7 @@ static void llm_load_hparams(
} break;
case LLM_ARCH_PHI3:
{
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
@ -6100,6 +6105,8 @@ static bool llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
if (n_expert == 0) {
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@ -8118,7 +8125,7 @@ static struct ggml_tensor * llm_build_moe_ffn(
cb(gate, "ffn_moe_gelu", il);
} break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
@ -8445,7 +8452,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_k_shift() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
GGML_ASSERT(kv_self.size == n_ctx);
@ -8476,7 +8483,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_s_copy() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
GGML_ASSERT(kv_self.recurrent);
@ -8499,7 +8506,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
for (uint32_t i = 0; i < ids.size(); ++i) {
const uint32_t id = ids[i];
@ -8577,6 +8584,10 @@ struct llm_build_context {
// choose long/short freq factors based on the context size
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
if (model.layers[il].rope_freqs != nullptr) {
return model.layers[il].rope_freqs;
}
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
return model.layers[il].rope_long;
}
@ -8681,8 +8692,8 @@ struct llm_build_context {
} break;
default:
{
GGML_ASSERT(false && "unknown pooling type");
} break;
GGML_ABORT("unknown pooling type");
}
}
cb(cur, "result_embd_pooled", -1);
@ -8740,7 +8751,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_llama() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -8771,6 +8782,9 @@ struct llm_build_context {
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
struct ggml_tensor * rope_factors = build_rope_factors(il);
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
@ -8794,14 +8808,14 @@ struct llm_build_context {
}
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
@ -8883,7 +8897,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -8937,7 +8951,7 @@ struct llm_build_context {
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
@ -8998,7 +9012,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_xverse() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -9101,7 +9115,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_falcon() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -9221,7 +9235,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_grok() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -9378,7 +9392,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_dbrx() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -9504,7 +9518,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_starcoder() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -9608,7 +9622,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_refact() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -9702,7 +9716,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bert() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -9896,7 +9910,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bloom() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -9997,7 +10011,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_mpt() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -10287,7 +10301,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -10399,7 +10413,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -10511,7 +10525,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_qwen2moe() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -10657,7 +10671,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_phi2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -10778,7 +10792,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_phi3() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -10793,7 +10807,7 @@ struct llm_build_context {
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
for (int il = 0; il < n_layer; ++il) {
auto residual = inpL;
@ -10851,7 +10865,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}
if (il == n_layer - 1) {
@ -11010,7 +11024,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gpt2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -11115,7 +11129,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_codeshell() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -11226,7 +11240,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_orion() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -11344,7 +11358,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_internlm2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -11465,7 +11479,7 @@ struct llm_build_context {
// https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
// based on the original build_llama() function
struct ggml_cgraph * build_minicpm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -11609,7 +11623,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gemma() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
@ -11717,7 +11731,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gemma2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head_k = hparams.n_embd_head_k;
@ -11769,7 +11783,7 @@ struct llm_build_context {
switch (model.type) {
case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
default: GGML_ASSERT(false);
default: GGML_ABORT("fatal error");
};
cb(Qcur, "Qcur_scaled", il);
@ -11852,7 +11866,7 @@ struct llm_build_context {
struct ggml_cgraph * build_starcoder2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -11971,7 +11985,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_mamba() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t d_model = n_embd;
const int64_t d_conv = hparams.ssm_d_conv;
@ -12120,7 +12134,7 @@ struct llm_build_context {
struct ggml_cgraph * build_command_r() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -12274,7 +12288,7 @@ struct llm_build_context {
// * removed bias
// * removed MoE
struct ggml_cgraph * build_olmo() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -12398,7 +12412,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_openelm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -12523,7 +12537,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_gptneox() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -12665,7 +12679,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_arctic() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -12797,7 +12811,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_deepseek2() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -13025,7 +13039,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_bitnet() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@ -13165,7 +13179,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_t5() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;
@ -13482,7 +13496,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_jais() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -13574,7 +13588,7 @@ struct llm_build_context {
}
struct ggml_cgraph * build_chatglm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
@ -13934,7 +13948,7 @@ static struct ggml_cgraph * llama_build_graph(
result = llm.build_jais();
} break;
default:
GGML_ASSERT(false);
GGML_ABORT("fatal error");
}
// add on pooling layer
@ -14058,18 +14072,23 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
"causal attention is not supported by this model"
);
if (lctx.inp_KQ_mask) {
if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
// NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
if (cparams.causal_attn && !lctx.is_encoding) {
const int64_t n_kv = kv_self.n;
const int64_t n_tokens = batch.n_tokens;
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
float * data = (float *) lctx.inp_KQ_mask->data;
float * data = nullptr;
float * data_swa = nullptr;
if (lctx.inp_KQ_mask) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
data = (float *) lctx.inp_KQ_mask->data;
}
if (lctx.inp_KQ_mask_swa) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
data_swa = (float *) lctx.inp_KQ_mask_swa->data;
}
@ -14087,12 +14106,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
f = -INFINITY;
} else {
if (hparams.use_alibi) {
f = -fabs(lctx.kv_self.cells[i].pos - pos);
f = -std::abs(lctx.kv_self.cells[i].pos - pos);
} else {
f = 0.0f;
}
}
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
if (data) {
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
}
// may need to cut off old tokens for sliding window
if (data_swa) {
@ -14104,9 +14126,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
}
}
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
for (int j = 0; j < n_kv; ++j) {
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
if (data) {
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
for (int j = 0; j < n_kv; ++j) {
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
}
}
}
if (data_swa) {
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
for (int j = 0; j < n_kv; ++j) {
data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
}
}
}
}
@ -14128,7 +14160,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id[i][s] == seq_id) {
if (hparams.use_alibi) {
f = -fabs(batch.pos[i] - batch.pos[j]);
f = -std::abs(batch.pos[i] - batch.pos[j]);
} else {
f = 0.0f;
}
@ -14715,8 +14747,8 @@ static int llama_decode_internal(
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ASSERT(false && "unknown pooling type");
} break;
GGML_ABORT("unknown pooling type");
}
}
}
n_outputs_prev += lctx.n_outputs;
@ -14901,9 +14933,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
// each move requires 6*n_layer tensors (see build_defrag)
// - source view, destination view, copy operation
// - x2 for keys and values
//const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer);
//const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
// TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer);
const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
// determine which KV cells to move where
//
@ -15107,7 +15139,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
// apply K-shift if needed
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
GGML_ASSERT(false && "Deepseek2 does not support K-shift");
GGML_ABORT("Deepseek2 does not support K-shift");
}
{
@ -15246,7 +15278,7 @@ static void llama_tensor_dequantize_internal(
} else if (ggml_is_quantized(tensor->type)) {
qtype.to_float(tensor->data, f32_output, nelements);
} else {
GGML_ASSERT(false); // unreachable
GGML_ABORT("fatal error"); // unreachable
}
return;
}
@ -16655,9 +16687,7 @@ struct llama_context * llama_new_context_with_model(
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
ggml_backend_t backend = ggml_backend_sycl_init(i);
if (backend == nullptr) {
int id_list[GGML_SYCL_MAX_DEVICES];
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
llama_free(ctx);
return nullptr;
}
@ -16781,8 +16811,10 @@ struct llama_context * llama_new_context_with_model(
}
}
const size_t max_nodes = llama_model_max_nodes(*model);
// buffer used to store the computation graph and the tensor meta data
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
@ -16795,7 +16827,7 @@ struct llama_context * llama_new_context_with_model(
// currently this is only implemented in the CUDA backend
pipeline_parallel = false;
#endif
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel);
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
if (pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
@ -16847,10 +16879,6 @@ const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
return &ctx->model.vocab;
}
struct llama_grammar * llama_get_grammar(struct llama_context * ctx) {
return &ctx->grammar;
}
uint32_t llama_n_ctx(const struct llama_context * ctx) {
return ctx->cparams.n_ctx;
}
@ -16924,8 +16952,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ASSERT(false && "unknown architecture");
break;
GGML_ABORT("unknown architecture");
}
return LLAMA_ROPE_TYPE_NONE;
@ -18500,7 +18527,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
GGML_ASSERT(false);
GGML_ABORT("fatal error");
#endif
return nullptr;
}
@ -18545,7 +18572,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
GGML_ASSERT(false);
GGML_ABORT("fatal error");
#endif
return nullptr;
}
@ -19169,11 +19196,7 @@ const char * llama_print_system_info(void) {
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
#ifdef GGML_USE_LLAMAFILE
s += "LLAMAFILE = 1 | ";
#else
s += "LLAMAFILE = 0 | ";
#endif
s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
return s.c_str();
}