Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)
Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	ci/run.sh
#	llama.cpp

Commit 234f79fe9d

16 changed files with 5321 additions and 4861 deletions
llama.cpp (324 changed lines shown)
@@ -246,6 +246,8 @@ enum llm_kv {
  LLM_KV_ATTENTION_HEAD_COUNT_KV,
  LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
  LLM_KV_ATTENTION_CLAMP_KQV,
+ LLM_KV_ATTENTION_KEY_LENGTH,
+ LLM_KV_ATTENTION_VALUE_LENGTH,
  LLM_KV_ATTENTION_LAYERNORM_EPS,
  LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,

@@ -298,6 +300,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
  { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
  { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
  { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
  { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },

@@ -1289,6 +1293,8 @@ struct llama_hparams {
  uint32_t n_head_kv;
  uint32_t n_layer;
  uint32_t n_rot;
+ uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+ uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
  uint32_t n_ff;
  uint32_t n_expert = 0;
  uint32_t n_expert_used = 0;
@@ -1315,6 +1321,8 @@ struct llama_hparams {
  if (this->n_head_kv != other.n_head_kv) return true;
  if (this->n_layer != other.n_layer) return true;
  if (this->n_rot != other.n_rot) return true;
+ if (this->n_embd_head_k != other.n_embd_head_k) return true;
+ if (this->n_embd_head_v != other.n_embd_head_v) return true;
  if (this->n_ff != other.n_ff) return true;
  if (this->n_expert != other.n_expert) return true;
  if (this->n_expert_used != other.n_expert_used) return true;

@@ -1336,12 +1344,12 @@ struct llama_hparams {
  return n_head/n_head_kv;
  }

- uint32_t n_embd_head() const {
- return n_embd/n_head;
+ uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
+ return n_embd_head_k * n_head_kv;
  }

- uint32_t n_embd_gqa() const {
- return n_embd/n_gqa();
+ uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
+ return n_embd_head_v * n_head_kv;
  }
 };
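For orientation, the replacement helpers split the old per-layer GQA embedding width into a key part and a value part. The standalone sketch below (illustrative numbers, not taken from the diff) shows how the new accessors relate to the old n_embd_gqa() as long as d_k == d_v:

#include <cassert>
#include <cstdint>

int main() {
    // Illustrative GQA shape; not a specific model from the diff.
    const uint32_t n_embd        = 4096;
    const uint32_t n_head        = 32;
    const uint32_t n_head_kv     = 8;                // GQA: fewer K/V heads than Q heads
    const uint32_t n_embd_head_k = n_embd / n_head;  // default when attention.key_length is absent
    const uint32_t n_embd_head_v = n_embd / n_head;  // default when attention.value_length is absent

    const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv;     // new n_embd_k_gqa()
    const uint32_t n_embd_v_gqa = n_embd_head_v * n_head_kv;     // new n_embd_v_gqa()
    const uint32_t n_embd_gqa   = n_embd / (n_head / n_head_kv); // old n_embd_gqa()

    // While d_k == d_v == n_embd/n_head, all three agree; they only diverge
    // for models whose key/value head sizes differ from n_embd/n_head.
    assert(n_embd_k_gqa == n_embd_gqa);
    assert(n_embd_v_gqa == n_embd_gqa);
    return 0;
}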
@@ -1655,8 +1663,9 @@ static bool llama_kv_cache_init(
  uint32_t n_ctx,
  int n_gpu_layers,
  bool offload) {
- const uint32_t n_embd = hparams.n_embd_gqa();
- const uint32_t n_layer = hparams.n_layer;
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const uint32_t n_layer = hparams.n_layer;

  cache.has_shift = false;

@@ -1687,8 +1696,8 @@ static bool llama_kv_cache_init(
  const int i_gpu_start = (int) n_layer - n_gpu_layers;

  for (int i = 0; i < (int) n_layer; i++) {
- ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
- ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+ ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
+ ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
  ggml_format_name(k, "cache_k_l%d", i);
  ggml_format_name(v, "cache_v_l%d", i);
  cache.k_l.push_back(k);
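With the cache sized from the two helpers, each layer's K and V tensors can in principle have different widths. A rough sketch of the resulting footprint follows; the f16 cache type and the shape values are assumptions for illustration, not taken from the diff:

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed shape and cache type, purely for illustration.
    const uint32_t n_ctx        = 4096;
    const uint32_t n_layer      = 32;
    const uint32_t n_embd_k_gqa = 1024;  // e.g. 8 KV heads * 128 key dims
    const uint32_t n_embd_v_gqa = 1024;  // e.g. 8 KV heads * 128 value dims
    const size_t   elt_size     = 2;     // f16 cache element

    // Mirrors the per-layer ggml_new_tensor_1d sizes in the hunk above.
    const uint64_t k_elems = (uint64_t) n_embd_k_gqa * n_ctx;
    const uint64_t v_elems = (uint64_t) n_embd_v_gqa * n_ctx;
    const uint64_t bytes   = (k_elems + v_elems) * elt_size * n_layer;

    printf("approx. KV cache size: %.1f MiB\n", bytes / (1024.0 * 1024.0));
    return 0;
}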
@@ -2683,6 +2692,12 @@ static void llm_load_hparams(
  // gpt-j n_rot = rotary_dim
  }

+ hparams.n_embd_head_k = hparams.n_embd / hparams.n_head;
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+ hparams.n_embd_head_v = hparams.n_embd / hparams.n_head;
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
  // arch-specific KVs
  switch (model.arch) {
  case LLM_ARCH_LLAMA:

@@ -3112,8 +3127,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  LLAMA_LOG_INFO("%s: n_head = %u\n", __func__, hparams.n_head);
  LLAMA_LOG_INFO("%s: n_head_kv = %u\n", __func__, hparams.n_head_kv);
  LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
- LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
+ LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+ LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+ LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
+ LLAMA_LOG_INFO("%s: n_embd_k_gqa = %u\n", __func__, hparams.n_embd_k_gqa());
+ LLAMA_LOG_INFO("%s: n_embd_v_gqa = %u\n", __func__, hparams.n_embd_v_gqa());
  LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
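The two new GGUF keys are optional: as the llm_load_hparams hunk shows, the head widths default to n_embd / n_head and are only overridden when %s.attention.key_length or %s.attention.value_length is present in the model file. A hedged sketch of that fallback in isolation (maybe_get_u32 is a hypothetical stand-in for the loader's optional-key lookup, not the real loader API):

#include <cstdint>
#include <optional>

// Hypothetical stand-in for an optional GGUF key lookup; returns nullopt when the key is absent.
static std::optional<uint32_t> maybe_get_u32(const char * /*key*/) {
    return std::nullopt;
}

int main() {
    const uint32_t n_embd = 2048, n_head = 16;   // illustrative shape

    // Defaults, matching the hunk above.
    uint32_t n_embd_head_k = n_embd / n_head;
    uint32_t n_embd_head_v = n_embd / n_head;

    // Optional overrides from the new metadata keys ("%s" is the architecture prefix).
    if (auto v = maybe_get_u32("llama.attention.key_length"))   n_embd_head_k = *v;
    if (auto v = maybe_get_u32("llama.attention.value_length")) n_embd_head_v = *v;

    return n_embd_head_k == n_embd_head_v ? 0 : 1;
}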
@@ -3202,10 +3221,11 @@ static bool llm_load_tensors(

  // create tensors for the weights
  {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
- const int64_t n_layer = hparams.n_layer;
- const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_embd = hparams.n_embd;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_vocab = hparams.n_vocab;

  const auto tn = LLM_TN(model.arch);
  switch (model.arch) {
@@ -3231,7 +3251,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3299,7 +3322,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3347,7 +3373,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3397,7 +3426,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3449,7 +3481,11 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;
  model.layers.resize(n_layer);
  for (uint32_t i = 0; i < n_layer; ++i) {

@@ -3498,7 +3534,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3549,7 +3588,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3596,7 +3638,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3694,7 +3739,10 @@ static bool llm_load_tensors(
  model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3743,7 +3791,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;

@@ -3790,7 +3841,10 @@ static bool llm_load_tensors(
  model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
  }

- const uint32_t n_ff = hparams.n_ff;
+ const uint32_t n_ff = hparams.n_ff;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
+ GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);

  const int i_gpu_start = n_layer - n_gpu_layers;
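The same preamble is repeated for every architecture above: a local n_embd_gqa alias is pinned to the value-side width and asserted against both the legacy n_embd / n_gqa() definition and the new key-side width, so the per-arch tensor shapes stay valid while the dimensions are being split. A compressed sketch of the asserted invariant (numbers are illustrative only):

#include <cassert>
#include <cstdint>

int main() {
    // Illustrative GQA shape with d_k == d_v.
    const int64_t n_embd = 4096, n_head = 32, n_head_kv = 8;
    const int64_t n_embd_head_k = n_embd / n_head;
    const int64_t n_embd_head_v = n_embd / n_head;

    const int64_t n_embd_k_gqa = n_embd_head_k * n_head_kv;
    const int64_t n_embd_v_gqa = n_embd_head_v * n_head_kv;
    const int64_t n_gqa        = n_head / n_head_kv;

    // The alias and the two assertions used in each per-arch branch.
    const int64_t n_embd_gqa = n_embd_v_gqa;
    assert(n_embd_gqa == n_embd / n_gqa);
    assert(n_embd_gqa == n_embd_k_gqa);
    return 0;
}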
@@ -4029,8 +4083,8 @@ static struct ggml_tensor * llm_build_inp_embd(
  return inpL;
  }

- // Persimmon: n_rot = n_embd_head/2
- // Other: n_rot = n_embd_head
+ // Persimmon: n_rot = n_embd_head_k/2
+ // Other: n_rot = n_embd_head_k
  static void llm_build_k_shift(
  struct ggml_context * ctx,
  const llama_hparams & hparams,

@@ -4043,17 +4097,17 @@ static void llm_build_k_shift(
  float freq_base,
  float freq_scale,
  const llm_build_cb & cb) {
- const int64_t n_layer = hparams.n_layer;
- const int64_t n_head_kv = hparams.n_head_kv;
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
- const int64_t n_embd_head = hparams.n_embd_head();
- const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
- const float ext_factor = cparams.yarn_ext_factor;
- const float attn_factor = cparams.yarn_attn_factor;
- const float beta_fast = cparams.yarn_beta_fast;
- const float beta_slow = cparams.yarn_beta_slow;
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
+ const float ext_factor = cparams.yarn_ext_factor;
+ const float attn_factor = cparams.yarn_attn_factor;
+ const float beta_fast = cparams.yarn_beta_fast;
+ const float beta_slow = cparams.yarn_beta_slow;

- GGML_ASSERT(n_embd_head % n_rot == 0);
+ GGML_ASSERT(n_embd_head_k % n_rot == 0);

  struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
  cb(K_shift, "K_shift", -1);

@@ -4071,9 +4125,9 @@ static void llm_build_k_shift(
  // we rotate only the first n_rot dimensions
  ggml_rope_custom_inplace(ctx,
  ggml_view_3d(ctx, kv.k_l[il],
- n_embd_head, n_head_kv, n_ctx,
- ggml_row_size(kv.k_l[il]->type, n_embd_head),
- ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
+ n_embd_head_k, n_head_kv, n_ctx,
+ ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
+ ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
  0),
  K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
  ext_factor, attn_factor, beta_fast, beta_slow);
@@ -4094,18 +4148,19 @@ static void llm_build_kv_store(
  int32_t kv_head,
  const llm_build_cb & cb,
  int64_t il) {
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

  // compute the transposed [n_tokens, n_embd] V matrix
- struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens));
+ struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
  //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
  cb(v_cur_t, "v_cur_t", il);

- struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
- (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head);
+ struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
+ (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
  cb(k_cache_view, "k_cache_view", il);

- struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
+ struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
  ( n_ctx)*ggml_element_size(kv.v_l[il]),
  (kv_head)*ggml_element_size(kv.v_l[il]));
  cb(v_cache_view, "v_cache_view", il);
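For readers unfamiliar with the layout being reindexed here: the K cache stores one contiguous row of n_embd_k_gqa values per token, while the V cache is kept transposed, with one row of n_ctx values per value dimension, which is why its views use n_ctx-sized strides. A toy index calculation under those assumptions (sizes are made up for illustration):

#include <cstdint>
#include <cstdio>

int main() {
    // Toy sizes; the real values come from the hparams helpers.
    const int64_t n_ctx = 8, n_embd_k_gqa = 4, n_embd_v_gqa = 4;
    const int64_t t = 3;  // token slot in the cache
    const int64_t d = 2;  // dimension index

    // K cache: row-major per token, row length n_embd_k_gqa.
    const int64_t k_index = t * n_embd_k_gqa + d;
    // V cache: transposed, row length n_ctx, one row per value dimension.
    const int64_t v_index = d * n_ctx + t;

    const int64_t k_elems_per_layer = n_ctx * n_embd_k_gqa;
    const int64_t v_elems_per_layer = n_ctx * n_embd_v_gqa;
    printf("k_index=%lld of %lld, v_index=%lld of %lld\n",
           (long long) k_index, (long long) k_elems_per_layer,
           (long long) v_index, (long long) v_elems_per_layer);
    return 0;
}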
@@ -4255,20 +4310,20 @@ static struct ggml_tensor * llm_build_kqv(
  float kq_scale,
  const llm_build_cb & cb,
  int il) {
- const int64_t n_embd = hparams.n_embd;
- const int64_t n_head = hparams.n_head;
- const int64_t n_head_kv = hparams.n_head_kv;
- const int64_t n_embd_head = hparams.n_embd_head();
- const int64_t n_embd_gqa = hparams.n_embd_gqa();
+ const int64_t n_head = hparams.n_head;
+ const int64_t n_head_kv = hparams.n_head_kv;
+ const int64_t n_embd_head_k = hparams.n_embd_head_k;
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int64_t n_embd_head_v = hparams.n_embd_head_v;

  struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
  cb(q, "q", il);

  struct ggml_tensor * k =
  ggml_view_3d(ctx, kv.k_l[il],
- n_embd_head, n_kv, n_head_kv,
- ggml_row_size(kv.k_l[il]->type, n_embd_gqa),
- ggml_row_size(kv.k_l[il]->type, n_embd_head),
+ n_embd_head_k, n_kv, n_head_kv,
+ ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
+ ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
  0);
  cb(k, "k", il);

@@ -4307,9 +4362,9 @@ static struct ggml_tensor * llm_build_kqv(
  // split cached v into n_head heads
  struct ggml_tensor * v =
  ggml_view_3d(ctx, kv.v_l[il],
- n_kv, n_embd_head, n_head_kv,
+ n_kv, n_embd_head_v, n_head_kv,
  ggml_element_size(kv.v_l[il])*n_ctx,
- ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
+ ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
  0);
  cb(v, "v", il);

@@ -4319,7 +4374,7 @@ static struct ggml_tensor * llm_build_kqv(
  struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
  cb(kqv_merged, "kqv_merged", il);

- struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
+ struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
  cb(cur, "kqv_merged_cont", il);

  cur = ggml_mul_mat(ctx, wo, cur);
@@ -4346,8 +4401,10 @@ struct llm_build_context {
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
  const int64_t n_head;
  const int64_t n_head_kv;
- const int64_t n_embd_head;
- const int64_t n_embd_gqa;
+ const int64_t n_embd_head_k;
+ const int64_t n_embd_k_gqa;
+ const int64_t n_embd_head_v;
+ const int64_t n_embd_v_gqa;
  const int64_t n_expert;
  const int64_t n_expert_used;

@@ -4389,8 +4446,10 @@ struct llm_build_context {
  n_ctx (cparams.n_ctx),
  n_head (hparams.n_head),
  n_head_kv (hparams.n_head_kv),
- n_embd_head (hparams.n_embd_head()),
- n_embd_gqa (hparams.n_embd_gqa()),
+ n_embd_head_k (hparams.n_embd_head_k),
+ n_embd_k_gqa (hparams.n_embd_k_gqa()),
+ n_embd_head_v (hparams.n_embd_head_v),
+ n_embd_v_gqa (hparams.n_embd_v_gqa()),
  n_expert (hparams.n_expert),
  n_expert_used (hparams.n_expert_used),
  freq_base (cparams.rope_freq_base),
@@ -4433,6 +4492,8 @@ struct llm_build_context {
  struct ggml_cgraph * build_llama() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  GGML_ASSERT(n_embd_head == hparams.n_rot);

  struct ggml_tensor * cur;

@@ -4617,6 +4678,9 @@ struct llm_build_context {
  struct ggml_cgraph * build_baichuan() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -4734,6 +4798,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_falcon() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -4853,6 +4922,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_starcoder() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * pos;
  struct ggml_tensor * inpL;

@@ -4949,7 +5023,12 @@ struct llm_build_context {
  struct ggml_cgraph * build_persimmon() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

- const int64_t n_rot = n_embd_head / 2;
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
+ const int64_t n_rot = n_embd_head_k / 2;

  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5158,6 +5237,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_refact() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5246,6 +5330,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_bloom() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5337,6 +5426,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_mpt() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5432,6 +5526,9 @@ struct llm_build_context {
  struct ggml_cgraph * build_stablelm() {
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5542,6 +5639,9 @@ struct llm_build_context {
  struct ggml_cgraph * build_qwen() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5653,6 +5753,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_phi2() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * attn_norm_output;
  struct ggml_tensor * ffn_output;

@@ -5765,6 +5870,9 @@ struct llm_build_context {
  struct ggml_cgraph * build_plamo() {
  struct ggml_cgraph * gf = ggml_new_graph(ctx0);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * inpL;

@@ -5869,6 +5977,11 @@ struct llm_build_context {
  struct ggml_cgraph * build_gpt2() {
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_gqa == n_embd);
+
  struct ggml_tensor * cur;
  struct ggml_tensor * pos;
  struct ggml_tensor * inpL;
@@ -8176,7 +8289,7 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
  }
  }

- void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
  const int64_t t_start_sample_us = ggml_time_us();

  k = std::max(k, (int) min_keep);

@@ -8539,7 +8652,7 @@ void llama_sample_classifier_free_guidance(
  }
  }

- llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
  GGML_ASSERT(ctx);

  auto N = float(llama_n_vocab(llama_get_model(ctx)));
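The remaining hunks pull in the upstream API cleanup that fixes count-like parameters and return values to int32_t. The point is a fixed-width public header rather than a behavior change; a minimal sketch of why a fixed-width type is preferred (the alias below is only an illustration and is not declared in llama.h):

#include <cstdint>
#include <type_traits>

// Illustrative alias only; the header simply spells out int32_t in each signature.
using count_t = int32_t;

static_assert(sizeof(count_t) == 4, "counts are exactly 32 bits on every platform");
static_assert(std::is_signed<count_t>::value, "counts stay signed, so -1 style sentinels still work");

int main() {
    count_t k = 40;         // e.g. a top-k sampling parameter
    k = k < 1 ? 1 : k;      // the same clamping a caller might do before sampling
    return k == 40 ? 0 : 1;
}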
@@ -9750,7 +9863,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  return result;
  }

- int llama_max_devices(void) {
+ int32_t llama_max_devices(void) {
  return LLAMA_MAX_DEVICES;
  }

@@ -9892,8 +10005,8 @@ struct llama_context * llama_new_context_with_model(
  const ggml_type type_k = params.type_k;
  const ggml_type type_v = params.type_v;

- GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
- GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
+ GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
+ GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);

  // reserve memory for context buffers
  if (!hparams.vocab_only) {

@@ -10061,15 +10174,15 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
  return model->vocab.type;
  }

- int llama_n_vocab(const struct llama_model * model) {
+ int32_t llama_n_vocab(const struct llama_model * model) {
  return model->vocab.id_to_token.size();
  }

- int llama_n_ctx_train(const struct llama_model * model) {
+ int32_t llama_n_ctx_train(const struct llama_model * model) {
  return model->hparams.n_ctx_train;
  }

- int llama_n_embd(const struct llama_model * model) {
+ int32_t llama_n_embd(const struct llama_model * model) {
  return model->hparams.n_embd;
  }
@@ -10077,7 +10190,7 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
  return model->hparams.rope_freq_scale_train;
  }

- int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+ int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
  const auto & it = model->gguf_kv.find(key);
  if (it == model->gguf_kv.end()) {
  if (buf_size > 0) {

@@ -10088,11 +10201,11 @@ int llama_model_meta_val_str(const struct llama_model * model, const char * key,
  return snprintf(buf, buf_size, "%s", it->second.c_str());
  }

- int llama_model_meta_count(const struct llama_model * model) {
+ int32_t llama_model_meta_count(const struct llama_model * model) {
  return (int)model->gguf_kv.size();
  }

- int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
  if (i < 0 || i >= (int)model->gguf_kv.size()) {
  if (buf_size > 0) {
  buf[0] = '\0';

@@ -10104,7 +10217,7 @@ int llama_model_meta_key_by_index(const struct llama_model * model, int i, char
  return snprintf(buf, buf_size, "%s", it->first.c_str());
  }

- int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
  if (i < 0 || i >= (int)model->gguf_kv.size()) {
  if (buf_size > 0) {
  buf[0] = '\0';

@@ -10116,9 +10229,10 @@ int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, c
  return snprintf(buf, buf_size, "%s", it->second.c_str());
  }

- int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
- return snprintf(buf, buf_size, "%s %s %s",
+ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+ return snprintf(buf, buf_size, "%s %s%s %s",
  llama_model_arch_name(model->arch).c_str(),
+ model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
  llama_model_type_name(model->type),
  llama_model_ftype_name(model->ftype).c_str());
  }
@@ -10143,7 +10257,7 @@ struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const ch
  return ggml_get_tensor(model->ctx, name);
  }

- int llama_model_quantize(
+ uint32_t llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
  const llama_model_quantize_params * params) {

@@ -10156,7 +10270,7 @@ int llama_model_quantize(
  }
  }

- int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
+ int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
  try {
  return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {

@@ -10165,7 +10279,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
  }
  }

- int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
+ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
  try {
  return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
  } catch (const std::exception & err) {

@@ -10263,7 +10377,7 @@ void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_k
  }
  }

- int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
+ int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  int result = 0;

  for (uint32_t i = 0; i < ctx->kv_self.size; i++) {

@@ -10273,7 +10387,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  return result;
  }

- int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+ int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
  return ctx->kv_self.used;
  }
@@ -10437,9 +10551,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  const auto & hparams = ctx->model.hparams;
  const auto & cparams = ctx->cparams;

- const auto n_layer = hparams.n_layer;
- const auto n_embd = hparams.n_embd_gqa();
- const auto n_ctx = cparams.n_ctx;
+ const auto n_layer = hparams.n_layer;
+ const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const auto n_ctx = cparams.n_ctx;

  const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
  const uint32_t kv_head = kv_self.head;

@@ -10461,15 +10576,15 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
  std::vector<struct ggml_tensor *> vout2d(n_layer);

  for (int il = 0; il < (int) n_layer; ++il) {
- kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
- vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+ vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
- n_embd, kv_head,
- elt_size*n_embd, 0);
+ n_embd_k_gqa, kv_head,
+ elt_size*n_embd_k_gqa, 0);

  ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
- kv_head, n_embd,
+ kv_head, n_embd_v_gqa,
  elt_size*n_ctx, 0);

  ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));

@@ -10576,9 +10691,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  const auto & hparams = ctx->model.hparams;
  const auto & cparams = ctx->cparams;

- const int n_layer = hparams.n_layer;
- const int n_embd = hparams.n_embd_gqa();
- const int n_ctx = cparams.n_ctx;
+ const int n_layer = hparams.n_layer;
+ const int n_embd_k_gqa = hparams.n_embd_k_gqa();
+ const int n_embd_v_gqa = hparams.n_embd_v_gqa();
+ const int n_ctx = cparams.n_ctx;

  size_t kv_buf_size;
  uint32_t kv_head;

@@ -10602,15 +10718,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
  std::vector<struct ggml_tensor *> vin2d(n_layer);

  for (int il = 0; il < n_layer; ++il) {
- kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
- vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+ kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
+ vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);

  ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
- n_embd, kv_head,
- elt_size*n_embd, 0);
+ n_embd_k_gqa, kv_head,
+ elt_size*n_embd_k_gqa, 0);

  ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
- kv_head, n_embd,
+ kv_head, n_embd_v_gqa,
  elt_size*n_ctx, 0);

  ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
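Session save/restore has to agree with the cache layout, so the staging tensors used when copying state out and back in are sized from the same split dimensions: kout2d/kin2d are n_embd_k_gqa wide per used cell, while vout2d/vin2d are transposed. A rough sketch of the per-layer byte counts involved (element size and shape are assumptions for illustration):

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed values for illustration only.
    const uint32_t kv_head      = 100;   // number of cache cells currently in use
    const uint32_t n_embd_k_gqa = 1024;
    const uint32_t n_embd_v_gqa = 1024;
    const size_t   elt_size     = 2;     // f16 cache element

    // Shapes of the staging tensors in the hunks above:
    //   kout2d/kin2d: [n_embd_k_gqa, kv_head], vout2d/vin2d: [kv_head, n_embd_v_gqa]
    const size_t k_bytes = (size_t) n_embd_k_gqa * kv_head * elt_size;
    const size_t v_bytes = (size_t) kv_head * n_embd_v_gqa * elt_size;

    printf("per-layer state payload: K %zu bytes + V %zu bytes\n", k_bytes, v_bytes);
    return 0;
}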
@@ -10763,7 +10879,7 @@ int llama_eval(
  struct llama_context * ctx,
  llama_token * tokens,
  int32_t n_tokens,
- int n_past) {
+ int32_t n_past) {
  llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

  const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));

@@ -10778,7 +10894,7 @@ int llama_eval_embd(
  struct llama_context * ctx,
  float * embd,
  int32_t n_tokens,
- int n_past) {
+ int32_t n_past) {
  llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);

  llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };

@@ -10849,7 +10965,7 @@ void llama_batch_free(struct llama_batch batch) {
  if (batch.logits) free(batch.logits);
  }

- int llama_decode(
+ int32_t llama_decode(
  struct llama_context * ctx,
  struct llama_batch batch) {
  const int ret = llama_decode_internal(*ctx, batch);
@@ -10897,11 +11013,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
  return model->vocab.linefeed_id;
  }

- int llama_add_bos_token(const struct llama_model * model) {
+ int32_t llama_add_bos_token(const struct llama_model * model) {
  return model->vocab.special_add_bos;
  }

- int llama_add_eos_token(const struct llama_model * model) {
+ int32_t llama_add_eos_token(const struct llama_model * model) {
  return model->vocab.special_add_eos;
  }

@@ -10921,12 +11037,12 @@ llama_token llama_token_eot(const struct llama_model * model) {
  return model->vocab.special_eot_id;
  }

- int llama_tokenize(
+ int32_t llama_tokenize(
  const struct llama_model * model,
  const char * text,
- int text_len,
+ int32_t text_len,
  llama_token * tokens,
- int n_max_tokens,
+ int32_t n_max_tokens,
  bool add_bos,
  bool special) {
  auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);

@@ -10954,7 +11070,7 @@ static std::string llama_decode_text(const std::string & text) {
  }

  // does not write null-terminator to buf
- int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
+ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) {
  if(OldBPETokenizerMode)
  {
  return llama_token_to_piece_old(model, token, buf, length);