mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-17 04:19:40 +00:00
note: also has support for completion tokens count
This commit is contained in:
commit
a46f8acd03
31 changed files with 138676 additions and 137399 deletions
|
@ -9670,20 +9670,16 @@ static struct ggml_tensor * llm_build_kqv(
|
|||
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
|
||||
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
|
||||
|
||||
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_GEMMA2) {
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
||||
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
|
||||
} else {
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON || model.arch == LLM_ARCH_CHATGLM) {
|
||||
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
}
|
||||
// note: this op tends to require high floating point range
|
||||
// while for some models F16 is enough, for others it is not, so we default to F32 here
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
|
||||
if (model.arch == LLM_ARCH_GROK) {
|
||||
// need to do the following:
|
||||
|
@ -9692,9 +9688,6 @@ static struct ggml_tensor * llm_build_kqv(
|
|||
// kq = 30 * tanh(kq / 30)
|
||||
// before the softmax below
|
||||
|
||||
//try from phi2
|
||||
//ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
|
||||
kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
|
||||
kq = ggml_scale(ctx, kq, 30);
|
||||
}
|
||||
|
@ -21793,6 +21786,16 @@ static int32_t llama_chat_apply_template_internal(
|
|||
ss << message->content << "\n\n";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == "granite" || tmpl_contains("<|start_of_role|>")) {
|
||||
// IBM Granite template
|
||||
for (const auto & message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|start_of_role|>" << role << "<|end_of_role|>"
|
||||
<< message->content << "<|end_of_text|>\n";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|start_of_role|>assistant<|end_of_role|>\n";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
|
@ -21855,6 +21858,10 @@ struct llama_sampler * llama_sampler_init_infill(const struct llama_model * mode
|
|||
return llama_sampler_init_infill_impl(model->vocab);
|
||||
}
|
||||
|
||||
struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
|
||||
return llama_sampler_init_dry_impl(model->vocab, llama_n_ctx_train(model), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
|
||||
}
|
||||
|
||||
//
|
||||
// model split
|
||||
//
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue