Merge branch 'master' into concedo_experimental

This commit is contained in:
Concedo 2023-12-01 23:48:20 +08:00
commit 495bb3ab1e
9 changed files with 314 additions and 196 deletions

View file

@ -3734,23 +3734,29 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);
kq = ggml_scale(ctx, kq, kq_scale);
cb(kq, "kq_scaled", il);
if (max_alibi_bias > 0.0f) {
// TODO: n_head or n_head_kv
// TODO: K-shift is likely not working
// TODO: change to ggml_add
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
cb(kq, "kq_scaled_alibi", il);
// temporary branch until we figure out how to handle ggml_alibi through ggml_add
kq = ggml_scale(ctx, kq, kq_scale);
cb(kq, "kq_scaled", il);
if (max_alibi_bias > 0.0f) {
// TODO: n_head or n_head_kv
// TODO: K-shift is likely not working
// TODO: change to ggml_add
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
cb(kq, "kq_scaled_alibi", il);
}
kq = ggml_add(ctx, kq, kq_mask);
cb(kq, "kq_masked", il);
kq = ggml_soft_max(ctx, kq);
cb(kq, "kq_soft_max", il);
} else {
kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
cb(kq, "kq_soft_max_ext", il);
}
kq = ggml_add(ctx, kq, kq_mask);
cb(kq, "kq_masked", il);
kq = ggml_soft_max(ctx, kq);
cb(kq, "kq_soft_max", il);
// split cached v into n_head heads
struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v,
@ -5071,6 +5077,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "kq_scaled_alibi", OFFLOAD_FUNC_KQ },
{ "kq_masked", OFFLOAD_FUNC_KQ },
{ "kq_soft_max", OFFLOAD_FUNC_V },
{ "kq_soft_max_ext", OFFLOAD_FUNC_V },
{ "v", OFFLOAD_FUNC_V },
{ "kqv", OFFLOAD_FUNC_V },
{ "kqv_merged", OFFLOAD_FUNC_V },