mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-11 13:11:49 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .devops/tools.sh # build-xcframework.sh # ci/run.sh # examples/Miku.sh # examples/chat-13B.sh # examples/chat-persistent.sh # examples/chat-vicuna.sh # examples/chat.sh # examples/jeopardy/jeopardy.sh # examples/reason-act.sh # examples/server-llama2-13B.sh # examples/sycl/build.sh # examples/sycl/run-llama2.sh # examples/sycl/run-llama3.sh # examples/ts-type-to-grammar.sh # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-sycl/element_wise.cpp # ggml/src/ggml-sycl/element_wise.hpp # ggml/src/ggml-sycl/ggml-sycl.cpp # scripts/apple/validate-apps.sh # scripts/apple/validate-ios.sh # scripts/apple/validate-macos.sh # scripts/apple/validate-tvos.sh # scripts/apple/validate-visionos.sh # scripts/check-requirements.sh # scripts/ci-run.sh # scripts/compare-commits.sh # scripts/debug-test.sh # scripts/gen-authors.sh # scripts/get-hellaswag.sh # scripts/get-pg.sh # scripts/get-wikitext-103.sh # scripts/get-wikitext-2.sh # scripts/get-winogrande.sh # scripts/hf.sh # scripts/qnt-all.sh # scripts/run-all-perf.sh # scripts/run-all-ppl.sh # scripts/sync-ggml-am.sh # scripts/sync-ggml.sh # scripts/tool_bench.sh # tests/test-backend-ops.cpp # tests/test-lora-conversion-inference.sh # tests/test-tokenizer-0.sh # tools/server/README.md
This commit is contained in:
commit
cdda9d16e0
42 changed files with 1519 additions and 118 deletions
|
|
@ -560,12 +560,20 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|||
|
||||
switch (type_op) {
|
||||
case LLM_FFN_SILU:
|
||||
{
|
||||
if (gate && type_gate == LLM_FFN_PAR) {
|
||||
cur = ggml_swiglu_split(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_swiglu", il);
|
||||
type_gate = LLM_FFN_SEQ;
|
||||
} else {
|
||||
cur = ggml_silu(ctx0, cur);
|
||||
cb(cur, "ffn_silu", il);
|
||||
} break;
|
||||
case LLM_FFN_GELU:
|
||||
{
|
||||
if (gate && type_gate == LLM_FFN_PAR) {
|
||||
cur = ggml_geglu_split(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_geglu", il);
|
||||
type_gate = LLM_FFN_SEQ;
|
||||
} else {
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cb(cur, "ffn_gelu", il);
|
||||
if (act_scales != NULL) {
|
||||
|
|
@ -574,7 +582,11 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|||
}
|
||||
} break;
|
||||
case LLM_FFN_RELU:
|
||||
{
|
||||
if (gate && type_gate == LLM_FFN_PAR) {
|
||||
cur = ggml_reglu_split(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_reglu", il);
|
||||
type_gate = LLM_FFN_SEQ;
|
||||
} else {
|
||||
cur = ggml_relu(ctx0, cur);
|
||||
cb(cur, "ffn_relu", il);
|
||||
} break;
|
||||
|
|
@ -588,32 +600,19 @@ ggml_tensor * llm_graph_context::build_ffn(
|
|||
} break;
|
||||
case LLM_FFN_SWIGLU:
|
||||
{
|
||||
// Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
|
||||
int64_t split_point = cur->ne[0] / 2;
|
||||
// TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
|
||||
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
||||
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
||||
|
||||
x0 = ggml_silu(ctx0, x0);
|
||||
cb(cur, "ffn_silu", il);
|
||||
|
||||
cur = ggml_mul(ctx0, x0, x1);
|
||||
cb(cur, "ffn_mul", il);
|
||||
cur = ggml_swiglu(ctx0, cur);
|
||||
cb(cur, "ffn_swiglu", il);
|
||||
} break;
|
||||
case LLM_FFN_GEGLU:
|
||||
{
|
||||
// Split into two equal parts
|
||||
int64_t split_point = cur->ne[0] / 2;
|
||||
// TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
|
||||
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
||||
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
||||
|
||||
x0 = ggml_gelu(ctx0, x0);
|
||||
cb(x0, "ffn_gelu", il);
|
||||
|
||||
cur = ggml_mul(ctx0, x0, x1);
|
||||
cur = ggml_geglu(ctx0, cur);
|
||||
cb(cur, "ffn_geglu", il);
|
||||
} break;
|
||||
case LLM_FFN_REGLU:
|
||||
{
|
||||
cur = ggml_reglu(ctx0, cur);
|
||||
cb(cur, "ffn_reglu", il);
|
||||
} break;
|
||||
}
|
||||
|
||||
if (gate && type_gate == LLM_FFN_PAR) {
|
||||
|
|
@ -743,12 +742,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
|
||||
switch (type_op) {
|
||||
case LLM_FFN_SILU:
|
||||
{
|
||||
if (gate_exps) {
|
||||
cur = ggml_swiglu_split(ctx0, cur, up);
|
||||
cb(cur, "ffn_moe_swiglu", il);
|
||||
} else {
|
||||
cur = ggml_silu(ctx0, cur);
|
||||
cb(cur, "ffn_moe_silu", il);
|
||||
} break;
|
||||
case LLM_FFN_GELU:
|
||||
{
|
||||
if (gate_exps) {
|
||||
cur = ggml_geglu_split(ctx0, cur, up);
|
||||
cb(cur, "ffn_moe_geglu", il);
|
||||
} else {
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cb(cur, "ffn_moe_gelu", il);
|
||||
} break;
|
||||
|
|
@ -756,11 +761,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
if (gate_exps) {
|
||||
cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(cur, "ffn_moe_gate_par", il);
|
||||
}
|
||||
|
||||
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
|
||||
cb(experts, "ffn_moe_down", il);
|
||||
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@ enum llm_ffn_op_type {
|
|||
LLM_FFN_RELU_SQR,
|
||||
LLM_FFN_SWIGLU,
|
||||
LLM_FFN_GEGLU,
|
||||
LLM_FFN_REGLU,
|
||||
};
|
||||
|
||||
enum llm_ffn_gate_type {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue