mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-06-01 22:50:53 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .github/workflows/release.yml # .github/workflows/server.yml # .github/workflows/ui-build.yml # .github/workflows/ui-publish.yml # CMakeLists.txt # docs/autoparser.md # docs/backend/snapdragon/CMakeUserPresets.json # docs/backend/snapdragon/README.md # docs/backend/snapdragon/windows.md # docs/function-calling.md # examples/model-conversion/scripts/embedding/run-original-model.py # ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c # ggml/src/ggml-opencl/ggml-opencl.cpp # ggml/src/ggml-opencl/kernels/cvt.cl # ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl # ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl # ggml/src/ggml-sycl/common.hpp # ggml/src/ggml-sycl/dmmv.cpp # ggml/src/ggml-sycl/gated_delta_net.cpp # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-vulkan/CMakeLists.txt # ggml/src/ggml-zendnn/CMakeLists.txt # ggml/src/ggml-zendnn/ggml-zendnn.cpp # requirements/requirements-convert_hf_to_gguf.txt # scripts/snapdragon/windows/setup-build.ps1 # tools/perplexity/perplexity.cpp
This commit is contained in:
commit
8ca4283f55
17 changed files with 323 additions and 203 deletions
|
|
@@ -1465,6 +1465,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
|
|||
if (!layer.ssm_beta_s && layer.ssm_beta) {
|
||||
layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) {
|
||||
layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) {
|
||||
layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
// input scales
|
||||
if (!layer.wq_in_s && layer.wq) {
|
||||
|
|
@@ -1524,6 +1530,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
|
|||
if (!layer.ssm_beta_in_s && layer.ssm_beta) {
|
||||
layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) {
|
||||
layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) {
|
||||
layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
}
|
||||
// output scales
|
||||
if (output && output->type == GGML_TYPE_NVFP4) {
|
||||
|
|
|
|||
|
|
@@ -202,12 +202,16 @@ struct llama_layer_shortconv {
|
|||
};
|
||||
|
||||
struct llama_layer_nextn {
|
||||
struct ggml_tensor * eh_proj = nullptr;
|
||||
struct ggml_tensor * embed_tokens = nullptr;
|
||||
struct ggml_tensor * enorm = nullptr;
|
||||
struct ggml_tensor * hnorm = nullptr;
|
||||
struct ggml_tensor * shared_head_head = nullptr;
|
||||
struct ggml_tensor * shared_head_norm = nullptr;
|
||||
struct ggml_tensor * eh_proj = nullptr;
|
||||
struct ggml_tensor * eh_proj_s = nullptr;
|
||||
struct ggml_tensor * eh_proj_in_s = nullptr;
|
||||
struct ggml_tensor * embed_tokens = nullptr;
|
||||
struct ggml_tensor * enorm = nullptr;
|
||||
struct ggml_tensor * hnorm = nullptr;
|
||||
struct ggml_tensor * shared_head_head = nullptr;
|
||||
struct ggml_tensor * shared_head_head_s = nullptr;
|
||||
struct ggml_tensor * shared_head_head_in_s = nullptr;
|
||||
struct ggml_tensor * shared_head_norm = nullptr;
|
||||
};
|
||||
|
||||
struct llama_layer {
|
||||
|
|
|
|||
|
|
@@ -538,7 +538,7 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
|
|||
ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
|
||||
cb(concat, "mtp_concat", il);
|
||||
|
||||
ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
|
||||
ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
|
||||
cb(cur, "mtp_eh_proj", il);
|
||||
|
||||
ggml_tensor * inpSA = cur;
|
||||
|
|
@@ -626,8 +626,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
|
|||
cb(cur, "mtp_shared_head_norm", -1);
|
||||
|
||||
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
|
||||
ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s;
|
||||
GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)");
|
||||
cur = build_lora_mm(head_w, cur);
|
||||
cur = build_lora_mm(head_w, cur, head_s);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
|
|
@@ -602,7 +602,7 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
|
|||
ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
|
||||
cb(concat, "mtp_concat", il);
|
||||
|
||||
ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
|
||||
ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
|
||||
cb(cur, "mtp_eh_proj", il);
|
||||
|
||||
ggml_tensor * inpSA = cur;
|
||||
|
|
@@ -722,8 +722,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
|
|||
cb(cur, "mtp_shared_head_norm", -1);
|
||||
|
||||
ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
|
||||
ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s;
|
||||
GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)");
|
||||
cur = build_lora_mm(head_w, cur);
|
||||
cur = build_lora_mm(head_w, cur, head_s);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
res->t_logits = cur;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue