mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-05 23:41:45 +00:00
ggml : add NVFP4 quantization type support (#19769)
* WIP: add NVFP4 quantization support * tests * improve NVFP4 dot product implementation performance and fix bad super call * typo * Use nvfp4 kvalues * vulkan : fix NVFP4 shader compilation by including kvalues_mxfp4 lookup table * vulcal and perf fixes * wip * Fix metal * fix vulcan * Rename threshold & fix wrong scale * Fix MOE * Shelf backend implementations (CUDA, Metal, Vulkan, arch-specific SIMD) Remove NVFP4 support from GPU backends and architecture-specific optimized dot products. These should be added in separate PRs so backend specialists can review them independently. Reverted files: - ggml-cuda: common.cuh, convert.cu, mmq.cu/cuh, mmvq.cu, vecdotq.cuh, quantize.cu/cuh, mma.cuh, ggml-cuda.cu, fattn-tile.cuh - ggml-metal: ggml-metal.metal, ggml-metal-device.cpp, ggml-metal-impl.h, ggml-metal-ops.cpp - ggml-vulkan: ggml-vulkan.cpp, all vulkan-shaders/* - ggml-cpu arch: arm/quants.c, x86/quants.c, powerpc/quants.c, s390/quants.c Core NVFP4 support (type definition, CPU fallback dot product, quantization, dequantization, conversion) is retained. * Fix arch-fallback.h: add NVFP4 generic fallback for all platforms After shelving backend-specific SIMD implementations, the generic CPU dot product needs to be aliased on ARM, x86, PowerPC, and s390 platforms that previously relied on arch-specific versions. * quantize: add NVFP4 as a quantization type option * Fix ggml_fp32_to_ue4m3: handle subnormal values Previously, values with ue4m3_exp <= 0 were clamped to 0, causing all small scales to underflow. This made NVFP4 quantization via llama-quantize produce garbage (PPL = 5.8M) since typical transformer weights have amax/6.0 in the range 0.001-0.01, which falls in the UE4M3 subnormal range. Now subnormals are properly encoded as man * 2^-9 (exp=0, man=1..7), matching the decode path in ggml_ue4m3_to_fp32. Result: NVFP4 requantization now produces PPL = 15.25 (vs F16 = 14.33), comparable to Q4_1 (PPL = 15.81) at slightly lower BPW (4.70 vs 5.15). * Restore ARM NEON NVFP4 dot product implementation Restores the optimized ggml_vec_dot_nvfp4_q8_0 for ARM NEON using vqtbl1q_s8 lookup and ggml_vdotq_s32 dot products. tg128 performance: 4.37 t/s (generic) -> 13.66 t/s (NEON) = 3.1x speedup * Optimize ARM NEON NVFP4 dot product: LUT + vpaddq + vfmaq - Add ue4m3_scale_lut[128] to ggml-common.h replacing branch-heavy ggml_ue4m3_to_fp32() in the hot loop - Use vpaddq_s32 for pairwise int32 reduction instead of vaddvq_s32 - Accumulate with vfmaq_f32 into float32x4_t vector accumulators tg128: 8.1 -> 31.0 t/s (3.8x speedup, 77% of Q4_1 speed) * ARM NEON NVFP4: rearrange q8 to match nibble layout Alternative approach: rearrange q8 data to match the NVFP4 lo/hi nibble layout instead of rearranging the looked-up NVFP4 values. Eliminates vcombine_s8(vget_low, vget_low) shuffles. Performance is equivalent (~18.5 t/s) - the bottleneck is the 2x block overhead from QK=16 vs QK=32, not the shuffle instructions. * CPU only backend 64 super-block layout * cleanup * Remove unused LUT * int * exclude NVFP4 from unsupported ops in metal build * remove quantization for now * store scales as native UE4M3, preserve original model bits when possible * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * correct comment * format * reduce duplication and cleanup * Address comments * move detection to prepare_tensors * Use math instead of const * Move * fix comment * Shelf quantize tests * Rebase and move check * cleanup * lint * Update gguf-py/gguf/scripts/gguf_convert_endian.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Use fallback quant config * Simplify Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * organize * Refactor * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * add quantize_nvfp4 (required for test_quants.py) * add quantize_nvfp4 (required for test_quants.py) * add quantize_nvfp4 (required for test_quants.py) * fix return type --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
3ca19b0e9f
commit
5eae9cb1d9
31 changed files with 710 additions and 51 deletions
|
|
@ -1166,7 +1166,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
llama_expert_gating_func_type gating_op,
|
||||
int il,
|
||||
ggml_tensor * probs_in,
|
||||
ggml_tensor * gate_up_exps) const {
|
||||
ggml_tensor * gate_up_exps,
|
||||
ggml_tensor * up_exps_s,
|
||||
ggml_tensor * gate_exps_s,
|
||||
ggml_tensor * down_exps_s) const {
|
||||
return build_moe_ffn(
|
||||
cur,
|
||||
gate_inp, /* gate_inp_b */ nullptr,
|
||||
|
|
@ -1182,7 +1185,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
gating_op,
|
||||
il,
|
||||
probs_in,
|
||||
gate_up_exps
|
||||
gate_up_exps,
|
||||
/* gate_up_exps_b */ nullptr,
|
||||
up_exps_s,
|
||||
gate_exps_s,
|
||||
down_exps_s
|
||||
);
|
||||
}
|
||||
|
||||
|
|
@ -1206,7 +1213,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
int il,
|
||||
ggml_tensor * probs_in,
|
||||
ggml_tensor * gate_up_exps,
|
||||
ggml_tensor * gate_up_exps_b) const {
|
||||
ggml_tensor * gate_up_exps_b,
|
||||
ggml_tensor * up_exps_s,
|
||||
ggml_tensor * gate_exps_s,
|
||||
ggml_tensor * down_exps_s) const {
|
||||
const int64_t n_embd = cur->ne[0];
|
||||
const int64_t n_tokens = cur->ne[1];
|
||||
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
|
||||
|
|
@ -1358,6 +1368,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cb(gate_up, "ffn_moe_gate_up_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
|
||||
if (up_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
gate_up = ggml_mul(ctx0, gate_up, s);
|
||||
cb(gate_up, "ffn_moe_gate_up_scaled", il);
|
||||
}
|
||||
|
||||
const int64_t n_ff = gate_up->ne[0] / 2;
|
||||
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
|
||||
cb(cur, "ffn_moe_gate", il);
|
||||
|
|
@ -1373,6 +1392,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cb(up, "ffn_moe_up_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to up
|
||||
if (up_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
up = ggml_mul(ctx0, up, s);
|
||||
cb(up, "ffn_moe_up_scaled", il);
|
||||
}
|
||||
|
||||
if (gate_exps) {
|
||||
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
|
||||
cb(cur, "ffn_moe_gate", il);
|
||||
|
|
@ -1384,6 +1412,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
|
||||
cb(cur, "ffn_moe_gate_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to gate
|
||||
if (gate_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
cur = ggml_mul(ctx0, cur, s);
|
||||
cb(cur, "ffn_moe_gate_scaled", il);
|
||||
}
|
||||
}
|
||||
|
||||
const bool has_gate = gate_exps || gate_up_exps;
|
||||
|
|
@ -1463,6 +1500,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
|||
cb(experts, "ffn_moe_down_biased", il);
|
||||
}
|
||||
|
||||
// apply per-expert scale2 to down
|
||||
if (down_exps_s) {
|
||||
ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
|
||||
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
|
||||
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
|
||||
experts = ggml_mul(ctx0, experts, s);
|
||||
cb(experts, "ffn_moe_down_scaled", il);
|
||||
}
|
||||
|
||||
if (!weight_before_ffn) {
|
||||
experts = ggml_mul(ctx0, experts, weights);
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue