diff --git a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
index 4d5ecb0..56320bf 100644
--- a/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
+++ b/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-fp8-linear-ggml-experts-serve.yaml
@@ -44,7 +44,7 @@
 - match:
     name: "^model\\.layers\\..*\\.self_attn$"
   replace:
-    class: ktransformers.operators.attention.flashinfer_attn # optimized MLA implementation
+    class: ktransformers.operators.balance_serve_attention.flashinfer_attn # optimized MLA implementation
     kwargs:
       generate_device: "cuda"
       prefill_device: "cuda"