mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-31 21:39:42 +00:00
CUDA: route batch>=4 quantized matmul to MMQ on AMD MFMA hardware (#23227)
* CUDA: per-quant MMVQ/MMQ batch threshold on AMD MFMA hardware
The dispatcher uses a single global threshold (MMVQ_MAX_BATCH_SIZE = 8)
to choose between mul_mat_vec_q (per-row GEMV) and mul_mat_q (MFMA-tiled
GEMM) for quantized matmul. On AMD CDNA, the optimal crossover differs
substantially by quant family because the per-row GEMV cost is dominated
by dequantisation, not the dot-product itself: K-quants pay a heavier
super-block decode and so MMQ wins sooner; legacy and IQ quants have
lean decode and stay ahead until the batch fully populates an MFMA tile.
This patch introduces ggml_cuda_should_use_mmvq(type, cc, ne11) -> bool,
mirroring the existing ggml_cuda_should_use_mmq, and gates per-quant
thresholds on amd_mfma_available(cc):
Q3_K, Q4_K, Q5_K : MMVQ <= 3 (MMQ wins from batch=4: +5% .. +76%)
Q2_K, Q6_K : MMVQ <= 5 (MMQ wins from batch=6: +8% .. +35%)
others : MMVQ <= 8 (legacy & IQ regress under MMQ; unchanged)
Non-AMD-MFMA paths (NVIDIA, RDNA, CDNA1 without MFMA) are byte-identical
to master. GGML_CUDA_FORCE_MMVQ=1 restores the original global threshold
for A/B testing.
Measured on MI250X (gfx90a, ROCm 7.2.1) with Llama-3.2-3B-Instruct,
llama-bench pp512 across all 20 supported quants, ubatch 1..8, 10 reps.
Full table in PR description.
Selected pp512 throughput (tok/s, ub=8):
Q4_K_S: 559 -> 940 (+68%)
Q5_K_S: 503 -> 884 (+76%)
Q3_K_S: 629 -> 879 (+40%)
Q2_K : 615 -> 809 (+32%)
Q6_K : 582 -> 776 (+33%)
Selected pp512 throughput (tok/s, ub=4):
Q4_K_S: 444 -> 480 (+ 8%)
Q4_0 : 682 -> 685 (+ 0%) (no regression - retains MMVQ)
IQ4_XS: 706 -> 698 (- 1%) (no regression - retains MMVQ)
* CUDA: address review — inline MMVQ batch table, drop env hatch & doc block
* tune kernel selection logic for CDNA1
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
parent
0b246862b9
commit
bc81d47aba
3 changed files with 51 additions and 0 deletions
|
|
@ -2570,6 +2570,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
|
||||
use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
} else {
|
||||
|
|
@ -2578,6 +2579,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1], /*n_experts=*/0);
|
||||
use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
|
||||
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
|
||||
use_mul_mat_vec_q = use_mul_mat_vec_q && ggml_cuda_should_use_mmvq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -271,6 +271,53 @@ int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
|
|||
return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11) {
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
if (GGML_CUDA_CC_IS_CDNA1(cc)) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
return ne11 <= 7;
|
||||
case GGML_TYPE_Q5_1:
|
||||
return ne11 <= 7;
|
||||
case GGML_TYPE_Q8_0:
|
||||
return ne11 <= 6;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return ne11 <= 4;
|
||||
case GGML_TYPE_Q3_K:
|
||||
return ne11 <= 3;
|
||||
case GGML_TYPE_Q4_K:
|
||||
return ne11 <= 2;
|
||||
case GGML_TYPE_Q5_K:
|
||||
return ne11 <= 3;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return ne11 <= 4;
|
||||
case GGML_TYPE_IQ1_S:
|
||||
return ne11 <= 5;
|
||||
case GGML_TYPE_IQ2_XXS:
|
||||
case GGML_TYPE_IQ3_S:
|
||||
case GGML_TYPE_IQ4_XS:
|
||||
return ne11 <= 6;
|
||||
default:
|
||||
return ne11 <= MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
switch (type) { // tuned for CDNA2
|
||||
case GGML_TYPE_Q2_K:
|
||||
return ne11 <= 5;
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
case GGML_TYPE_Q5_K:
|
||||
return ne11 <= 3;
|
||||
case GGML_TYPE_Q6_K:
|
||||
return ne11 <= 5;
|
||||
default:
|
||||
return ne11 <= MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
}
|
||||
return ne11 <= MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
// Device constexpr: returns the max batch size for the current arch+type at compile time.
|
||||
template <ggml_type type>
|
||||
static constexpr __device__ int get_mmvq_mmid_max_batch_for_device() {
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
|
||||
|
||||
bool ggml_cuda_should_use_mmvq(enum ggml_type type, int cc, int64_t ne11);
|
||||
|
||||
// Returns the maximum batch size for which MMVQ should be used for MUL_MAT_ID,
|
||||
// based on the quantization type and GPU architecture (compute capability).
|
||||
int get_mmvq_mmid_max_batch(ggml_type type, int cc);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue