From 1f1e57f2bf76df88a668dce924de2bffa89c5c02 Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Tue, 3 Feb 2026 11:33:14 +0100 Subject: [PATCH 01/11] CUDA: Fix loop unrolling for BW in mul_mat_q_stream_k_fixup (#19053) By providing stride_* variables as size_t (i.e., 64-bit) the compiler can correctly unroll the [two for-loops](https://github.com/ggml-org/llama.cpp/blob/557515be1e93ed8939dd8a7c7d08765fdbe8be31/ggml/src/ggml-cuda/mmq.cuh#L3789-L3816) on BW. This gives some perf for prefill/pp phase on BW, while not affecting other SMs: | GPU | Model | Test | t/s master | t/s osimons/fix_bw_mmq_fixup_kernel | Speedup | |:--------------------------------------------------------|:----------------------|:-------|-------------:|--------------------------------------:|----------:| | NVIDIA RTX 6000 Ada Generation | gpt-oss 20B MXFP4 MoE | pp8096 | 8404.05 | 8375.79 | 1.00 | | NVIDIA RTX 6000 Ada Generation | llama 3B Q4_K_M | pp8096 | 16148.93 | 16019.60 | 0.99 | | NVIDIA RTX 6000 Ada Generation | llama 8B Q4_0 | pp8096 | 8008.29 | 7978.80 | 1.00 | | NVIDIA RTX 6000 Ada Generation | nemotron_h 9B BF16 | pp8096 | 4263.16 | 4248.53 | 1.00 | | NVIDIA RTX 6000 Ada Generation | nemotron_h 9B Q4_K_M | pp8096 | 5165.11 | 5157.43 | 1.00 | | NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition | gpt-oss 20B MXFP4 MoE | pp8096 | 12582.80 | 12758.37 | 1.01 | | NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition | llama 3B Q4_K_M | pp8096 | 16879.10 | 17619.47 | 1.04 | | NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition | llama 8B Q4_0 | pp8096 | 10649.90 | 10982.65 | 1.03 | | NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition | nemotron_h 9B BF16 | pp8096 | 7717.73 | 7716.22 | 1.00 | | NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition | nemotron_h 9B Q4_K_M | pp8096 | 7301.90 | 7370.38 | 1.01 | --- ggml/src/ggml-cuda/mmq.cuh | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index a382e6a69..f80f98cda 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -3697,13 +3697,20 @@ static __global__ void mul_mat_q( tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); } - template -static __global__ void mul_mat_q_stream_k_fixup( - const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile, - const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst, - const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst, - const int ncols_max) { +static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst, + const int32_t * expert_bounds, + float * __restrict__ dst, + const float * __restrict__ tmp_last_tile, + const int ncols_x, + const int nrows_x, + const int ncols_dst, + const size_t stride_col_dst, + const int nchannels_y, + const size_t stride_channel_dst, + const int nsamples_y, + const size_t stride_sample_dst, + const int ncols_max) { constexpr int mmq_y = get_mmq_y_device(); constexpr int qk = ggml_cuda_type_traits::qk; constexpr int ITER_K = get_iter_k(type); From c55bce415926c7e8484f20d854afebe0168f7513 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 3 Feb 2026 13:43:29 +0200 Subject: [PATCH 02/11] metal : minor cleanup (#19251) --- ggml/src/ggml-metal/ggml-metal-impl.h | 4 +- ggml/src/ggml-metal/ggml-metal-ops.cpp | 38 ++++-------- ggml/src/ggml-metal/ggml-metal.metal | 86 +++++++------------------- 3 files changed, 34 insertions(+), 94 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 59d88b01a..e074f2ef3 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -81,10 +81,10 @@ #define FC_COUNT_EQUAL 1000 // op-specific constants -#define OP_FLASH_ATTN_EXT_NQPTG 8 +#define OP_FLASH_ATTN_EXT_NQPSG 8 #define OP_FLASH_ATTN_EXT_NCPSG 64 -#define OP_FLASH_ATTN_EXT_VEC_NQPTG 1 +#define OP_FLASH_ATTN_EXT_VEC_NQPSG 1 #define OP_FLASH_ATTN_EXT_VEC_NCPSG 32 // kernel argument structs diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 7f4cfbba2..f97c4435d 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -2295,7 +2295,7 @@ size_t ggml_metal_op_flash_attn_ext_extra_blk(const ggml_tensor * op) { // return res; //} - const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPTG : OP_FLASH_ATTN_EXT_NQPTG; + const int nqptg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NQPSG : OP_FLASH_ATTN_EXT_NQPSG; const int ncpsg = is_vec ? OP_FLASH_ATTN_EXT_VEC_NCPSG : OP_FLASH_ATTN_EXT_NCPSG; const int64_t ne1 = (ne01 + nqptg - 1)/nqptg; @@ -2411,7 +2411,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) { if (!ggml_metal_op_flash_attn_ext_use_vec(op)) { // half8x8 kernel - const int nqptg = OP_FLASH_ATTN_EXT_NQPTG; // queries per threadgroup + const int nqptg = OP_FLASH_ATTN_EXT_NQPSG; // queries per threadgroup const int ncpsg = OP_FLASH_ATTN_EXT_NCPSG; // cache values per simdgroup GGML_ASSERT(nqptg <= 32); @@ -2578,9 +2578,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) { #undef FATTN_SMEM } else { // half4x4 kernel - const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPTG; // queries per threadgroup + const int nqptg = OP_FLASH_ATTN_EXT_VEC_NQPSG; // queries per threadgroup const int ncpsg = OP_FLASH_ATTN_EXT_VEC_NCPSG; // cache values per simdgroup !! sync with kernel template arguments !! - const int nkpsg = 1*ncpsg; + const int nhptg = 1; // heads per threadgroup GGML_ASSERT(nqptg <= 32); GGML_ASSERT(nqptg % 1 == 0); @@ -2632,6 +2632,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) { ggml_metal_op_concurrency_reset(ctx); } + // note: for simplicity assume the K is larger or equal than V + GGML_ASSERT(ne10 >= ne20); + // ne00 + 2*ncpsg*(nsg) // for each query, we load it as f16 in shared memory (ne00) // and store the soft_max values and the mask @@ -2639,28 +2642,9 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) { // ne20*(nsg) // each simdgroup has a full f32 head vector in shared mem to accumulate results // -#define FATTN_SMEM(nsg) (GGML_PAD((nqptg*(GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*GGML_PAD(ne20, 128)*(nsg))*(sizeof(float)/2), 16)) - - int64_t nsgmax = 2; - while (true) { - const size_t smem = FATTN_SMEM(nsgmax); - // avoid using more than half of the threadgroup memory - can cause slow downs especially for large head sizes - if (smem > props_dev->max_theadgroup_memory_size/2) { - break; - } - nsgmax *= 2; - } - nsgmax /= 2; - - // simdgroups per threadgroup (a.k.a. warps) - //const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))); - const int64_t nsgt = MAX(2, MIN(nsgmax, MIN((ne11 + nkpsg - 1)/(nkpsg), (int64_t) 1024/32))); +#define FATTN_SMEM(nsg) (GGML_PAD(((GGML_PAD(ne00, 128) + 4*ncpsg + 2*GGML_PAD(ne20, 128))*(nsg))*(sizeof(float)/2), 16)) int64_t nsg = 1; - while (nsg <= nsgt) { - nsg *= 2; - } - nsg /= 2; // workgroups // each workgroup handles nsg*nkpsg cache values @@ -2673,7 +2657,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) { } else { nwg = 32; nsg = 1; - while (2*nwg*nsg*nkpsg < ne11 && nsg < 4) { + while (2*nwg*nsg*ncpsg < ne11 && nsg < 4) { nsg *= 2; } } @@ -2739,7 +2723,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0); - ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1); + ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/nhptg, ne03*nwg, 32, nsg, 1); } else { // sanity checks assert(ggml_metal_op_flash_attn_ext_extra_tmp(op) != 0); @@ -2752,7 +2736,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_set_buffer(enc, bid_tmp, 7); ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0); - ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg, 32, nsg, 1); + ggml_metal_encoder_dispatch_threadgroups(enc, (ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/nhptg, ne03*nwg, 32, nsg, 1); // sync the 2 kernels ggml_metal_op_concurrency_reset(ctx); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 17e358d1a..3259213fd 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -5931,7 +5931,7 @@ template< void (*deq_v)(device const vd4x4_t *, short, thread v4x4_t &), short DK, // K head size short DV, // V head size - short Q = OP_FLASH_ATTN_EXT_NQPTG, // queries per threadgroup + short Q = OP_FLASH_ATTN_EXT_NQPSG, // queries per threadgroup short C = OP_FLASH_ATTN_EXT_NCPSG> // cache items per threadgroup kernel void kernel_flash_attn_ext( constant ggml_metal_kargs_flash_attn_ext & args, @@ -6141,11 +6141,10 @@ template< void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &), short DK, // K head size short DV, // V head size - short NE, // head elements per thread - short Q, // queries per threadgroup - short C, // cache items per threadgroup - short NSG> // number of simd groups -void kernel_flash_attn_ext_vec_impl( + short NE = 4, // head elements per thread + short Q = OP_FLASH_ATTN_EXT_VEC_NQPSG, // queries per threadgroup + short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup +kernel void kernel_flash_attn_ext_vec( constant ggml_metal_kargs_flash_attn_ext_vec & args, device const char * q, device const char * k, @@ -6162,6 +6161,7 @@ void kernel_flash_attn_ext_vec_impl( static_assert(DV % 32 == 0, "DV must be divisible by 32"); #define NWG (FC_flash_attn_ext_vec_nwg) +#define NSG (FC_flash_attn_ext_vec_nsg) #define NS10 (FC_flash_attn_ext_vec_ns10) #define NS20 (FC_flash_attn_ext_vec_ns20) @@ -6190,12 +6190,12 @@ void kernel_flash_attn_ext_vec_impl( const short T = PK + NSG*SH; // shared memory size per query in (half) - //threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data - threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t - threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + Q*PK); // scratch buffer for attention - threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + Q*PK); // same as above but in s4_t - threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + Q*PK); // scratch buffer for mask - threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + Q*T); // scratch buffer for the results + //threadgroup q_t * sq = (threadgroup q_t *) (shmem_f16 + 0*PK); // holds the query data + threadgroup q4_t * sq4 = (threadgroup q4_t *) (shmem_f16 + 0*PK); // same as above but in q4_t + threadgroup s_t * ss = (threadgroup s_t *) (shmem_f16 + sgitg*SH + NSG*PK); // scratch buffer for attention + threadgroup s4_t * ss4 = (threadgroup s4_t *) (shmem_f16 + sgitg*SH + NSG*PK); // same as above but in s4_t + threadgroup half * sm = (threadgroup half *) (shmem_f16 + sgitg*SH + 2*C + NSG*PK); // scratch buffer for mask + threadgroup o4_t * so4 = (threadgroup o4_t *) (shmem_f16 + 2*sgitg*PV + NSG*PK + NSG*SH); // scratch buffer for the results // store the result for all queries in shared memory (the O matrix from the paper) so4 += tiisg; @@ -6213,11 +6213,13 @@ void kernel_flash_attn_ext_vec_impl( // load heads from Q to shared memory device const float4 * q4 = (device const float4 *) ((device const char *) q); - for (short i = tiisg; i < PK4; i += NW) { - if (iq1 < args.ne01 && i < DK4) { - sq4[i] = (q4_t) q4[i]; - } else { - sq4[i] = (q4_t) 0.0f; + if (iq1 < args.ne01) { + for (short i = tiisg; i < PK4; i += NW) { + if (i < DK4) { + sq4[i] = (q4_t) q4[i]; + } else { + sq4[i] = (q4_t) 0.0f; + } } } @@ -6295,7 +6297,7 @@ void kernel_flash_attn_ext_vec_impl( } // skip -INF blocks - if (simd_max(sm[tiisg]) == -INFINITY) { + if (simd_max(sm[tiisg]) <= -MAXHALF) { continue; } @@ -6569,57 +6571,11 @@ void kernel_flash_attn_ext_vec_impl( } #undef NWG +#undef NSG #undef NS10 #undef NS20 } -template< - typename q4_t, // query types in shared memory - typename k4_t, // key types in shared memory - typename v4_t, // value types in shared memory - typename qk_t, // Q*K types - typename s_t, // soft-max types - typename s4_t, - typename o4_t, // attention accumulation types - typename kd4_t, // key type in device memory - short nl_k, - void (*deq_k_t4)(device const kd4_t *, short, thread k4_t &), - typename vd4_t, // value type in device memory - short nl_v, - void (*deq_v_t4)(device const vd4_t *, short, thread v4_t &), - short DK, // K head size - short DV, // V head size - short NE = 4, // head elements per thread - short Q = OP_FLASH_ATTN_EXT_VEC_NQPTG, // queries per threadgroup - short C = OP_FLASH_ATTN_EXT_VEC_NCPSG> // cache items per threadgroup -kernel void kernel_flash_attn_ext_vec( - constant ggml_metal_kargs_flash_attn_ext_vec & args, - device const char * q, - device const char * k, - device const char * v, - device const char * mask, - device const char * sinks, - device const char * pad, - device char * dst, - threadgroup half * shmem_f16 [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - ushort tiisg[[thread_index_in_simdgroup]], - ushort sgitg[[simdgroup_index_in_threadgroup]]) { -#define FWD_TMPL q4_t, k4_t, v4_t, qk_t, s_t, s4_t, o4_t, kd4_t, nl_k, deq_k_t4, vd4_t, nl_v, deq_v_t4, DK, DV, NE, Q, C -#define FWD_ARGS args, q, k, v, mask, sinks, pad, dst, shmem_f16, tgpig, tiisg, sgitg - switch (FC_flash_attn_ext_vec_nsg) { - // note: disabled cases to reduce library load time - case 1: kernel_flash_attn_ext_vec_impl(FWD_ARGS); break; - case 2: kernel_flash_attn_ext_vec_impl(FWD_ARGS); break; - case 4: kernel_flash_attn_ext_vec_impl(FWD_ARGS); break; - //case 8: kernel_flash_attn_ext_vec_impl(FWD_ARGS); break; - //case 16: kernel_flash_attn_ext_vec_impl(FWD_ARGS); break; - //case 32: kernel_flash_attn_ext_vec_impl(FWD_ARGS); break; - } -#undef FWD_TMPL -#undef FWD_ARGS -} - // note: I think the s_t can be half instead of float, because the Q*K scaling is done before storing to shared mem // in the other (non-vec) kernel, we need s_t to also be float because we scale during the soft_max // From a6fd8ca1fee621addff1695165414c4822fb08bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 3 Feb 2026 14:20:57 +0100 Subject: [PATCH 03/11] models : remove unnecessary cont in openelm (#19289) --- src/models/openelm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp index ee46a3375..fbf682ec8 100644 --- a/src/models/openelm.cpp +++ b/src/models/openelm.cpp @@ -43,7 +43,7 @@ llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head); cb(Kcur, "Kcur", il); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)); cb(Vcur, "Vcur", il); Qcur = build_norm(Qcur, From 8bece2eb20f0134632ae229849fbde6559882d36 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 3 Feb 2026 23:31:23 +0800 Subject: [PATCH 04/11] CUDA: use mmvq for mul-mat-id for small batch sizes (#18958) * CUDA: use mmvq for mul-mat-id for small batch sizes * add mmvq too * Fix perf issue on ampere. Use mmvf mm-id only for non-nvidia GPUs * templatize multi_token_path --- ggml/src/ggml-cuda/ggml-cuda.cu | 14 ++- ggml/src/ggml-cuda/mmvf.cu | 194 +++++++++++++++++++++----------- ggml/src/ggml-cuda/mmvf.cuh | 2 + ggml/src/ggml-cuda/mmvq.cu | 135 ++++++++++++++-------- 4 files changed, 224 insertions(+), 121 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 1bcd1ab1f..eeb8625db 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2279,13 +2279,19 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - if (ne2 == 1) { + static_assert(MMVQ_MAX_BATCH_SIZE == MMVF_MAX_BATCH_SIZE); + if (ne2 <= MMVQ_MAX_BATCH_SIZE) { if (ggml_is_quantized(src0->type)) { - ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); + if (ne2 <= 4) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); + return; + } } else { - ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst); + if (GGML_CUDA_CC_IS_AMD(cc)) { + ggml_cuda_mul_mat_vec_f(ctx, src0, src1, ids, dst); + return; + } } - return; } if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) { diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 32948e4d7..d91472024 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -4,26 +4,48 @@ #include "mmvf.cuh" #include "convert.cuh" -template +template static __global__ void mul_mat_vec_f( const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst, - const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst, + const int ncols2, const uint3 nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst, const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, - const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { + const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + const int ids_stride) { const int row = blockIdx.x; + // for MUL_MAT_ID - blockIdx.y = n_expert_used, blockIdx.z = ncols_dst (tokens) const int channel_dst = blockIdx.y; - const int channel_x = ids ? ids[channel_dst] : fastdiv((uint32_t) channel_dst, channel_ratio); - const int channel_y = ids ? channel_dst % nchannels_y : channel_dst; - const int sample_dst = blockIdx.z; + const int tid = threadIdx.x; + + int token_idx; + int channel_x; + int channel_y; + int sample_dst; + + if constexpr (is_multi_token_id) { + // Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case + token_idx = blockIdx.z; + channel_x = ids[channel_dst + token_idx * ids_stride]; + channel_y = fastmodulo(channel_dst, nchannels_y); + sample_dst = 0; + } else { + token_idx = ids ? blockIdx.z : 0; + channel_x = ids ? ids[blockIdx.y + token_idx * ids_stride] : fastdiv((uint32_t) channel_dst, channel_ratio); + channel_y = ids ? fastmodulo(blockIdx.y, nchannels_y) : channel_dst; + sample_dst = ids ? 0 : blockIdx.z; + } + const int sample_x = fastdiv((uint32_t) sample_dst, sample_ratio); const int sample_y = sample_dst; - const int tid = threadIdx.x; constexpr int warp_size = ggml_cuda_get_physical_warp_size(); x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; + if constexpr (is_multi_token_id) { + y += token_idx*stride_col_y2*2; + dst += token_idx*stride_col_dst; + } bool use_gate = false; bool use_bias = false; @@ -56,8 +78,10 @@ static __global__ void mul_mat_vec_f( if (use_gate) { gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; } + + const int channel_bias = ids ? channel_x : channel_dst; + if constexpr (has_fusion) { - const int channel_bias = ids ? channel_x : channel_dst; if (use_bias) { x_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst; } @@ -349,36 +373,36 @@ static __global__ void mul_mat_vec_f( } } -template +template static void mul_mat_vec_f_switch_fusion( const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst, - const int64_t ncols, const int64_t nrows, + const int64_t ncols, const uint3 nchannels_y, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, - const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const cudaStream_t stream) { + const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const int ids_stride, const cudaStream_t stream) { const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (ncols_dst == 1) { if (has_fusion) { - mul_mat_vec_f<<>> - (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); return; } } GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - mul_mat_vec_f<<>> - (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); } -template +template void launch_mul_mat_vec_f_cuda( const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst, const int64_t ncols, const int64_t nrows, @@ -386,12 +410,13 @@ void launch_mul_mat_vec_f_cuda( const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { + const int64_t nsamples_or_ntokens, const int64_t ids_stride, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(stride_row % 2 == 0); GGML_ASSERT(stride_col_y % 2 == 0); GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); GGML_ASSERT( nsamples_dst % nsamples_x == 0); + const uint3 nchannels_y_fd = ids ? init_fastdiv_values(nchannels_y) : make_uint3(0, 0, 0); const uint3 channel_ratio_fd = ids ? make_uint3(0, 0, 0) : init_fastdiv_values(nchannels_dst / nchannels_x); const uint3 sample_ratio_fd = init_fastdiv_values(nsamples_dst / nsamples_x); @@ -415,56 +440,56 @@ void launch_mul_mat_vec_f_cuda( const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; const int nbytes_shared = warp_size*sizeof(float) + (has_fusion ? warp_size*sizeof(float) : 0); - const dim3 block_nums(nrows, nchannels_dst, nsamples_dst); + const dim3 block_nums(nrows, nchannels_dst, nsamples_or_ntokens); const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { - mul_mat_vec_f_switch_fusion - (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f_switch_fusion + (x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream); } break; case 64: { - mul_mat_vec_f_switch_fusion - (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f_switch_fusion + (x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream); } break; case 96: { - mul_mat_vec_f_switch_fusion - (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f_switch_fusion + (x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream); } break; case 128: { - mul_mat_vec_f_switch_fusion - (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f_switch_fusion + (x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream); } break; case 160: { - mul_mat_vec_f_switch_fusion - (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f_switch_fusion + (x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream); } break; case 192: { - mul_mat_vec_f_switch_fusion - (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f_switch_fusion + (x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream); } break; case 224: { - mul_mat_vec_f_switch_fusion - (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f_switch_fusion + (x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream); } break; case 256: { - mul_mat_vec_f_switch_fusion - (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f_switch_fusion + (x, y, ids, fusion, dst, ncols/2, nchannels_y_fd, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, ids_stride, stream); } break; default: { GGML_ABORT("fatal error"); @@ -480,55 +505,88 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst( const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { + const int64_t ids_stride, cudaStream_t stream) { + + const bool has_ids = ids != nullptr; + + if (has_ids && ncols_dst > 1) { + // Multi-token MUL_MAT_ID path only - single-token goes through regular path below + constexpr int c_ncols_dst = 1; + launch_mul_mat_vec_f_cuda + (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + ncols_dst, ids_stride, stream); + return; + } + + if (has_ids) { + // Single-token MUL_MAT_ID path + constexpr int c_ncols_dst = 1; + launch_mul_mat_vec_f_cuda + (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + ncols_dst, ids_stride, stream); + return; + } + switch (ncols_dst) { case 1: launch_mul_mat_vec_f_cuda (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + nsamples_dst, ids_stride, stream); break; case 2: launch_mul_mat_vec_f_cuda (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + nsamples_dst, ids_stride, stream); break; case 3: launch_mul_mat_vec_f_cuda (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + nsamples_dst, ids_stride, stream); break; case 4: launch_mul_mat_vec_f_cuda (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + nsamples_dst, ids_stride, stream); break; case 5: launch_mul_mat_vec_f_cuda (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + nsamples_dst, ids_stride, stream); break; case 6: launch_mul_mat_vec_f_cuda (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + nsamples_dst, ids_stride, stream); break; case 7: launch_mul_mat_vec_f_cuda (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + nsamples_dst, ids_stride, stream); break; case 8: launch_mul_mat_vec_f_cuda (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + nsamples_dst, ids_stride, stream); break; default: GGML_ABORT("fatal error"); @@ -544,21 +602,21 @@ static void mul_mat_vec_f_cuda( const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - enum ggml_prec prec, cudaStream_t stream) { + const int64_t ids_stride, enum ggml_prec prec, cudaStream_t stream) { if constexpr(std::is_same_v) { if (prec == GGML_PREC_DEFAULT) { mul_mat_vec_f_cuda_switch_ncols_dst (x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); return; } } mul_mat_vec_f_cuda_switch_ncols_dst (x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); } void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, @@ -573,7 +631,7 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor const size_t ts_src1 = ggml_type_size(src1->type); const size_t ts_dst = ggml_type_size(dst->type); - GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. + GGML_ASSERT(!ids || ne12 <= MMVF_MAX_BATCH_SIZE); GGML_ASSERT(ne13 == ne3); GGML_ASSERT( nb00 == ts_src0); @@ -626,29 +684,31 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor const int64_t ncols_dst = ids ? ne2 : ne1; const int64_t nchannels_y = ids ? ne11 : ne12; const int64_t nchannels_dst = ids ? ne1 : ne2; + const int64_t stride_col_dst = ids ? s2 : s1; + const int64_t stride_col_y = ids ? s12 : s11; const int64_t stride_channel_dst = ids ? s1 : s2; const int64_t stride_channel_y = ids ? s11 : s12; - GGML_ASSERT(!ids || ncols_dst == 1); + const int64_t ids_stride = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0; switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0->data; - mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, + mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, - ne03, ne3, s03, s13, s3, prec, ctx.stream()); + ne03, ne3, s03, s13, s3, ids_stride, prec, ctx.stream()); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, + mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, - ne03, ne3, s03, s13, s3, prec, ctx.stream()); + ne03, ne3, s03, s13, s3, ids_stride, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, + mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, - ne03, ne3, s03, s13, s3, prec, ctx.stream()); + ne03, ne3, s03, s13, s3, ids_stride, prec, ctx.stream()); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); @@ -695,19 +755,19 @@ void ggml_cuda_op_mul_mat_vec_f( const float * src0_d = (const float *) src0_dd_i; mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, 0, prec, stream); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, 0, prec, stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, 0, prec, stream); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); diff --git a/ggml/src/ggml-cuda/mmvf.cuh b/ggml/src/ggml-cuda/mmvf.cuh index a09fbdc72..a50f7c021 100644 --- a/ggml/src/ggml-cuda/mmvf.cuh +++ b/ggml/src/ggml-cuda/mmvf.cuh @@ -1,5 +1,7 @@ #include "common.cuh" +#define MMVF_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVF kernels. + void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index d671551c1..ce25ccf42 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -137,15 +137,15 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int return 1; } -// tell the compiler to use as many registers as it wants, see nwarps definition below -template +template __launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst, const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio, - const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst) { + const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst, + const uint32_t ids_stride) { constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; @@ -162,11 +162,25 @@ static __global__ void mul_mat_vec_q( const int blocks_per_row_x = ncols_x / qk; constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi; - // The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1. const uint32_t channel_dst = blockIdx.y; - const uint32_t channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio); - const uint32_t channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst; - const uint32_t sample_dst = blockIdx.z; + + uint32_t token_idx = 0; + uint32_t channel_x; + uint32_t channel_y; + uint32_t sample_dst; + + if constexpr (is_multi_token_id) { + // Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case + token_idx = blockIdx.z; + channel_x = ids[channel_dst + token_idx * ids_stride]; + channel_y = fastmodulo(channel_dst, nchannels_y); + sample_dst = 0; + } else { + channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio); + channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst; + sample_dst = blockIdx.z; + } + const uint32_t sample_x = fastdiv(sample_dst, sample_ratio); const uint32_t sample_y = sample_dst; @@ -188,11 +202,11 @@ static __global__ void mul_mat_vec_q( active_glu = fusion.glu_op; } - const uint32_t channel_bias = ids ? channel_x : channel_dst; float x_biases[ncols_dst] = { 0.0f }; float gate_biases[ncols_dst] = { 0.0f }; if constexpr (has_fusion) { + const uint32_t channel_bias = ids ? channel_x : channel_dst; if (use_bias) { x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0; // 1. Hide latency by prefetching bias and gate here @@ -222,6 +236,9 @@ static __global__ void mul_mat_vec_q( float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}}; const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; + if constexpr (is_multi_token_id) { + y += token_idx*stride_col_y; + } const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { @@ -275,6 +292,10 @@ static __global__ void mul_mat_vec_q( dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0; + if constexpr (is_multi_token_id) { + dst += token_idx*stride_col_dst; + } + // sum up partial sums and write back result #pragma unroll for (int j = 0; j < ncols_dst; ++j) { @@ -335,40 +356,41 @@ static __global__ void mul_mat_vec_q( } static std::pair calc_launch_params( - const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y, + const int ncols_dst, const int nrows_x, const int nchannels_dst, const int nsamples_or_ntokens, const int warp_size, const mmvq_parameter_table_id table_id) { const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id); - const dim3 block_nums(nblocks, nchannels_y, nsamples_y); + const dim3 block_nums(nblocks, nchannels_dst, nsamples_or_ntokens); const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1); return {block_nums, block_dims}; } -template +template static void mul_mat_vec_q_switch_fusion( const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst, const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio, const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst, - const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared, cudaStream_t stream) { + const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared, + const uint32_t ids_stride, cudaStream_t stream) { const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (c_ncols_dst == 1) { if (has_fusion) { - mul_mat_vec_q<<>> + mul_mat_vec_q<<>> (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); return; } } GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - mul_mat_vec_q<<>> + mul_mat_vec_q<<>> (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); } template @@ -379,7 +401,7 @@ static void mul_mat_vec_q_switch_ncols_dst( const int nchannels_x, const int nchannels_y, const int nchannels_dst, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, - cudaStream_t stream) { + const int ids_stride, cudaStream_t stream) { GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE); @@ -393,8 +415,19 @@ static void mul_mat_vec_q_switch_ncols_dst( const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc); const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; + const bool has_ids = ids != nullptr; + + if (has_ids && ncols_dst > 1) { + // Multi-token MUL_MAT_ID path only - single-token goes through regular path below + constexpr int c_ncols_dst = 1; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, ncols_dst, warp_size, table_id); + mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, + dims.first, dims.second, 0, ids_stride, stream); + return; + } - GGML_ASSERT(!ids || ncols_dst == 1); switch (ncols_dst) { case 1: { constexpr int c_ncols_dst = 1; @@ -402,7 +435,7 @@ static void mul_mat_vec_q_switch_ncols_dst( mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, - dims.first, dims.second, 0, stream); + dims.first, dims.second, 0, ids_stride, stream); } break; case 2: { constexpr int c_ncols_dst = 2; @@ -410,7 +443,7 @@ static void mul_mat_vec_q_switch_ncols_dst( mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, - dims.first, dims.second, 0, stream); + dims.first, dims.second, 0, ids_stride, stream); } break; case 3: { constexpr int c_ncols_dst = 3; @@ -418,7 +451,7 @@ static void mul_mat_vec_q_switch_ncols_dst( mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, - dims.first, dims.second, 0, stream); + dims.first, dims.second, 0, ids_stride, stream); } break; case 4: { constexpr int c_ncols_dst = 4; @@ -426,7 +459,7 @@ static void mul_mat_vec_q_switch_ncols_dst( mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, - dims.first, dims.second, 0, stream); + dims.first, dims.second, 0, ids_stride, stream); } break; case 5: { constexpr int c_ncols_dst = 5; @@ -434,7 +467,7 @@ static void mul_mat_vec_q_switch_ncols_dst( mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, - dims.first, dims.second, 0, stream); + dims.first, dims.second, 0, ids_stride, stream); } break; case 6: { constexpr int c_ncols_dst = 6; @@ -442,7 +475,7 @@ static void mul_mat_vec_q_switch_ncols_dst( mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, - dims.first, dims.second, 0, stream); + dims.first, dims.second, 0, ids_stride, stream); } break; case 7: { constexpr int c_ncols_dst = 7; @@ -450,7 +483,7 @@ static void mul_mat_vec_q_switch_ncols_dst( mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, - dims.first, dims.second, 0, stream); + dims.first, dims.second, 0, ids_stride, stream); } break; case 8: { constexpr int c_ncols_dst = 8; @@ -458,7 +491,7 @@ static void mul_mat_vec_q_switch_ncols_dst( mul_mat_vec_q_switch_fusion(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, - dims.first, dims.second, 0, stream); + dims.first, dims.second, 0, ids_stride, stream); } break; default: GGML_ABORT("fatal error"); @@ -474,127 +507,127 @@ static void mul_mat_vec_q_switch_type( const int nchannels_x, const int nchannels_y, const int nchannels_dst, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, - cudaStream_t stream) { + const int ids_stride, cudaStream_t stream) { switch (type_x) { case GGML_TYPE_Q4_0: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_Q4_1: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_Q5_0: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_Q5_1: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_Q8_0: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_MXFP4: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_Q2_K: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_Q3_K: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_Q4_K: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_Q5_K: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_Q6_K: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_IQ2_XXS: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_IQ2_XS: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_IQ2_S: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_IQ3_XXS: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_IQ1_S: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_IQ1_M: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_IQ4_NL: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_IQ4_XS: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; case GGML_TYPE_IQ3_S: mul_mat_vec_q_switch_ncols_dst (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream); break; default: GGML_ABORT("fatal error"); @@ -622,7 +655,7 @@ void ggml_cuda_mul_mat_vec_q( GGML_ASSERT( nb0 == ts_dst); GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); - GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. + GGML_ASSERT(!ids || ne12 <= MMVQ_MAX_BATCH_SIZE); const float * src1_d = (const float *) src1->data; const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; @@ -693,11 +726,13 @@ void ggml_cuda_mul_mat_vec_q( const int64_t stride_channel_dst = ids ? s1 : s2; const int64_t stride_channel_y = ids ? s11 : s12; + const int64_t ids_stride = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0; + mul_mat_vec_q_switch_type( src0->data, src0->type, src1_q8_1.get(), ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, - ne03, ne3, s03, s13, s3, stream); + ne03, ne3, s03, s13, s3, ids_stride, stream); } void ggml_cuda_op_mul_mat_vec_q( @@ -726,7 +761,7 @@ void ggml_cuda_op_mul_mat_vec_q( ggml_cuda_mm_fusion_args_device fusion_local{}; mul_mat_vec_q_switch_type( src0_dd_i, src0->type, src1_ddq_i, nullptr, fusion_local, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream); + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, stream); GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size); } From 32b17abdb044a17cfeb6cdf26215cfa01756fea4 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 3 Feb 2026 17:37:32 +0100 Subject: [PATCH 05/11] vulkan: disable coopmat1 fa on Nvidia Turing (#19290) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index a99375c08..cb7fa2c9c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -254,6 +254,7 @@ enum vk_device_architecture { AMD_RDNA3, INTEL_XE2, NVIDIA_PRE_TURING, + NVIDIA_TURING, }; static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { @@ -336,18 +337,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& const std::vector ext_props = device.enumerateDeviceExtensionProperties(); bool cooperative_matrix = false; + bool sm_builtins = false; // Detect "pre-turing" based on lack of coopmat support. for (const auto& properties : ext_props) { if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) { cooperative_matrix = true; - break; + } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { + sm_builtins = true; } } if (!cooperative_matrix) { return vk_device_architecture::NVIDIA_PRE_TURING; } + + if (sm_builtins) { + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; + + props2.pNext = &sm_props; + + device.getProperties2(&props2); + + // Turing has 32, following architectures have 48 + if (sm_props.shaderWarpsPerSM == 32) { + return vk_device_architecture::NVIDIA_TURING; + } + } } return vk_device_architecture::OTHER; } @@ -8460,6 +8477,11 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx FaCodePath path = ctx->device->coopmat2 ? FA_COOPMAT2 : ctx->device->coopmat1_fa_support ? FA_COOPMAT1 : FA_SCALAR; + if (path == FA_COOPMAT1 && ctx->device->architecture == vk_device_architecture::NVIDIA_TURING) { + // Nvidia compiler bug, see https://github.com/ggml-org/llama.cpp/pull/19075#issuecomment-3820716090 + path = FA_SCALAR; + } + if (path == FA_COOPMAT1) { const bool coopmat_shape_supported = (dst->op_params[3] == GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f32acc) || (dst->op_params[3] != GGML_PREC_F32 && ctx->device->coopmat_support_16x16x16_f16acc); From faa1bc26eed0e56539ac5e7a751607d76f01f10f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 3 Feb 2026 22:16:16 +0200 Subject: [PATCH 06/11] sampling : delegate input allocation to the scheduler (#19266) * sampling : delegate input allocation to the scheduler * graph : compute backend samplers only if needed --- src/llama-context.cpp | 6 +--- src/llama-graph.cpp | 19 ++++++---- src/llama-sampling.cpp | 81 ++++++++++-------------------------------- 3 files changed, 33 insertions(+), 73 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 203852d0f..95b207e9e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1027,11 +1027,7 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) { llama_sampler_chain_n(sampler) > 0; if (sampler && can_offload) { - ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(model.dev_output()); - auto * host_buft = ggml_backend_dev_host_buffer_type(model.dev_output()); - if (host_buft) { - buft = host_buft; - } + auto * buft = ggml_backend_dev_buffer_type(model.dev_output()); sampler->iface->backend_init(sampler, buft); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 16d42c4ae..54f4ed248 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -2419,6 +2419,9 @@ void llm_graph_context::build_sampling() const { return; } + std::array outs; + outs[0] = res->t_logits; + auto inp_sampling = std::make_unique(samplers); res->add_input(std::move(inp_sampling)); @@ -2439,14 +2442,14 @@ void llm_graph_context::build_sampling() const { // add a dummy row of logits // this trick makes the graph static, regardless of which samplers are activated // this is important in order to minimize graph reallocations - // TODO: use `ggml_build_forward_select()` when available (https://github.com/ggml-org/llama.cpp/pull/18550) ggml_tensor * logits_t = ggml_pad(ctx0, res->t_logits, 0, 1, 0, 0); for (const auto & [seq_id, sampler] : samplers) { const auto it = seq_to_logit_row.find(seq_id); // inactive samplers always work on the first row - const auto row_idx = seq_to_logit_row.find(seq_id) != seq_to_logit_row.end() ? it->second : 0; + const auto row_idx = it != seq_to_logit_row.end() ? it->second : 0; + const int i_out = it != seq_to_logit_row.end() ? 1 : 0; ggml_tensor * logits_seq = ggml_view_1d(ctx0, logits_t, logits_t->ne[0], row_idx * logits_t->nb[1]); ggml_format_name(logits_seq, "logits_seq_%d", seq_id); @@ -2463,22 +2466,26 @@ void llm_graph_context::build_sampling() const { if (data.sampled != nullptr) { res->t_sampled[seq_id] = data.sampled; - ggml_build_forward_expand(gf, data.sampled); + outs[1] = data.sampled; + ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } if (data.probs != nullptr) { res->t_sampled_probs[seq_id] = data.probs; - ggml_build_forward_expand(gf, data.probs); + outs[1] = data.probs; + ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } if (data.logits != nullptr) { res->t_sampled_logits[seq_id] = data.logits; - ggml_build_forward_expand(gf, data.logits); + outs[1] = data.logits; + ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } if (data.candidates != nullptr) { res->t_candidates[seq_id] = data.candidates; - ggml_build_forward_expand(gf, data.candidates); + outs[1] = data.candidates; + ggml_build_forward_select(gf, outs.data(), outs.size(), i_out); } } diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 5dde51306..515d6c163 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1025,11 +1025,7 @@ struct llama_sampler_dist : public llama_sampler_backend { std::mt19937 rng; - // backend input - struct ggml_tensor * inp_uniform; - - ggml_context_ptr inp_ctx; - ggml_backend_buffer_ptr inp_buf; + ggml_tensor * inp_uniform; }; static const char * llama_sampler_dist_name(const struct llama_sampler * smpl) { @@ -1138,37 +1134,10 @@ static bool llama_sampler_dist_backend_init( ggml_backend_buffer_type_t buft) { auto * sctx = (llama_sampler_dist *) smpl->ctx; - // allocate inputs - { - ggml_init_params params = { - /*.mem_size =*/ ggml_tensor_overhead(), - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; - - sctx->inp_ctx.reset(ggml_init(params)); - - // Create the uniform random scalar input tensor. This will be set by - // llama_sampler_dist_backend_set_input after this graph is built. - sctx->inp_uniform = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1); - ggml_set_name (sctx->inp_uniform, "uniform"); - ggml_set_input(sctx->inp_uniform); - - // Allocate all tensors from our context to the backend - sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft)); - - ggml_backend_buffer_clear(sctx->inp_buf.get(), 0); - } - const bool res = llama_sampler_backend_support(smpl, buft); sctx->init(res); - if (!res) { - sctx->inp_ctx.reset(nullptr); - sctx->inp_buf.reset(nullptr); - } - return res; } @@ -1178,8 +1147,13 @@ static void llama_sampler_dist_backend_apply( struct ggml_cgraph * gf, struct llama_sampler_data * data) { GGML_UNUSED(gf); + auto * sctx = (llama_sampler_dist *) smpl->ctx; + sctx->inp_uniform = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + ggml_set_name (sctx->inp_uniform, "uniform"); + ggml_set_input(sctx->inp_uniform); + struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits); ggml_set_name(probs, "dist_probs"); @@ -1226,6 +1200,7 @@ static void llama_sampler_dist_backend_apply( static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) { auto * sctx = (llama_sampler_dist *) smpl->ctx; + GGML_ASSERT(sctx->inp_uniform != nullptr); // We sample in double precision and cast to float to match rnd numbers of @@ -1262,8 +1237,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { /* .seed_cur = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), /* .inp_uniform = */ nullptr, - /* .inp_ctx = */ nullptr, - /* .inp_buf = */ nullptr, } ); } @@ -3461,9 +3434,6 @@ struct llama_sampler_logit_bias : public llama_sampler_backend { struct ggml_tensor * inp_logit_bias; struct ggml_tensor * inp_logit_idxs; - - ggml_context_ptr inp_ctx; - ggml_backend_buffer_ptr inp_buf; }; static const char * llama_sampler_logit_bias_name(const struct llama_sampler * smpl) { @@ -3526,6 +3496,16 @@ static void llama_sampler_logit_bias_backend_apply( return; } + const size_t n = sctx->logit_bias.size(); + + sctx->inp_logit_bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n); + ggml_set_name(sctx->inp_logit_bias, "logit_bias"); + ggml_set_input(sctx->inp_logit_bias); + + sctx->inp_logit_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n); + ggml_set_name(sctx->inp_logit_idxs, "logit_idxs"); + ggml_set_input(sctx->inp_logit_idxs); + ggml_tensor * cur = ggml_fill(ctx, data->logits, 0.0f); cur = ggml_reshape_2d(ctx, cur, 1, ggml_nelements(cur)); @@ -3562,6 +3542,8 @@ static void llama_sampler_logit_bias_backend_set_input(struct llama_sampler * sm static bool llama_sampler_logit_bias_backend_init( struct llama_sampler * smpl, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + auto * sctx = (llama_sampler_logit_bias *) smpl->ctx; sctx->init(true); @@ -3570,29 +3552,6 @@ static bool llama_sampler_logit_bias_backend_init( return true; } - ggml_init_params params = { - /*.mem_size =*/ 2*ggml_tensor_overhead(), - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; - - sctx->inp_ctx.reset(ggml_init(params)); - - const size_t n = sctx->logit_bias.size(); - - sctx->inp_logit_bias = ggml_new_tensor_2d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1, n); - ggml_set_name(sctx->inp_logit_bias, "logit_bias"); - ggml_set_input(sctx->inp_logit_bias); - - sctx->inp_logit_idxs = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_I32, n); - ggml_set_name(sctx->inp_logit_idxs, "logit_idxs"); - ggml_set_input(sctx->inp_logit_idxs); - - // Allocate all tensors from our context to the backend - sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft)); - - ggml_backend_buffer_clear(sctx->inp_buf.get(), 0); - return true; } @@ -3628,8 +3587,6 @@ struct llama_sampler * llama_sampler_init_logit_bias( /* .to_search = */ {}, /* .inp_logit_bias = */ nullptr, /* .inp_logit_idxs = */ nullptr, - /* .inp_ctx = */ nullptr, - /* .inp_buf = */ nullptr, } ); } From 6a9bf2f7882394d74fc54a43764d47316b3f6aef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 3 Feb 2026 22:41:20 +0200 Subject: [PATCH 07/11] ci : add sanitizer runs for server (#19291) --- .github/workflows/server.yml | 16 ++++++++++++---- CMakeLists.txt | 23 ----------------------- cmake/common.cmake | 23 +++++++++++++++++++++++ 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 9f1ef48c8..3d342c35f 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -36,7 +36,7 @@ jobs: strategy: matrix: - sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken + sanitizer: [ADDRESS, UNDEFINED] # THREAD is very slow build_type: [RelWithDebInfo] include: - build_type: Release @@ -45,7 +45,7 @@ jobs: - build_type: Release sanitizer: "" extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1" - fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken + fail-fast: false steps: - name: Dependencies @@ -72,7 +72,15 @@ jobs: - name: Build id: cmake_build run: | - cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON + cmake -B build \ + -DLLAMA_BUILD_BORINGSSL=ON \ + -DGGML_SCHED_NO_REALLOC=ON \ + -DGGML_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \ + -DGGML_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \ + -DGGML_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} \ + -DLLAMA_SANITIZE_ADDRESS=${{ matrix.sanitizer == 'ADDRESS' }} \ + -DLLAMA_SANITIZE_THREAD=${{ matrix.sanitizer == 'THREAD' }} \ + -DLLAMA_SANITIZE_UNDEFINED=${{ matrix.sanitizer == 'UNDEFINED' }} cmake --build build --config ${{ matrix.build_type }} -j ${env:NUMBER_OF_PROCESSORS} --target llama-server - name: Python setup @@ -88,7 +96,7 @@ jobs: - name: Tests id: server_integration_tests - if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) && matrix.build_type == 'Release' }} + if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }} run: | cd tools/server/tests export ${{ matrix.extra_args }} diff --git a/CMakeLists.txt b/CMakeLists.txt index d24fa080a..6d4ed6702 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -164,29 +164,6 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_CANN GGML_CANN) -if (NOT MSVC) - if (LLAMA_SANITIZE_THREAD) - message(STATUS "Using -fsanitize=thread") - - add_compile_options(-fsanitize=thread) - link_libraries (-fsanitize=thread) - endif() - - if (LLAMA_SANITIZE_ADDRESS) - message(STATUS "Using -fsanitize=address") - - add_compile_options(-fsanitize=address -fno-omit-frame-pointer) - link_libraries (-fsanitize=address) - endif() - - if (LLAMA_SANITIZE_UNDEFINED) - message(STATUS "Using -fsanitize=undefined") - - add_compile_options(-fsanitize=undefined) - link_libraries (-fsanitize=undefined) - endif() -endif() - include("cmake/license.cmake") license_add_file("llama.cpp" "LICENSE") diff --git a/cmake/common.cmake b/cmake/common.cmake index a5bb787f1..bcf403e0e 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -32,4 +32,27 @@ function(llama_add_compile_flags) set(CXX_FLAGS "" PARENT_SCOPE) endif() endif() + + if (NOT MSVC) + if (LLAMA_SANITIZE_THREAD) + message(STATUS "Using -fsanitize=thread") + + add_compile_options(-fsanitize=thread) + link_libraries (-fsanitize=thread) + endif() + + if (LLAMA_SANITIZE_ADDRESS) + message(STATUS "Using -fsanitize=address") + + add_compile_options(-fsanitize=address -fno-omit-frame-pointer) + link_libraries (-fsanitize=address) + endif() + + if (LLAMA_SANITIZE_UNDEFINED) + message(STATUS "Using -fsanitize=undefined") + + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() + endif() endfunction() From 44008ce8f93a5dc025a40ba3150ab9d119af22f0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 3 Feb 2026 23:43:14 +0200 Subject: [PATCH 08/11] metal : add solve_tri (#19302) --- ggml/src/ggml-metal/ggml-metal-device.cpp | 30 +++++++++ ggml/src/ggml-metal/ggml-metal-device.h | 1 + ggml/src/ggml-metal/ggml-metal-device.m | 1 + ggml/src/ggml-metal/ggml-metal-impl.h | 30 ++++++++- ggml/src/ggml-metal/ggml-metal-ops.cpp | 61 ++++++++++++++++++ ggml/src/ggml-metal/ggml-metal-ops.h | 1 + ggml/src/ggml-metal/ggml-metal.metal | 77 +++++++++++++++++++++++ 7 files changed, 200 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 377b0d3eb..4cd3d93d8 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -534,6 +534,36 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv(ggml_metal_ return res; } +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri(ggml_metal_library_t lib, const ggml_tensor * op) { + char base[256]; + char name[256]; + + const int nsg = 8; + const int n = op->src[1]->ne[1]; + const int k = op->src[1]->ne[0]; + + snprintf(base, 256, "kernel_solve_tri_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s_nsg=%d_n=%d_k=%d", base, nsg, n, k); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + ggml_metal_cv_t cv = ggml_metal_cv_init(); + + ggml_metal_cv_set_int16(cv, nsg, FC_SOLVE_TRI + 0); + ggml_metal_cv_set_int16(cv, n, FC_SOLVE_TRI + 1); + ggml_metal_cv_set_int16(cv, k, FC_SOLVE_TRI + 2); + + res = ggml_metal_library_compile_pipeline(lib, base, name, cv); + + ggml_metal_cv_free(cv); + } + + res.nsg = nsg; + res.smem = GGML_PAD(GGML_PAD(n, 32)*nsg*sizeof(float), 16); + + return res; +} + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext(ggml_metal_library_t lib, ggml_type tsrc0, ggml_type tsrc1, int nsg, int nxpsg, int r1ptg) { char base[256]; char name[256]; diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index afb091e72..d89843271 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -121,6 +121,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_solve_tri (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 285dd1630..8a0b85c6e 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1152,6 +1152,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te return has_simdgroup_reduction; case GGML_OP_RWKV_WKV6: case GGML_OP_RWKV_WKV7: + case GGML_OP_SOLVE_TRI: return true; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index e074f2ef3..640ade8f8 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -78,7 +78,8 @@ #define FC_MUL_MM 700 #define FC_ROPE 800 #define FC_SSM_CONV 900 -#define FC_COUNT_EQUAL 1000 +#define FC_SOLVE_TRI 1000 +#define FC_COUNT_EQUAL 1100 // op-specific constants #define OP_FLASH_ATTN_EXT_NQPSG 8 @@ -733,6 +734,33 @@ typedef struct { uint64_t nb0; } ggml_metal_kargs_ssm_scan; +typedef struct { + int32_t ne00; + int32_t ne01; + int32_t ne02; + int32_t ne03; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + int32_t ne10; + int32_t ne11; + int32_t ne12; + int32_t ne13; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; + int32_t ne0; + int32_t ne1; + int32_t ne2; + int32_t ne3; + uint64_t nb0; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; +} ggml_metal_kargs_solve_tri; + typedef struct { int32_t ne00t; int32_t ne00; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index f97c4435d..753fcec31 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -341,6 +341,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_rwkv(ctx, idx); } break; + case GGML_OP_SOLVE_TRI: + { + n_fuse = ggml_metal_op_solve_tri(ctx, idx); + } break; case GGML_OP_MUL_MAT: { n_fuse = ggml_metal_op_mul_mat(ctx, idx); @@ -1557,6 +1561,63 @@ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) { return 1; } +int ggml_metal_op_solve_tri(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS( int32_t, ne1, op->src[1], ne); + GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); + GGML_TENSOR_LOCALS( int32_t, ne, op, ne); + GGML_TENSOR_LOCALS(uint64_t, nb, op, nb); + + ggml_metal_kargs_solve_tri args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne10 =*/ ne10, + /*.ne11 =*/ ne11, + /*.ne12 =*/ ne12, + /*.ne13 =*/ ne13, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.ne0 =*/ ne0, + /*.ne1 =*/ ne1, + /*.ne2 =*/ ne2, + /*.ne3 =*/ ne3, + /*.nb0 =*/ nb0, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + /*.nb3 =*/ nb3, + }; + + auto pipeline = ggml_metal_library_get_pipeline_solve_tri(lib, op); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); + ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); + + const int nsg = pipeline.nsg; + + ggml_metal_encoder_set_threadgroup_memory_size(enc, pipeline.smem, 0); + + ggml_metal_encoder_dispatch_threadgroups(enc, (ne10 + nsg - 1)/nsg, ne02, ne03, 32, nsg, 1); + + return 1; +} + int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) { ggml_tensor * op = ctx->node(idx); diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index 10686a334..2e4c7d3fa 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -60,6 +60,7 @@ int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx); int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx); int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx); int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_solve_tri (ggml_metal_op_t ctx, int idx); int ggml_metal_op_cpy (ggml_metal_op_t ctx, int idx); int ggml_metal_op_pool_1d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_pool_2d (ggml_metal_op_t ctx, int idx); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 3259213fd..c09a54e66 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2737,6 +2737,83 @@ kernel void kernel_rwkv_wkv7_f32( } } +constant short FC_solve_tri_nsg [[function_constant(FC_SOLVE_TRI + 0)]]; +constant short FC_solve_tri_n [[function_constant(FC_SOLVE_TRI + 1)]]; +constant short FC_solve_tri_k [[function_constant(FC_SOLVE_TRI + 2)]]; + +kernel void kernel_solve_tri_f32( + constant ggml_metal_kargs_solve_tri & args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem [[threadgroup(0)]], + ushort3 tgpig[[threadgroup_position_in_grid]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + constexpr short NW = N_SIMDWIDTH; + + const short NSG = FC_solve_tri_nsg; + const short N = FC_solve_tri_n; + const short K = FC_solve_tri_k; + const short NP = PAD2(N, NW); + + const int32_t ne02 = args.ne02; + const int32_t ne03 = args.ne03; + + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + const int32_t i01 = tgpig.x*NSG + sgitg; + + threadgroup float * sh0 = (threadgroup float *) shmem; + + device const float * src0_ptr = (device const float *)(src0 + i02 * args.nb02 + i03 * args.nb03) + sgitg*N; + device const float * src1_ptr = (device const float *)(src1 + i02 * args.nb12 + i03 * args.nb13) + i01; + device float * dst_ptr = (device float *)(dst + i02 * args.nb2 + i03 * args.nb3) + i01; + + for (short rr = 0; rr < N; rr += NSG) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + { + threadgroup float * sh0_cur = sh0 + sgitg*NP; + + for (short t = 0; t*NW < N; ++t) { + const short idx = t*NW + tiisg; + sh0_cur[idx] = src0_ptr[idx]; + } + + src0_ptr += NSG*N; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (i01 >= args.ne10) { + continue; + } + + for (short ir = 0; ir < NSG && rr + ir < N; ++ir) { + const short r = rr + ir; + + threadgroup float * sh0_cur = sh0 + ir*NP; + + float sum = 0.0f; + + for (short t = 0; t*NW < r; ++t) { + const short idx = t*NW + tiisg; + sum += sh0_cur[idx] * dst_ptr[idx*K] * (idx < r); + } + + sum = simd_sum(sum); + + if (tiisg == 0) { + const float diag = sh0_cur[r]; + + dst_ptr[r*K] = (src1_ptr[r*K] - sum) / diag; + } + } + } +} + kernel void kernel_argmax_f32( constant ggml_metal_kargs_argmax & args, device const char * src0, From 2ceda3f6622661af839c19767705f33fb9f6cdd2 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 4 Feb 2026 09:43:29 +0800 Subject: [PATCH 09/11] ggml-cpu: use LUT for converting e8->f32 scales on x86 (#19288) * ggml-cpu: use LUT for converting e8->f32 scales on x86 * add dispatch based on macro --- ggml/src/ggml-cpu/arch/x86/quants.c | 18 +++++++++--------- ggml/src/ggml-cpu/ggml-cpu.c | 8 ++++++++ ggml/src/ggml-cpu/simd-mappings.h | 11 +++++++++++ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index cb49320a6..74d699f63 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -268,9 +268,9 @@ static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0))); } -static inline __m256 quad_mx_delta_float(const int8_t x0, const float y0, const int8_t x1, const float y1) { - return _mm256_set_m128(_mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)), - _mm_set1_ps(GGML_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0))); +static inline __m256 quad_mx_delta_float(const uint8_t x0, const float y0, const uint8_t x1, const float y1) { + return _mm256_set_m128(_mm_set1_ps(GGML_CPU_E8M0_TO_FP32_HALF(x1) * GGML_CPU_FP16_TO_FP32(y1)), + _mm_set1_ps(GGML_CPU_E8M0_TO_FP32_HALF(x0) * GGML_CPU_FP16_TO_FP32(y0))); } #endif #elif defined(__SSSE3__) @@ -782,6 +782,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accum1 = _mm256_setzero_ps(); __m256 accum2 = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { const __m128i q4bits_1 = _mm_loadu_si128((const __m128i*)x[ib + 0].qs); const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[ib + 1].qs); @@ -795,10 +796,10 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); - accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 0].e)), - _mm256_cvtepi32_ps(p_1), accum1); - accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_E8M0_TO_FP32_HALF(x[ib + 1].e)), - _mm256_cvtepi32_ps(p_2), accum2); + const __m256 scale0 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib + 0].e)); + const __m256 scale1 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib + 1].e)); + accum1 = _mm256_fmadd_ps(scale0, _mm256_cvtepi32_ps(p_1), accum1); + accum2 = _mm256_fmadd_ps(scale1, _mm256_cvtepi32_ps(p_2), accum2); } sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); @@ -830,7 +831,7 @@ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo #endif for (; ib < nb; ++ib) { - const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_E8M0_TO_FP32_HALF(x[ib].e); int sumi1 = 0; int sumi2 = 0; for (int j = 0; j < QK_MXFP4/2; ++j) { @@ -3817,4 +3818,3 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v ggml_vec_dot_iq4_xs_q8_K_generic(n, s, bs, vx, bx, vy, by, nrc); #endif } - diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 3e5f01e3f..b003fe13f 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -75,6 +75,9 @@ // precomputed f32 table for f16 (256 KB) (simd-mappings.h) float ggml_table_f32_f16[1 << 16]; +// precomputed f32 table for e8m0 half (1 KB) (simd-mappings.h) +float ggml_table_f32_e8m0_half[1 << 8]; + #if defined(__ARM_ARCH) struct ggml_arm_arch_features_type { int sve_cnt; @@ -3681,6 +3684,11 @@ void ggml_cpu_init(void) { ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f)); } + // initialize E8M0 half table (256 entries) + for (int i = 0; i < (1 << 8); ++i) { + ggml_table_f32_e8m0_half[i] = GGML_E8M0_TO_FP32_HALF(i); + } + const uint64_t t_end = ggml_time_us(); UNUSED(t_end); GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0); diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index e367f110b..630e50654 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -116,6 +116,17 @@ extern "C" { // defined in ggml-cpu.c, initialized in ggml_cpu_init() extern float ggml_table_f32_f16[1 << 16]; +// precomputed f32 table for e8m0 half (1 KB) +// defined in ggml-cpu.c, initialized in ggml_cpu_init() +extern float ggml_table_f32_e8m0_half[1 << 8]; + +// Use lookup table for E8M0 on x86 (faster than bit manipulation) +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +#define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)] +#else +#define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x) +#endif + // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON. // This is also true for POWER9. From 015deb90485b9ebfff492e10fe7080a0439202b6 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 4 Feb 2026 03:46:18 +0100 Subject: [PATCH 10/11] ggml-virtgpu: make the code thread safe (#19204) * ggml-virtgpu: regenerate_remoting.py: add the ability to deprecate a function * ggml-virtgpu: deprecate buffer_type is_host remoting not necessary * ggml-virtgpu: stop using static vars as cache The static init isn't thread safe. * ggml-virtgpu: protect the use of the shared memory to transfer data * ggml-virtgpu: make the remote calls thread-safe * ggml-virtgpu: backend: don't continue if couldn't allocate the tensor memory * ggml-virtgpu: add a cleanup function for consistency * ggml-virtgpu: backend: don't crash if buft->iface.get_max_size is missing * fix style and ordering * Remove the static variable in apir_device_get_count * ggml-virtgpu: improve the logging * fix review minor formatting changes --- ggml/include/ggml-virtgpu.h | 2 - .../ggml-virtgpu/apir_cs_ggml-rpc-front.cpp | 2 +- .../backend/backend-dispatched-backend.cpp | 4 +- .../backend-dispatched-buffer-type.cpp | 12 +- .../backend/backend-dispatched-buffer.cpp | 6 +- .../backend/backend-dispatched-device.cpp | 2 +- .../backend/backend-dispatched.cpp | 8 +- .../backend/backend-dispatched.gen.h | 5 +- .../ggml-virtgpu/backend/backend-dispatched.h | 2 + ggml/src/ggml-virtgpu/backend/backend.cpp | 32 ++-- .../src/ggml-virtgpu/backend/shared/apir_cs.h | 11 +- .../backend/shared/apir_cs_ggml.h | 14 +- .../ggml-virtgpu/ggml-backend-buffer-type.cpp | 29 +--- ggml/src/ggml-virtgpu/ggml-backend-device.cpp | 61 ++++--- ggml/src/ggml-virtgpu/ggml-backend-reg.cpp | 94 ++++++++--- ggml/src/ggml-virtgpu/ggml-remoting.h | 5 +- .../ggml-virtgpu/ggmlremoting_functions.yaml | 20 ++- ggml/src/ggml-virtgpu/regenerate_remoting.py | 18 ++- .../ggml-virtgpu/virtgpu-forward-backend.cpp | 14 +- .../virtgpu-forward-buffer-type.cpp | 41 ++--- .../ggml-virtgpu/virtgpu-forward-buffer.cpp | 28 +++- .../ggml-virtgpu/virtgpu-forward-device.cpp | 22 +-- ggml/src/ggml-virtgpu/virtgpu-forward-impl.h | 6 +- ggml/src/ggml-virtgpu/virtgpu-forward.gen.h | 21 +-- ggml/src/ggml-virtgpu/virtgpu-shm.cpp | 3 +- ggml/src/ggml-virtgpu/virtgpu.cpp | 149 ++++++++++++------ ggml/src/ggml-virtgpu/virtgpu.h | 23 +++ 27 files changed, 397 insertions(+), 237 deletions(-) diff --git a/ggml/include/ggml-virtgpu.h b/ggml/include/ggml-virtgpu.h index 1cb4bd7a0..faaba8f24 100644 --- a/ggml/include/ggml-virtgpu.h +++ b/ggml/include/ggml-virtgpu.h @@ -7,8 +7,6 @@ extern "C" { #endif -#define GGML_REMOTING_FRONTEND_NAME "RemotingFrontend" - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_virtgpu_reg(); #ifdef __cplusplus diff --git a/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp b/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp index f60ae3556..d2e87330a 100644 --- a/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +++ b/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp @@ -36,7 +36,7 @@ apir_rpc_tensor apir_serialize_tensor(const ggml_tensor * tensor) { result.data = reinterpret_cast(tensor->data); if (tensor->data) { if (!tensor->buffer) { - GGML_ABORT("tensor has data but not buffer"); + GGML_ABORT("%s: tensor has data but not buffer", __func__); } // tensor->data is serialized as an offset to the buffer base address result.data -= reinterpret_cast(BUFFER_TO_GGML_CONTEXT(tensor->buffer)->base); diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp index 77b4ee71e..cc879e51d 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp @@ -27,7 +27,7 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v const void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); if (!shmem_data) { - GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n"); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__); apir_decoder_set_fatal(dec); return 1; } @@ -45,7 +45,7 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v if (dev->iface.supports_op(dev, op)) { continue; } - GGML_LOG_ERROR("Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op)); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op)); status = GGML_STATUS_ABORTED; apir_encode_ggml_status(enc, &status); diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp index 8ea1bb4fb..d55eec276 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp @@ -36,18 +36,22 @@ uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec ggml_backend_buffer_type_t buft; buft = apir_decode_ggml_buffer_type(dec); - size_t value = buft->iface.get_max_size(buft); + size_t value = SIZE_MAX; + if (buft->iface.get_max_size) { + value = buft->iface.get_max_size(buft); + } + apir_encode_size_t(enc, &value); return 0; } +/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST is deprecated. Keeping the handler for backward compatibility. */ uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { GGML_UNUSED(ctx); - ggml_backend_buffer_type_t buft; - buft = apir_decode_ggml_buffer_type(dec); + GGML_UNUSED(dec); + const bool is_host = false; - bool is_host = buft->iface.is_host(buft); apir_encode_bool_t(enc, &is_host); return 0; diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp index cf81888e9..8cc063ff0 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp @@ -40,7 +40,7 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); if (!shmem_data) { - GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n"); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__); return 1; } @@ -71,7 +71,7 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); if (!shmem_data) { - GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n"); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__); return 1; } @@ -121,7 +121,7 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg buffer = apir_decode_ggml_buffer(dec); if (!apir_untrack_backend_buffer(buffer)) { - GGML_LOG_WARN("%s: unknown buffer %p\n", __func__, (void *) buffer); + GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: unknown buffer %p\n", __func__, (void *) buffer); return 1; } diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp index 497f737a8..c7acb8b51 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp @@ -124,7 +124,7 @@ uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec, void * shmem_ptr = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); if (!shmem_ptr) { - GGML_LOG_ERROR("Couldn't get the shmem addr from virgl\n"); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__); apir_decoder_set_fatal(dec); return 1; } diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp index 51d445725..64152eef0 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp @@ -17,26 +17,26 @@ uint64_t timer_count = 0; uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) { if (reg != NULL) { - GGML_LOG_WARN("%s: already initialized\n", __func__); + GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: already initialized\n", __func__); return APIR_BACKEND_INITIALIZE_ALREADY_INITED; } ggml_backend_reg_t (*ggml_backend_reg_fct)(void) = (ggml_backend_reg_t (*)()) ggml_backend_reg_fct_p; reg = ggml_backend_reg_fct(); if (reg == NULL) { - GGML_LOG_ERROR("%s: backend registration failed\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend registration failed\n", __func__); return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED; } if (!reg->iface.get_device_count(reg)) { - GGML_LOG_ERROR("%s: backend initialization failed: no device found\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device found\n", __func__); return APIR_BACKEND_INITIALIZE_NO_DEVICE; } dev = reg->iface.get_device(reg, 0); if (!dev) { - GGML_LOG_ERROR("%s: backend initialization failed: no device received\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device received\n", __func__); return APIR_BACKEND_INITIALIZE_NO_DEVICE; } diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h index b81fd5039..481d7f315 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h @@ -16,6 +16,7 @@ uint32_t backend_device_buffer_from_ptr(apir_encoder * enc, apir_decoder * dec, uint32_t backend_buffer_type_get_name(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); uint32_t backend_buffer_type_get_alignment(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); uint32_t backend_buffer_type_get_max_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); +/* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST is deprecated. Keeping the handler for backward compatibility. */ uint32_t backend_buffer_type_is_host(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); uint32_t backend_buffer_type_alloc_buffer(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); @@ -62,7 +63,7 @@ static inline const char * backend_dispatch_command_name(ApirBackendCommandType case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: return "backend_buffer_type_get_max_size"; case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: - return "backend_buffer_type_is_host"; + return "backend_buffer_type_is_host (DEPRECATED)"; case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: return "backend_buffer_type_alloc_buffer"; case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE: @@ -110,7 +111,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME = */ backend_buffer_type_get_name, /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT = */ backend_buffer_type_get_alignment, /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE = */ backend_buffer_type_get_max_size, - /* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = */ backend_buffer_type_is_host, + /* APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST = */ backend_buffer_type_is_host /* DEPRECATED */, /* APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER = */ backend_buffer_type_alloc_buffer, /* APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE = */ backend_buffer_type_get_alloc_size, diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.h b/ggml/src/ggml-virtgpu/backend/backend-dispatched.h index 6ccbecf07..10311631d 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.h @@ -11,6 +11,8 @@ #include "shared/apir_cs.h" #include "shared/apir_cs_ggml.h" +#define GGML_VIRTGPU_BCK "ggml-virtgpu-backend: " + struct virgl_apir_context { uint32_t ctx_id; virgl_apir_callbacks * iface; diff --git a/ggml/src/ggml-virtgpu/backend/backend.cpp b/ggml/src/ggml-virtgpu/backend/backend.cpp index 95d602ed6..d93414a07 100644 --- a/ggml/src/ggml-virtgpu/backend/backend.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend.cpp @@ -35,14 +35,8 @@ void apir_backend_deinit(uint32_t virgl_ctx_id) { buffer->iface.free_buffer(buffer); } - if (dev) { - size_t free, total; - dev->iface.get_memory(dev, &free, &total); - GGML_LOG_INFO("%s: free memory: %ld MB\n", __func__, (size_t) free / 1024 / 1024); - } - if (backend_library_handle) { - GGML_LOG_INFO("%s: The GGML backend library was loaded. Unloading it.\n", __func__); + GGML_LOG_INFO(GGML_VIRTGPU_BCK "The GGML backend library was loaded. Unloading it.\n"); dlclose(backend_library_handle); backend_library_handle = NULL; } @@ -65,7 +59,7 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct if (apir_logfile) { ggml_log_set(log_to_file_callback, apir_logfile); } else { - GGML_LOG_INFO("Could not open the log file at '%s'\n", apir_log_to_file); + GGML_LOG_INFO(GGML_VIRTGPU_BCK "Could not open the log file at '%s'\n", apir_log_to_file); } } @@ -74,7 +68,10 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG; if (!library_name) { - GGML_LOG_ERROR("cannot open the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK + "%s: cannot open the GGML library: env var '%s' not defined\n", + __func__, APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV); + return APIR_LOAD_LIBRARY_ENV_VAR_MISSING; } @@ -82,13 +79,16 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct backend_library_handle = dlopen(library_name, RTLD_LAZY); if (!backend_library_handle) { - GGML_LOG_ERROR("cannot open the GGML library: %s\n", dlerror()); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK + "%s: cannot open the GGML library: %s\n", __func__, dlerror()); return APIR_LOAD_LIBRARY_CANNOT_OPEN; } if (!library_reg) { - GGML_LOG_ERROR("cannot register the GGML library: env var '%s' not defined\n", APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK + "%s: cannot register the GGML library: env var '%s' not defined\n", + __func__, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV); return APIR_LOAD_LIBRARY_ENV_VAR_MISSING; } @@ -96,8 +96,10 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg); dlsym_error = dlerror(); if (dlsym_error) { - GGML_LOG_ERROR("cannot find the GGML backend registration symbol '%s' (from %s): %s\n", library_reg, - APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK + "%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n", + __func__, library_reg, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error); + return APIR_LOAD_LIBRARY_SYMBOL_MISSING; } @@ -134,7 +136,9 @@ uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id, }; if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) { - GGML_LOG_ERROR("Received an invalid dispatch index (%d >= %d)\n", cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK + "%s: Received an invalid dispatch index (%d >= %d)\n", + __func__, cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); return APIR_BACKEND_FORWARD_INDEX_INVALID; } diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h index 27a61091f..1bc3a5f68 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h @@ -86,7 +86,7 @@ static inline bool apir_decoder_peek_internal(apir_decoder * dec, assert(val_size <= size); if (unlikely(size > (size_t) (dec->end - dec->cur))) { - GGML_LOG_ERROR("reading too much from the decoder ...\n"); + GGML_LOG_ERROR("%s: reading too much from the decoder ...\n", __func__); apir_decoder_set_fatal(dec); memset(val, 0, val_size); return false; @@ -103,7 +103,7 @@ static inline void apir_decoder_peek(apir_decoder * dec, size_t size, void * val static inline const void * apir_decoder_use_inplace(apir_decoder * dec, size_t size) { if (unlikely(size > (size_t) (dec->end - dec->cur))) { - GGML_LOG_ERROR("reading too much from the decoder ...\n"); + GGML_LOG_ERROR("%s: reading too much from the decoder ...\n", __func__); apir_decoder_set_fatal(dec); return NULL; } @@ -221,7 +221,7 @@ static inline uint64_t apir_decode_array_size(apir_decoder * dec, uint64_t expec uint64_t size; apir_decode_uint64_t(dec, &size); if (size != expected_size) { - GGML_LOG_ERROR("Couldn't decode array from the decoder\n"); + GGML_LOG_ERROR("%s: Couldn't decode array from the decoder\n", __func__); apir_decoder_set_fatal(dec); size = 0; } @@ -322,7 +322,7 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t if (size) { val[size - 1] = '\0'; } else { - GGML_LOG_ERROR("Couldn't decode the blog array\n"); + GGML_LOG_ERROR("%s: Couldn't decode the blog array\n", __func__); apir_decoder_set_fatal(dec); } } @@ -332,7 +332,8 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t static inline void * apir_decoder_alloc_array(size_t size, size_t count) { size_t alloc_size; if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) { - GGML_LOG_ERROR("overflow in array allocation of %zu * %zu bytes\n", size, count); + GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n", + __func__, size, count); return NULL; } diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h index 070c3b25f..289f4b77d 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h @@ -39,11 +39,17 @@ static inline void apir_encode_ggml_tensor(apir_encoder * enc, const ggml_tensor static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) { const apir_rpc_tensor * apir_rpc_tensor = apir_decode_apir_rpc_tensor_inplace(dec); + + if (!apir_rpc_tensor) { + return NULL; + } + ggml_init_params params{ /*.mem_size =*/ ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; + ggml_context * ctx = ggml_init(params); const ggml_tensor * tensor = apir_deserialize_tensor(ctx, apir_rpc_tensor); @@ -71,6 +77,10 @@ static inline ggml_backend_buffer_type_t apir_decode_ggml_buffer_type(apir_decod return (ggml_backend_buffer_type_t) handle; } +static inline void apir_encode_apir_buffer_type_host_handle(apir_encoder * enc, apir_buffer_type_host_handle_t handle) { + apir_encoder_write(enc, sizeof(handle), &handle, sizeof(handle)); +} + static inline apir_buffer_type_host_handle_t apir_decode_apir_buffer_type_host_handle(apir_decoder * dec) { apir_buffer_type_host_handle_t handle; @@ -154,13 +164,13 @@ static inline void apir_encode_ggml_tensor_inline(apir_encoder * enc, const ggml size_t tensor_size = sizeof(*tensor); if (tensor->extra) { - GGML_ABORT("Cannot pass tensors with extra"); + GGML_ABORT("%s: Cannot pass tensors with extra", __func__); } if (tensor->src[0] && tensor->buffer) { static int first = 1; if (first) { - GGML_LOG_WARN("Cannot pass tensors with src and buffer\n"); + GGML_LOG_WARN("%s: Cannot pass tensors with src and buffer\n", __func__); first = 0; } } diff --git a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp index 7f650659b..c493a8e2a 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp @@ -6,7 +6,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); if (!context) { - GGML_ABORT("Couldn't allocate the buffer context ..."); + GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the buffer context ...", __func__); } context->gpu = gpu; @@ -20,7 +20,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml context->base = context->apir_context.shmem.mmap_ptr; context->is_from_ptr = true; } else { - context->apir_context = apir_buffer_type_alloc_buffer(gpu, buft, size); + context->apir_context = apir_buffer_type_alloc_buffer(gpu, gpu->cached_buffer_type.host_handle, size); context->is_from_ptr = false; context->base = NULL; } @@ -34,36 +34,19 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { virtgpu * gpu = BUFT_TO_GPU(buft); - return apir_buffer_type_get_name(gpu, buft); + return gpu->cached_buffer_type.name; } static size_t ggml_backend_remoting_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { virtgpu * gpu = BUFT_TO_GPU(buft); - static size_t align = 0; - - if (align == 0) { - align = apir_buffer_type_get_alignment(gpu, buft); - } - - return align; + return gpu->cached_buffer_type.alignment; } static size_t ggml_backend_remoting_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { virtgpu * gpu = BUFT_TO_GPU(buft); - static size_t max_size = 0; - if (max_size == 0) { - max_size = apir_buffer_type_get_max_size(gpu, buft); - } - - return max_size; -} - -static bool ggml_backend_remoting_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - virtgpu * gpu = BUFT_TO_GPU(buft); - - return apir_buffer_type_is_host(gpu, buft); + return gpu->cached_buffer_type.max_size; } static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, @@ -76,7 +59,7 @@ static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buff return ggml_nbytes(tensor); } - return apir_buffer_type_get_alloc_size(gpu, buft, tensor); + return apir_buffer_type_get_alloc_size(gpu, gpu->cached_buffer_type.host_handle, tensor); } const ggml_backend_buffer_type_i ggml_backend_remoting_buffer_type_interface = { diff --git a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp index 579eb9907..c7d288105 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp @@ -3,32 +3,27 @@ static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); - return apir_device_get_name(gpu); + return gpu->cached_device_info.name; } static const char * ggml_backend_remoting_device_get_description(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); - return apir_device_get_description(gpu); + // Return the pre-cached description from the virtgpu structure + return gpu->cached_device_info.description; } static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); - static enum ggml_backend_dev_type type; - static bool has_type = false; - if (!has_type) { - has_type = true; - type = (enum ggml_backend_dev_type) apir_device_get_type(gpu); - } - - return type; + return (enum ggml_backend_dev_type) gpu->cached_device_info.type; } static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { virtgpu * gpu = DEV_TO_GPU(dev); - return apir_device_get_memory(gpu, free, total); + *free = gpu->cached_device_info.memory_free; + *total = gpu->cached_device_info.memory_total; } static bool ggml_backend_remoting_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { @@ -77,13 +72,22 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); - apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + static std::atomic initialized = false; + static ggml_backend_buffer_type buft; - static ggml_backend_buffer_type buft{ - /* .iface = */ ggml_backend_remoting_buffer_type_interface, - /* .device = */ dev, - /* .context = */ (void *) ctx, - }; + if (!initialized) { + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (!initialized) { + buft = { + /* .iface = */ ggml_backend_remoting_buffer_type_interface, + /* .device = */ dev, + /* .context = */ (void *) gpu->cached_buffer_type.host_handle, + }; + initialized = true; + } + } return &buft; } @@ -91,13 +95,22 @@ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_bac static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); - apir_buffer_type_host_handle_t ctx = apir_device_get_buffer_type(gpu); + static std::atomic initialized = false; + static ggml_backend_buffer_type buft; - static ggml_backend_buffer_type buft{ - /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, - /* .device = */ dev, - /* .context = */ (void *) ctx, - }; + if (!initialized) { + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (!initialized) { + buft = { + /* .iface = */ ggml_backend_remoting_buffer_from_ptr_type_interface, + /* .device = */ dev, + /* .context = */ (void *) gpu->cached_buffer_type.host_handle, + }; + initialized = true; + } + } return &buft; } @@ -110,7 +123,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_device_buffer_from_ptr(ggml_b ggml_backend_remoting_buffer_context * context = (ggml_backend_remoting_buffer_context *) malloc(sizeof(*context)); if (!context) { - GGML_ABORT("Couldn't allocate the buffer context ..."); + GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the buffer context ...", __func__); } context->gpu = gpu; diff --git a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp index c46cf51c0..2d02cfec1 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp @@ -4,37 +4,70 @@ #include #include +void ggml_virtgpu_cleanup(virtgpu * gpu); + static virtgpu * apir_initialize() { - static virtgpu * apir_gpu_instance = NULL; - static bool apir_initialized = false; + static virtgpu * gpu = NULL; + static std::atomic initialized = false; + + if (initialized) { + // fast track + return gpu; + } { static std::mutex mutex; std::lock_guard lock(mutex); - if (apir_initialized) { - return apir_gpu_instance; + if (initialized) { + // thread safe + return gpu; } - apir_gpu_instance = create_virtgpu(); - if (!apir_gpu_instance) { - GGML_ABORT("failed to initialize the virtgpu"); + gpu = create_virtgpu(); + if (!gpu) { + initialized = true; + return NULL; } - apir_initialized = true; + // Pre-fetch and cache all device information, it will not change + gpu->cached_device_info.description = apir_device_get_description(gpu); + if (!gpu->cached_device_info.description) { + GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device description", __func__); + } + gpu->cached_device_info.name = apir_device_get_name(gpu); + if (!gpu->cached_device_info.name) { + GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device name", __func__); + } + gpu->cached_device_info.device_count = apir_device_get_count(gpu); + gpu->cached_device_info.type = apir_device_get_type(gpu); + + apir_device_get_memory(gpu, + &gpu->cached_device_info.memory_free, + &gpu->cached_device_info.memory_total); + + apir_buffer_type_host_handle_t buft_host_handle = apir_device_get_buffer_type(gpu); + gpu->cached_buffer_type.host_handle = buft_host_handle; + gpu->cached_buffer_type.name = apir_buffer_type_get_name(gpu, buft_host_handle); + if (!gpu->cached_buffer_type.name) { + GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu buffer type name", __func__); + } + gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle); + gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle); + + initialized = true; } - return apir_gpu_instance; + return gpu; } static int ggml_backend_remoting_get_device_count() { virtgpu * gpu = apir_initialize(); if (!gpu) { - GGML_LOG_WARN("apir_initialize failed\n"); return 0; } - return apir_device_get_count(gpu); + return gpu->cached_device_info.device_count; } static size_t ggml_backend_remoting_reg_get_device_count(ggml_backend_reg_t reg) { @@ -52,17 +85,21 @@ ggml_backend_dev_t ggml_backend_remoting_get_device(size_t device) { static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) { if (devices.size() > 0) { - GGML_LOG_INFO("%s: already initialized\n", __func__); + GGML_LOG_INFO(GGML_VIRTGPU "%s: already initialized\n", __func__); return; } virtgpu * gpu = apir_initialize(); if (!gpu) { - GGML_LOG_ERROR("apir_initialize failed\n"); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: apir_initialize failed\n", __func__); return; } - static bool initialized = false; + static std::atomic initialized = false; + + if (initialized) { + return; // fast track + } { static std::mutex mutex; @@ -70,10 +107,10 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) { if (!initialized) { for (int i = 0; i < ggml_backend_remoting_get_device_count(); i++) { ggml_backend_remoting_device_context * ctx = new ggml_backend_remoting_device_context; - char desc[256] = "API Remoting device"; + char desc[256] = "ggml-virtgpu API Remoting device"; ctx->device = i; - ctx->name = GGML_REMOTING_FRONTEND_NAME + std::to_string(i); + ctx->name = GGML_VIRTGPU_NAME + std::to_string(i); ctx->description = desc; ctx->gpu = gpu; @@ -98,7 +135,7 @@ static ggml_backend_dev_t ggml_backend_remoting_reg_get_device(ggml_backend_reg_ static const char * ggml_backend_remoting_reg_get_name(ggml_backend_reg_t reg) { UNUSED(reg); - return GGML_REMOTING_FRONTEND_NAME; + return GGML_VIRTGPU_NAME; } static const ggml_backend_reg_i ggml_backend_remoting_reg_i = { @@ -111,8 +148,7 @@ static const ggml_backend_reg_i ggml_backend_remoting_reg_i = { ggml_backend_reg_t ggml_backend_virtgpu_reg() { virtgpu * gpu = apir_initialize(); if (!gpu) { - GGML_LOG_ERROR("virtgpu_apir_initialize failed\n"); - return NULL; + GGML_LOG_ERROR(GGML_VIRTGPU "%s: virtgpu_apir_initialize failed\n", __func__); } static ggml_backend_reg reg = { @@ -129,9 +165,25 @@ ggml_backend_reg_t ggml_backend_virtgpu_reg() { ggml_backend_remoting_reg_init_devices(®); - GGML_LOG_INFO("%s: initialized\n", __func__); - return ® } +// public function, not exposed in the GGML interface at the moment +void ggml_virtgpu_cleanup(virtgpu * gpu) { + if (gpu->cached_device_info.name) { + free(gpu->cached_device_info.name); + gpu->cached_device_info.name = NULL; + } + if (gpu->cached_device_info.description) { + free(gpu->cached_device_info.description); + gpu->cached_device_info.description = NULL; + } + if (gpu->cached_buffer_type.name) { + free(gpu->cached_buffer_type.name); + gpu->cached_buffer_type.name = NULL; + } + + mtx_destroy(&gpu->data_shmem_mutex); +} + GGML_BACKEND_DL_IMPL(ggml_backend_virtgpu_reg) diff --git a/ggml/src/ggml-virtgpu/ggml-remoting.h b/ggml/src/ggml-virtgpu/ggml-remoting.h index 36fc6b2a7..087664086 100644 --- a/ggml/src/ggml-virtgpu/ggml-remoting.h +++ b/ggml/src/ggml-virtgpu/ggml-remoting.h @@ -8,6 +8,9 @@ #include #include +#define GGML_VIRTGPU_NAME "ggml-virtgpu" +#define GGML_VIRTGPU "ggml-virtgpu: " + // USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes #define USE_ALWAYS_TRUE_SUPPORTS_OP 1 @@ -62,7 +65,7 @@ static inline apir_buffer_type_host_handle_t ggml_buffer_type_to_apir_handle(ggm static inline apir_buffer_host_handle_t ggml_buffer_to_apir_handle(ggml_backend_buffer_t buffer) { if (!buffer->context) { - GGML_ABORT("%s: no context available :/", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: no context available :/", __func__); } return BUFFER_TO_HOST_HANDLE(buffer); } diff --git a/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml b/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml index 0b7cccfe9..14ef2433e 100644 --- a/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +++ b/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml @@ -24,10 +24,10 @@ functions: frontend_return: "int" get_name: - frontend_return: "const char *" + frontend_return: "char *" get_description: - frontend_return: "const char *" + frontend_return: "char *" get_type: frontend_return: "uint32_t" @@ -64,35 +64,33 @@ functions: group_description: "buffer-type" functions: get_name: - frontend_return: "const char *" + frontend_return: "char *" frontend_extra_params: - - "ggml_backend_buffer_type_t buft" + - "apir_buffer_type_host_handle_t host_handle" get_alignment: frontend_return: "size_t" frontend_extra_params: - - "ggml_backend_buffer_type_t buft" + - "apir_buffer_type_host_handle_t host_handle" get_max_size: frontend_return: "size_t" frontend_extra_params: - - "ggml_backend_buffer_type_t buft" + - "apir_buffer_type_host_handle_t host_handle" is_host: - frontend_return: "bool" - frontend_extra_params: - - "ggml_backend_buffer_type_t buft" + deprecated: true alloc_buffer: frontend_return: "apir_buffer_context_t" frontend_extra_params: - - "ggml_backend_buffer_type_t buffer_buft" + - "apir_buffer_type_host_handle_t host_handle" - "size_t size" get_alloc_size: frontend_return: "size_t" frontend_extra_params: - - "ggml_backend_buffer_type_t buft" + - "apir_buffer_type_host_handle_t host_handle" - "const ggml_tensor *op" buffer: diff --git a/ggml/src/ggml-virtgpu/regenerate_remoting.py b/ggml/src/ggml-virtgpu/regenerate_remoting.py index 4174a2432..aeb48a408 100755 --- a/ggml/src/ggml-virtgpu/regenerate_remoting.py +++ b/ggml/src/ggml-virtgpu/regenerate_remoting.py @@ -116,7 +116,7 @@ class RemotingCodebaseGenerator: 'frontend_return': func_metadata.get('frontend_return', 'void'), 'frontend_extra_params': func_metadata.get('frontend_extra_params', []), 'group_description': group_description, - 'newly_added': func_metadata.get('newly_added', False) + 'deprecated': func_metadata.get('deprecated', False), }) enum_value += 1 @@ -165,6 +165,9 @@ class RemotingCodebaseGenerator: signature = "uint32_t" params = "apir_encoder *enc, apir_decoder *dec, virgl_apir_context *ctx" + if func['deprecated']: + decl_lines.append(f"/* {func['enum_name']} is deprecated. Keeping the handler for backward compatibility. */") + decl_lines.append(f"{signature} {func['backend_function']}({params});") # Switch cases @@ -176,7 +179,9 @@ class RemotingCodebaseGenerator: switch_lines.append(f" /* {func['group_description']} */") current_group = func['group_name'] - switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}\";") + deprecated = " (DEPRECATED)" if func['deprecated'] else "" + + switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}{deprecated}\";") # Dispatch table table_lines = [] @@ -188,7 +193,8 @@ class RemotingCodebaseGenerator: table_lines.append("") current_group = func['group_name'] - table_lines.append(f" /* {func['enum_name']} = */ {func['backend_function']},") + deprecated = " /* DEPRECATED */" if func['deprecated'] else "" + table_lines.append(f" /* {func['enum_name']} = */ {func['backend_function']}{deprecated},") header_content = f'''\ #pragma once @@ -225,6 +231,10 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC decl_lines.append(f"/* {func['group_description']} */") current_group = func['group_name'] + if func['deprecated']: + decl_lines.append(f"/* {func['frontend_function']} is deprecated. */") + continue + # Build parameter list params = [self.naming_patterns['frontend_base_param']] params.extend(func['frontend_extra_params']) @@ -287,7 +297,7 @@ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATC generated_files = [apir_backend_path, backend_dispatched_path, virtgpu_forward_path] if not self.clang_format_available: - logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted." + logging.warning("\n⚠️clang-format not found in PATH. Generated files will not be formatted.\n" " Install clang-format to enable automatic code formatting.") else: logging.info("\n🎨 Formatting files with clang-format...") diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp index bf3c41011..07d9a6684 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp @@ -18,12 +18,17 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) { virtgpu_shmem temp_shmem; // Local storage for large buffers virtgpu_shmem * shmem = &temp_shmem; + bool using_shared_shmem = false; if (cgraph_size <= gpu->data_shmem.mmap_size) { - // prefer the init-time allocated page, if large enough + // Lock mutex before using shared data_shmem buffer + if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) { + GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__); + } + using_shared_shmem = true; shmem = &gpu->data_shmem; } else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) { - GGML_ABORT("Couldn't allocate the guest-host shared buffer"); + GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__); } apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); @@ -42,7 +47,10 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) { remote_call_finish(gpu, encoder, decoder); - if (shmem != &gpu->data_shmem) { + // Unlock mutex before cleanup + if (using_shared_shmem) { + mtx_unlock(&gpu->data_shmem_mutex); + } else { virtgpu_shmem_destroy(gpu, shmem); } diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp index 03cb09e06..cab74fd17 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp @@ -1,20 +1,20 @@ #include "virtgpu-forward-impl.h" -const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t buft) { +char * apir_buffer_type_get_name(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME); - apir_encode_ggml_buffer_type(encoder, buft); + apir_encode_apir_buffer_type_host_handle(encoder, host_handle); REMOTE_CALL(gpu, encoder, decoder, ret); const size_t string_size = apir_decode_array_size_unchecked(decoder); char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); if (!string) { - GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__); apir_decoder_set_fatal(decoder); } apir_decode_char_array(decoder, string, string_size); @@ -24,14 +24,14 @@ const char * apir_buffer_type_get_name(virtgpu * gpu, ggml_backend_buffer_type_t return string; } -size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t buft) { +size_t apir_buffer_type_get_alignment(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT); - apir_encode_ggml_buffer_type(encoder, buft); + apir_encode_apir_buffer_type_host_handle(encoder, host_handle); REMOTE_CALL(gpu, encoder, decoder, ret); @@ -43,14 +43,14 @@ size_t apir_buffer_type_get_alignment(virtgpu * gpu, ggml_backend_buffer_type_t return alignment; } -size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t buft) { +size_t apir_buffer_type_get_max_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE); - apir_encode_ggml_buffer_type(encoder, buft); + apir_encode_apir_buffer_type_host_handle(encoder, host_handle); REMOTE_CALL(gpu, encoder, decoder, ret); @@ -62,26 +62,7 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, ggml_backend_buffer_type_t b return max_size; } -bool apir_buffer_type_is_host(virtgpu * gpu, ggml_backend_buffer_type_t buft) { - apir_encoder * encoder; - apir_decoder * decoder; - ApirForwardReturnCode ret; - - REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST); - - apir_encode_ggml_buffer_type(encoder, buft); - - REMOTE_CALL(gpu, encoder, decoder, ret); - - bool is_host; - apir_decode_bool_t(decoder, &is_host); - - remote_call_finish(gpu, encoder, decoder); - - return is_host; -} - -apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_buffer_type_t buft, size_t size) { +apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; @@ -90,7 +71,7 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_ REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER); - apir_encode_ggml_buffer_type(encoder, buft); + apir_encode_apir_buffer_type_host_handle(encoder, host_handle); apir_encode_size_t(encoder, &size); @@ -103,14 +84,14 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, ggml_backend_ return buffer_context; } -size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op) { +size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, const ggml_tensor * op) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE); - apir_encode_ggml_buffer_type(encoder, buft); + apir_encode_apir_buffer_type_host_handle(encoder, host_handle); apir_encode_ggml_tensor_inline(encoder, op); diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp index 3181e3944..86eee358c 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp @@ -36,13 +36,18 @@ void apir_buffer_set_tensor(virtgpu * gpu, virtgpu_shmem temp_shmem; // Local storage for large buffers virtgpu_shmem * shmem = &temp_shmem; + bool using_shared_shmem = false; if (size <= gpu->data_shmem.mmap_size) { - // prefer the init-time allocated page, if large enough + // Lock mutex before using shared data_shmem buffer + if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) { + GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__); + } + using_shared_shmem = true; shmem = &gpu->data_shmem; } else if (virtgpu_shmem_create(gpu, size, shmem)) { - GGML_ABORT("Couldn't allocate the guest-host shared buffer"); + GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__); } memcpy(shmem->mmap_ptr, data, size); @@ -55,7 +60,10 @@ void apir_buffer_set_tensor(virtgpu * gpu, remote_call_finish(gpu, encoder, decoder); - if (shmem != &gpu->data_shmem) { + // Unlock mutex before cleanup + if (using_shared_shmem) { + mtx_unlock(&gpu->data_shmem_mutex); + } else { virtgpu_shmem_destroy(gpu, shmem); } @@ -79,13 +87,18 @@ void apir_buffer_get_tensor(virtgpu * gpu, virtgpu_shmem temp_shmem; // Local storage for large buffers virtgpu_shmem * shmem = &temp_shmem; + bool using_shared_shmem = false; if (size <= gpu->data_shmem.mmap_size) { - // prefer the init-time allocated page, if large enough + // Lock mutex before using shared data_shmem buffer + if (mtx_lock(&gpu->data_shmem_mutex) != thrd_success) { + GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__); + } + using_shared_shmem = true; shmem = &gpu->data_shmem; } else if (virtgpu_shmem_create(gpu, size, shmem)) { - GGML_ABORT("Couldn't allocate the guest-host shared buffer"); + GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__); } apir_encode_virtgpu_shmem_res_id(encoder, shmem->res_id); @@ -98,7 +111,10 @@ void apir_buffer_get_tensor(virtgpu * gpu, remote_call_finish(gpu, encoder, decoder); - if (shmem != &gpu->data_shmem) { + // Unlock mutex before cleanup + if (using_shared_shmem) { + mtx_unlock(&gpu->data_shmem_mutex); + } else { virtgpu_shmem_destroy(gpu, shmem); } } diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp index 3e45e55bd..4b6b8f527 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp @@ -2,11 +2,6 @@ #include "virtgpu-shm.h" int apir_device_get_count(virtgpu * gpu) { - static int32_t dev_count = -1; - if (dev_count != -1) { - return dev_count; - } - apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; @@ -14,6 +9,7 @@ int apir_device_get_count(virtgpu * gpu) { REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_GET_COUNT); REMOTE_CALL(gpu, encoder, decoder, ret); + int32_t dev_count = -1; apir_decode_int32_t(decoder, &dev_count); remote_call_finish(gpu, encoder, decoder); @@ -21,11 +17,7 @@ int apir_device_get_count(virtgpu * gpu) { return dev_count; } -const char * apir_device_get_name(virtgpu * gpu) { - static char * string = nullptr; - if (string) { - return string; - } +char * apir_device_get_name(virtgpu * gpu) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; @@ -34,9 +26,9 @@ const char * apir_device_get_name(virtgpu * gpu) { REMOTE_CALL(gpu, encoder, decoder, ret); const size_t string_size = apir_decode_array_size_unchecked(decoder); - string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); + char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); if (!string) { - GGML_LOG_ERROR("%s: Could not allocate the device name buffer\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__); return NULL; } apir_decode_char_array(decoder, string, string_size); @@ -46,7 +38,7 @@ const char * apir_device_get_name(virtgpu * gpu) { return string; } -const char * apir_device_get_description(virtgpu * gpu) { +char * apir_device_get_description(virtgpu * gpu) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; @@ -58,7 +50,7 @@ const char * apir_device_get_description(virtgpu * gpu) { const size_t string_size = apir_decode_array_size_unchecked(decoder); char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); if (!string) { - GGML_LOG_ERROR("%s: Could not allocate the device description buffer\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device description buffer\n", __func__); return NULL; } @@ -181,7 +173,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, si REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR); if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) { - GGML_ABORT("Couldn't allocate the guest-host shared buffer"); + GGML_ABORT(GGML_VIRTGPU "Couldn't allocate the guest-host shared buffer"); } apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id); diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h b/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h index eea3e7e5a..f23c75bb9 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h @@ -11,7 +11,7 @@ int32_t forward_flag = (int32_t) apir_command_type__; \ encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \ if (!encoder_name) { \ - GGML_ABORT("%s: failed to prepare the remote call encoder", __func__); \ + GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \ } \ } while (0) @@ -19,10 +19,10 @@ do { \ ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \ if (!decoder_name) { \ - GGML_ABORT("%s: failed to kick the remote call", __func__); \ + GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \ } \ if (ret_name < APIR_FORWARD_BASE_INDEX) { \ - GGML_ABORT("%s: failed to forward the API call: %s: code %d", __func__, \ + GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \ apir_forward_error(ret_name), ret_name); \ } \ ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \ diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h index c27c07f08..fe4cae202 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +++ b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h @@ -3,8 +3,8 @@ /* device */ void apir_device_get_device_count(struct virtgpu * gpu); int apir_device_get_count(struct virtgpu * gpu); -const char * apir_device_get_name(struct virtgpu * gpu); -const char * apir_device_get_description(struct virtgpu * gpu); +char * apir_device_get_name(struct virtgpu * gpu); +char * apir_device_get_description(struct virtgpu * gpu); uint32_t apir_device_get_type(struct virtgpu * gpu); void apir_device_get_memory(struct virtgpu * gpu, size_t * free, size_t * total); bool apir_device_supports_op(struct virtgpu * gpu, const ggml_tensor * op); @@ -17,14 +17,15 @@ void apir_device_get_props(struct virtgpu * gpu, apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu, size_t size, size_t max_tensor_size); /* buffer-type */ -const char * apir_buffer_type_get_name(struct virtgpu * gpu, ggml_backend_buffer_type_t buft); -size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, ggml_backend_buffer_type_t buft); -size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft); -bool apir_buffer_type_is_host(struct virtgpu * gpu, ggml_backend_buffer_type_t buft); -apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu, - ggml_backend_buffer_type_t buffer_buft, - size_t size); -size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu, ggml_backend_buffer_type_t buft, const ggml_tensor * op); +char * apir_buffer_type_get_name(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle); +size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle); +size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle); +apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu, + apir_buffer_type_host_handle_t host_handle, + size_t size); +size_t apir_buffer_type_get_alloc_size(struct virtgpu * gpu, + apir_buffer_type_host_handle_t host_handle, + const ggml_tensor * op); /* buffer */ void * apir_buffer_get_base(struct virtgpu * gpu, apir_buffer_context_t * buffer_context); diff --git a/ggml/src/ggml-virtgpu/virtgpu-shm.cpp b/ggml/src/ggml-virtgpu/virtgpu-shm.cpp index 4def405a6..ce6b3b3e6 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-shm.cpp @@ -85,8 +85,7 @@ int virtgpu_shmem_create(virtgpu * gpu, size_t size, virtgpu_shmem * shmem) { void * ptr = virtgpu_ioctl_map(gpu, gem_handle, size); if (!ptr) { virtgpu_ioctl_gem_close(gpu, gem_handle); - GGML_LOG_ERROR("virtgpu_ioctl_map FAILED\n"); - exit(1); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: virtgpu_ioctl_map failed\n", __func__); return 1; } diff --git a/ggml/src/ggml-virtgpu/virtgpu.cpp b/ggml/src/ggml-virtgpu/virtgpu.cpp index 005c8e21d..1e650dc65 100644 --- a/ggml/src/ggml-virtgpu/virtgpu.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu.cpp @@ -33,7 +33,7 @@ static int virtgpu_handshake(virtgpu * gpu) { encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_HANDSHAKE, 0); if (!encoder) { - GGML_ABORT("%s: failed to prepare the remote call encoder", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); return 1; } @@ -52,7 +52,7 @@ static int virtgpu_handshake(virtgpu * gpu) { log_call_duration(call_duration_ns, "API Remoting handshake"); if (!decoder) { - GGML_ABORT( + GGML_ABORT(GGML_VIRTGPU "%s: failed to initiate the communication with the virglrenderer library. " "Most likely, the wrong virglrenderer library was loaded in the hypervisor.", __func__); @@ -65,7 +65,8 @@ static int virtgpu_handshake(virtgpu * gpu) { uint32_t host_minor; if (ret_magic != APIR_HANDSHAKE_MAGIC) { - GGML_ABORT("%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic, + GGML_ABORT(GGML_VIRTGPU + "%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic, apir_backend_initialize_error(ret_magic)); } else { apir_decode_uint32_t(decoder, &host_major); @@ -78,13 +79,13 @@ static int virtgpu_handshake(virtgpu * gpu) { return 1; } - GGML_LOG_INFO("%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor); - GGML_LOG_INFO("%s: Host is running with %u.%u\n", __func__, host_major, host_minor); + GGML_LOG_INFO(GGML_VIRTGPU "%s: Guest is running with %u.%u\n", __func__, guest_major, guest_minor); + GGML_LOG_INFO(GGML_VIRTGPU "%s: Host is running with %u.%u\n", __func__, host_major, host_minor); if (guest_major != host_major) { - GGML_LOG_ERROR("Host major (%d) and guest major (%d) version differ\n", host_major, guest_major); + GGML_LOG_ERROR(GGML_VIRTGPU "Host major (%d) and guest major (%d) version differ\n", host_major, guest_major); } else if (guest_minor != host_minor) { - GGML_LOG_WARN("Host minor (%d) and guest minor (%d) version differ\n", host_minor, guest_minor); + GGML_LOG_WARN(GGML_VIRTGPU "Host minor (%d) and guest minor (%d) version differ\n", host_minor, guest_minor); } return 0; @@ -97,7 +98,7 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) { encoder = remote_call_prepare(gpu, APIR_COMMAND_TYPE_LOADLIBRARY, 0); if (!encoder) { - GGML_ABORT("%s: hypercall error: failed to prepare the remote call encoder", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: hypercall error: failed to prepare the API Remoting command encoder", __func__); return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR; } @@ -108,36 +109,67 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) { log_call_duration(call_duration_ns, "API Remoting LoadLibrary"); if (!decoder) { - GGML_ABORT("%s: hypercall error: failed to kick the API remoting hypercall.\n", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: hypercall error: failed to trigger the API Remoting hypercall.\n", __func__); return APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR; } remote_call_finish(gpu, encoder, decoder); if (ret == APIR_LOAD_LIBRARY_SUCCESS) { - GGML_LOG_INFO("%s: The API Remoting backend was successfully loaded and initialized\n", __func__); + GGML_LOG_INFO(GGML_VIRTGPU "The API Remoting backend was successfully loaded and initialized\n"); return ret; } // something wrong happened, find out what. - if (ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) { - GGML_ABORT("%s: virglrenderer could not load the API Remoting backend library: %s (code %d)", __func__, - apir_load_library_error(ret), ret); + if (ret == APIR_LOAD_LIBRARY_ENV_VAR_MISSING) { + GGML_ABORT(GGML_VIRTGPU + "%s: virglrenderer could not open the API Remoting backend library, " + "some environment variables are missing. " + "Make sure virglrenderer is correctly configured by the hypervisor. (%s)", + __func__, apir_load_library_error(ret)); + } else if (ret == APIR_LOAD_LIBRARY_CANNOT_OPEN) { + GGML_ABORT(GGML_VIRTGPU + "%s: virglrenderer could not open the API Remoting backend library. " + "Make sure virglrenderer is correctly configured by the hypervisor. (%s)", + __func__, apir_load_library_error(ret)); + } else if (ret == APIR_LOAD_LIBRARY_ENV_VAR_MISSING) { + GGML_ABORT(GGML_VIRTGPU + "%s: could not load the backend library, some symbols are missing. " + "Make sure virglrenderer is correctly configured by the hypervisor. (%s) ", + __func__, apir_load_library_error(ret)); + } else { + GGML_ABORT(GGML_VIRTGPU + "%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", __func__, + apir_load_library_error(ret), ret); + } return ret; } - GGML_LOG_INFO("%s: virglrenderer successfully loaded the API Remoting backend library", __func__); + GGML_LOG_INFO(GGML_VIRTGPU + "%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__); ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX); - if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) { - GGML_ABORT("%s: the API Remoting backend library couldn't load the backend library: apir code=%d | %s)", + if (apir_ret == APIR_LOAD_LIBRARY_CANNOT_OPEN) { + GGML_ABORT(GGML_VIRTGPU + "%s: the API Remoting backend library couldn't load the GGML backend library. " + "Make sure virglrenderer is correctly configured by the hypervisor. (%s)", + __func__, apir_load_library_error(apir_ret)); + } else if (apir_ret == APIR_LOAD_LIBRARY_SYMBOL_MISSING) { + GGML_ABORT(GGML_VIRTGPU + "%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. " + "Make sure virglrenderer is correctly configured by the hypervisor. (%s)", + __func__, apir_load_library_error(apir_ret)); + } else if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) { + GGML_ABORT(GGML_VIRTGPU + "%s: the API Remoting backend library couldn't load the GGML backend library: apir code=%d | %s)", __func__, apir_ret, apir_load_library_error(apir_ret)); } else { uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX; - GGML_ABORT("%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__, + GGML_ABORT(GGML_VIRTGPU + "%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__, lib_ret); } return ret; @@ -149,38 +181,58 @@ virtgpu * create_virtgpu() { gpu->use_apir_capset = getenv("GGML_REMOTING_USE_APIR_CAPSET") != nullptr; util_sparse_array_init(&gpu->shmem_array, sizeof(virtgpu_shmem), 1024); + // Initialize mutex to protect shared data_shmem buffer + if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) { + delete gpu; + GGML_ABORT(GGML_VIRTGPU + "%s: failed to initialize data_shmem mutex", __func__); + return NULL; + } + if (virtgpu_open(gpu) != APIR_SUCCESS) { - GGML_ABORT("%s: failed to open the virtgpu device", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU + "%s: failed to open the virtgpu device\n", __func__); return NULL; } if (virtgpu_init_capset(gpu) != APIR_SUCCESS) { - GGML_ABORT("%s: failed to initialize the GPU capset", __func__); + if (gpu->use_apir_capset) { + GGML_ABORT(GGML_VIRTGPU + "%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library supports it.", __func__); + } else { + GGML_ABORT(GGML_VIRTGPU + "%s: failed to initialize the virtgpu Venus capset", __func__); + } return NULL; } if (virtgpu_init_context(gpu) != APIR_SUCCESS) { - GGML_ABORT("%s: failed to initialize the GPU context", __func__); + GGML_ABORT(GGML_VIRTGPU + "%s: failed to initialize the GPU context", __func__); return NULL; } if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) { - GGML_ABORT("%s: failed to create the shared reply memory pages", __func__); + GGML_ABORT(GGML_VIRTGPU + "%s: failed to create the shared reply memory pages", __func__); return NULL; } if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) { - GGML_ABORT("%s: failed to create the shared data memory pages", __func__); + GGML_ABORT(GGML_VIRTGPU + "%s: failed to create the shared data memory pages", __func__); return NULL; } if (virtgpu_handshake(gpu)) { - GGML_ABORT("%s: failed to handshake with the virglrenderer library", __func__); + GGML_ABORT(GGML_VIRTGPU + "%s: failed to handshake with the virglrenderer library", __func__); return NULL; } if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) { - GGML_ABORT("%s: failed to load the backend library", __func__); + GGML_ABORT(GGML_VIRTGPU + "%s: failed to load the backend library", __func__); return NULL; } @@ -191,7 +243,8 @@ static virt_gpu_result_t virtgpu_open(virtgpu * gpu) { drmDevicePtr devs[8]; int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); if (count < 0) { - GGML_LOG_ERROR("%s: failed to enumerate DRM devices\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU + "%s: failed to enumerate DRM devices\n", __func__); return APIR_ERROR_INITIALIZATION_FAILED; } @@ -213,16 +266,19 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d int fd = open(node_path, O_RDWR | O_CLOEXEC); if (fd < 0) { - GGML_ABORT("failed to open %s", node_path); + GGML_ABORT(GGML_VIRTGPU + "%s: failed to open %s", __func__, node_path); return APIR_ERROR_INITIALIZATION_FAILED; } drmVersionPtr version = drmGetVersion(fd); if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) { if (version) { - GGML_ABORT("unknown DRM driver %s version %d", version->name, version->version_major); + GGML_LOG_ERROR(GGML_VIRTGPU + "%s: unknown DRM driver %s version %d\n", __func__, version->name, version->version_major); } else { - GGML_ABORT("failed to get DRM driver version"); + GGML_LOG_ERROR(GGML_VIRTGPU + "%s: failed to get DRM driver version\n", __func__); } if (version) { @@ -236,7 +292,7 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d drmFreeVersion(version); - GGML_LOG_INFO("using DRM device %s\n", node_path); + GGML_LOG_INFO(GGML_VIRTGPU "using DRM device %s\n", node_path); return APIR_SUCCESS; } @@ -245,7 +301,7 @@ static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu) { assert(!gpu->capset.version); const int ret = virtgpu_ioctl_context_init(gpu, gpu->capset.id); if (ret) { - GGML_LOG_INFO("failed to initialize context: %s\n", strerror(errno)); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to initialize context: %s\n", __func__, strerror(errno)); return APIR_ERROR_INITIALIZATION_FAILED; } @@ -254,10 +310,10 @@ static virt_gpu_result_t virtgpu_init_context(virtgpu * gpu) { static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) { if (gpu->use_apir_capset) { - GGML_LOG_INFO("Using the APIR capset\n"); + GGML_LOG_INFO(GGML_VIRTGPU "Using the APIR capset\n"); gpu->capset.id = VIRTGPU_DRM_CAPSET_APIR; } else { - GGML_LOG_INFO("Using the Venus capset\n"); + GGML_LOG_INFO(GGML_VIRTGPU "Using the Venus capset\n"); gpu->capset.id = VIRTGPU_DRM_CAPSET_VENUS; } gpu->capset.version = 0; @@ -266,7 +322,9 @@ static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) { virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data)); if (ret) { - GGML_LOG_INFO("failed to get APIR v%d capset: %s\n", gpu->capset.version, strerror(errno)); + GGML_LOG_ERROR(GGML_VIRTGPU + "%s: failed to get APIR v%d capset: %s\n", + __func__, gpu->capset.version, strerror(errno)); return APIR_ERROR_INITIALIZATION_FAILED; } @@ -333,9 +391,9 @@ apir_encoder * remote_call_prepare(virtgpu * gpu, ApirCommandType apir_cmd_type, * Prepare the command encoder and its buffer */ - static char encoder_buffer[4096]; + thread_local char encoder_buffer[4096]; - static apir_encoder enc; + thread_local apir_encoder enc; enc = { .cur = encoder_buffer, .start = encoder_buffer, @@ -369,19 +427,19 @@ void remote_call_finish(virtgpu * gpu, apir_encoder * enc, apir_decoder * dec) { UNUSED(gpu); if (!enc) { - GGML_LOG_ERROR("Invalid (null) encoder\n"); + GGML_ABORT(GGML_VIRTGPU "%s: Invalid (null) encoder", __func__); } if (!dec) { - GGML_LOG_ERROR("Invalid (null) decoder\n"); + GGML_ABORT(GGML_VIRTGPU "%s: Invalid (null) decoder", __func__); } if (apir_encoder_get_fatal(enc)) { - GGML_LOG_ERROR("Failed to encode the output parameters.\n"); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: Failed to encode the output parameters.", __func__); } if (apir_decoder_get_fatal(dec)) { - GGML_LOG_ERROR("Failed to decode the input parameters.\n"); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: Failed to decode the input parameters.", __func__); } } @@ -423,7 +481,7 @@ uint32_t remote_call(virtgpu * gpu, int ret = drmIoctl(gpu->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &args); if (ret != 0) { - GGML_ABORT("%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret); + GGML_ABORT(GGML_VIRTGPU "%s: the virtgpu EXECBUFFER ioctl failed (%d)", __func__, ret); } /* @@ -467,7 +525,7 @@ uint32_t remote_call(virtgpu * gpu, } if (max_wait_ms && timedout) { - GGML_LOG_ERROR("timed out waiting for the host answer...\n"); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: timed out waiting for the host answer...\n", __func__); return APIR_FORWARD_TIMEOUT; } @@ -489,10 +547,13 @@ static void log_call_duration(long long call_duration_ns, const char * name) { double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds if (call_duration_s > 1) { - GGML_LOG_INFO("%s: waited %.2fs for the %s host reply...\n", __func__, call_duration_s, name); + GGML_LOG_INFO(GGML_VIRTGPU + "waited %.2fs for the %s host reply...\n", call_duration_s, name); } else if (call_duration_ms > 1) { - GGML_LOG_INFO("%s: waited %.2fms for the %s host reply...\n", __func__, call_duration_ms, name); + GGML_LOG_INFO(GGML_VIRTGPU + "waited %.2fms for the %s host reply...\n", call_duration_ms, name); } else { - GGML_LOG_INFO("%s: waited %lldns for the %s host reply...\n", __func__, call_duration_ns, name); + GGML_LOG_INFO(GGML_VIRTGPU + "waited %lldns for the %s host reply...\n", call_duration_ns, name); } } diff --git a/ggml/src/ggml-virtgpu/virtgpu.h b/ggml/src/ggml-virtgpu/virtgpu.h index d4bb42e20..68e0f3a37 100644 --- a/ggml/src/ggml-virtgpu/virtgpu.h +++ b/ggml/src/ggml-virtgpu/virtgpu.h @@ -17,6 +17,8 @@ #include +#include "ggml-remoting.h" + #define VIRGL_RENDERER_UNSTABLE_APIS 1 #include "apir_hw.h" #include @@ -73,6 +75,27 @@ struct virtgpu { /* APIR communication pages */ virtgpu_shmem reply_shmem; virtgpu_shmem data_shmem; + + /* Mutex to protect shared data_shmem buffer from concurrent access */ + mtx_t data_shmem_mutex; + + /* Cached device information to prevent memory leaks and race conditions */ + struct { + char * description; + char * name; + int32_t device_count; + uint32_t type; + size_t memory_free; + size_t memory_total; + } cached_device_info; + + /* Cached buffer type information to prevent memory leaks and race conditions */ + struct { + apir_buffer_type_host_handle_t host_handle; + char * name; + size_t alignment; + size_t max_size; + } cached_buffer_type; }; static inline int virtgpu_ioctl(virtgpu * gpu, unsigned long request, void * args) { From 25f40ca65f1aa596f8b1702bbac4bc48a45b87d7 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 4 Feb 2026 05:43:28 +0100 Subject: [PATCH 11/11] completion : simplify batch (embd) processing (#19286) * completion : simplify batch (embd) processing This commit simplifies the processing of embd by removing the for loop that currently exists which uses params.n_batch as its increment. This commit also removes the clamping of n_eval as the size of embd is always at most the size of params.n_batch. The motivation is to clarify the code as it is currently a little confusing when looking at this for loop in isolation and thinking that it can process multiple batches. * add an assert to verify n_eval is not greater than n_batch --- tools/completion/completion.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index f368a2f4c..977132756 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -674,15 +674,12 @@ int main(int argc, char ** argv) { } } - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { - int n_eval = (int) embd.size() - i; - if (n_eval > params.n_batch) { - n_eval = params.n_batch; - } - + if (!embd.empty()) { + int n_eval = (int) embd.size(); LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) { + GGML_ASSERT(n_eval <= params.n_batch); + if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) { LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -743,7 +740,7 @@ int main(int argc, char ** argv) { common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; - if ((int) embd.size() >= params.n_batch) { + if ((int) embd.size() == params.n_batch) { break; } }