diff --git a/ggml/src/ggml-cuda/allreduce.cu b/ggml/src/ggml-cuda/allreduce.cu new file mode 100644 index 000000000..434689abd --- /dev/null +++ b/ggml/src/ggml-cuda/allreduce.cu @@ -0,0 +1,968 @@ +#include "allreduce.cuh" + +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) + +#include "convert.cuh" +#include "ggml-impl.h" + +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// CUDA AllReduce for tensor-parallel inference across two GPUs. +// +// Provides an in-place sum reduction over matching tensors on two CUDA +// devices in the same process. Used by the tensor-split path alongside +// NCCL; targets setups without NVLink, where data is exchanged between the +// GPUs by staging it through pinned host memory over PCIe. +// +// Two reduction strategies are selected per call by tensor size: +// +// * Chunked kernel path (small reductions): a single CUDA kernel both +// stages data through pinned host memory and performs the local sum. +// Cross-GPU synchronization happens *inside the kernel* (busy-wait on +// a host-memory flag), which keeps launch overhead low for the +// latency-sensitive token-generation case. +// +// * Copy-engine path (large reductions): the transfer is split into +// D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine, +// followed by a small device-side add kernel. Cross-GPU +// synchronization happens *outside the kernel*, via CUDA events +// between streams. This keeps the compute engine free while large +// transfers are in flight, which matters for prefill-sized tensors. +// Reductions larger than the per-call inner cap are processed by an +// outer chunker that issues sequential inner calls. +// --------------------------------------------------------------------------- + +// --------------------------------------------------------------------------- +// Cross-GPU signal mechanism +// +// One int per (slot, rank) pair in pinned host memory. Each AR call writes a +// strictly increasing token (= the AR call number) into its own arrival int. +// The peer spins until its read of the other's arrival int equals the token +// it expects for this call -- a mismatch means the peer hasn't arrived yet. +// Tokens never repeat over realistic call rates (32-bit int wraps in tens of +// days at thousands of ARs/sec), so arrival ints don't need to be reset +// between calls; we initialize once at pipeline init and let the values +// accumulate. +// +// There is exactly one writer (the owning GPU) and one reader (the peer), so +// we don't need atomics. A volatile store paired with __threadfence_system() +// provides the release ordering that makes the D2H writes visible system-wide +// before the arrival token is observed. +// +// atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable +// on PCIe-attached consumer GPUs without NVLink, so the volatile path is the +// portable choice. +// --------------------------------------------------------------------------- + +static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) { + *(volatile int *)p = token; +} +static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) { + return *(const volatile int *)p; +} + +// Byte spacing between adjacent arrival ints. 64 bytes (one cache line) +// ensures each GPU/block's arrival slot lives on its own line, preventing +// false-sharing stalls on the polling GPU. +static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE = 64; + +// Number of blocks the chunked kernel launches with. Each block stripes a +// disjoint slice of the data and synchronizes through its own arrival-token +// slot so multiple SMs can pump PCIe stores in parallel. +static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8; + +// --------------------------------------------------------------------------- +// Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16. +// +// Both GPUs run this kernel simultaneously on independent streams. sendbuf +// and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other +// carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g. +// T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe). When +// T_dst == T_wire the casts below are no-ops. +// +// Each GPU runs three phases: +// +// Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as +// single-instruction-width vectors into host_mine. +// __threadfence_system() commits these writes to host +// memory. +// Phase 2 (thread 0): write token to arrival_mine; spin until +// arrival_other == token. +// Phase 3 (all threads): read T_wire vectors from host_other, cast +// each element to T_dst, and sum with the local +// sendbuf value (also rounded through T_wire so that +// both GPUs truncate identically -- this guarantees +// bit-equivalent results across the two devices). +// +// Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global +// threads to keep multiple SMs issuing PCIe stores in parallel. Each block +// has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE); +// thread 0 of each block signals/spins on that slot independently of other +// blocks. Tail elements (the leftover < ELEMS_PER_VEC at the end) are +// handled only by block 0 to avoid cross-block writes to the same slots. +// --------------------------------------------------------------------------- +template +static __global__ void ggml_cuda_ar_kernel( + const T_dst * sendbuf, + T_dst * recvbuf, + T_wire * __restrict__ host_mine, + const T_wire * __restrict__ host_other, + int count, + int * arrival_mine, + int * arrival_other, + int token) { + + // Vector unit for the wire type, sized to the arch's widest single-instruction + // copy (16 B on Volta+). Each phase-1 iter writes one vector to host memory; + // each phase-3 iter reads one and produces ELEMS_PER_VEC sums. + constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire); + constexpr int ARRIVAL_INTS = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int)); + + const int tid = threadIdx.x; + const int nt = blockDim.x; + const int bid = blockIdx.x; + const int gtid = bid * nt + tid; + const int gnt = gridDim.x * nt; + const int count_vec = count / ELEMS_PER_VEC; + const int tail = count_vec * ELEMS_PER_VEC; + + // Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors. + { + for (int i = gtid; i < count_vec; i += gnt) { + const int off = i * ELEMS_PER_VEC; + T_wire wire[ELEMS_PER_VEC]; + #pragma unroll + for (int k = 0; k < ELEMS_PER_VEC; ++k) { + wire[k] = ggml_cuda_cast(sendbuf[off + k]); + } + ggml_cuda_memcpy_1(&host_mine[off], wire); + } + if (bid == 0 && tid < count - tail) { + host_mine[tail + tid] = ggml_cuda_cast(sendbuf[tail + tid]); + } + } + + // Commit this block's host writes before signalling. + __threadfence_system(); + __syncthreads(); + + // Phase 2: thread 0 of each block signals on its own arrival slot, then + // spins for the matching slot from peer. Per-block tokens mean blocks + // proceed independently -- no inter-block barrier needed. + if (tid == 0) { + int * my_slot = arrival_mine + bid * ARRIVAL_INTS; + const int * other_slot = arrival_other + bid * ARRIVAL_INTS; + + ggml_cuda_ar_signal_set(my_slot, token); + __threadfence_system(); // make our signal visible system-wide + + while (ggml_cuda_ar_signal_get(other_slot) != token) { +#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA + __nanosleep(100); +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA + } + } + + __syncthreads(); + + // Acquire peer's host_other writes (this block's stripe of them). + __threadfence_system(); + + // Phase 3: read peer's T_wire vector, cast both sides through T_wire for + // bit-equivalence, sum in T_dst precision, and write back to recvbuf. + { + for (int i = gtid; i < count_vec; i += gnt) { + const int off = i * ELEMS_PER_VEC; + T_wire wire[ELEMS_PER_VEC]; + ggml_cuda_memcpy_1(wire, &host_other[off]); + #pragma unroll + for (int k = 0; k < ELEMS_PER_VEC; ++k) { + const T_wire d_low = ggml_cuda_cast(sendbuf[off + k]); + recvbuf[off + k] = ggml_cuda_cast(d_low) + ggml_cuda_cast(wire[k]); + } + } + if (bid == 0 && tid < count - tail) { + const T_wire d_low = ggml_cuda_cast(sendbuf[tail + tid]); + recvbuf[tail + tid] = + ggml_cuda_cast(d_low) + ggml_cuda_cast(host_other[tail + tid]); + } + } +} + +// Combined load-convert-add kernel. The peer's contribution arrives as T_src +// (which may be a lower-precision type than T_dst when the BF16 round-trip is +// active). For bit-equivalence between the two GPUs, dst is first rounded +// through T_src's precision via ggml_cuda_cast -- peer already truncated its +// own value the same way before sending -- so both sides perform identical +// arithmetic. When T_dst == T_src the round-trip cast is a no-op. +template +static __global__ void ggml_cuda_ar_add_kernel( + T_dst * __restrict__ dst, + const T_src * __restrict__ src, + int count) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + const int nt = gridDim.x * blockDim.x; + for (int i = tid; i < count; i += nt) { + const T_src d_low = ggml_cuda_cast(dst[i]); + dst[i] = ggml_cuda_cast(d_low) + ggml_cuda_cast(src[i]); + } +} + +// --------------------------------------------------------------------------- +// Pipeline structure +// --------------------------------------------------------------------------- + +// Number of slots in the event / arrival ring. Two slots is sufficient: +// lockstep guarantees the two GPUs are at most one AR (or chunk) apart, so +// slot[N%2] is always safe to reuse -- peer has already consumed slot[N%2] +// from AR N-2 by the time we get to AR N. acquire_slot's +// cudaEventSynchronize on ev.ker for both devices makes that consumption +// explicit before we overwrite host_buf[slot] for the new AR. +static constexpr int GGML_CUDA_AR_POOL_SIZE = 2; + +// Maximum chunk size (bytes per GPU) handled by one chunked kernel launch. +// Larger tensors are reduced by issuing multiple chunked launches. +static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 1024 * 1024; // 1 MB + +// Copy-engine path: largest tensor accepted on this path; sets host_large / +// dev_tmp allocation size. +static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = 32 * 1024 * 1024; // 32 MB + +// AR wire size at which the copy-engine path takes over from the chunked- +// kernel path. Override via GGML_CUDA_AR_COPY_THRESHOLD. +static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = 1024 * 1024; // 1 MB +// Per-call CE chunk-size heuristic: chunk_bytes = clamp(nbytes / 4, MIN, MAX). +// The /4 keeps ~4 chunks in flight at any moment (good D2H/H2D overlap with +// the peer); the clamps cover the cases where nbytes/4 is too small (per- +// memcpy fixed cost dominates) or too large (chunk-level pipelining stalls). +// Env var GGML_CUDA_AR_COPY_CHUNK_BYTES can override with a fixed value. +static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = 512 * 1024; // 512 KB +static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = 2 * 1024 * 1024; // 2 MB +// Absolute floor that an env-var override is allowed to set; this caps the +// per-slot copy-event array. 256 KB -> up to 128 chunks per 32 MB tensor. +static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = 256 * 1024; +static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS = + static_cast((GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) / + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN); + +struct ggml_cuda_ar_event_slot { + cudaEvent_t app = nullptr; // upstream computation complete + cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS] = {}; // copy-engine D2H chunks complete + cudaEvent_t h2d = nullptr; // copy-engine H2Ds complete (handoff AR stream -> compute stream) + cudaEvent_t ker = nullptr; // AllReduce kernel complete +}; + +// Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer +// in one place, with the host handle preserved for cudaFreeHost. Used where +// the CPU never touches the buffer -- only the device reads/writes via the +// mapped device pointer. Required on systems where cudaDevAttrCanUseHost- +// PointerForRegisteredMem is 0 and the host pointer can't be used as a +// device pointer. +struct ggml_cuda_ar_host_mapping { + uint8_t * host = nullptr; // cudaFreeHost handle; also the H-side ptr for cudaMemcpyAsync + uint8_t * dev = nullptr; // device-side pointer for kernels / cudaMemset + + cudaError_t alloc(size_t bytes) { + cudaError_t rc = cudaHostAlloc(reinterpret_cast(&host), bytes, + cudaHostAllocPortable | cudaHostAllocMapped); + if (rc != cudaSuccess) { + host = nullptr; + return rc; + } + rc = cudaHostGetDevicePointer(reinterpret_cast(&dev), host, 0); + if (rc != cudaSuccess) { + cudaFreeHost(host); + host = nullptr; + dev = nullptr; + } + return rc; + } + + void free() { + if (host) { + cudaFreeHost(host); + host = nullptr; + dev = nullptr; + } + } +}; + +struct ggml_cuda_ar_pipeline { + int n_devices; + int devices[GGML_CUDA_MAX_DEVICES]; + size_t buf_bytes; // bytes per device in host_buf[] + size_t copy_bytes; // bytes per device in host_large[] / dev_tmp[] + size_t copy_threshold; + size_t copy_chunk_bytes; + size_t bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables + uint64_t call_count; + + // Per-device resources. + ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES]; // pinned staging (chunked kernel) + ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine) + char * dev_tmp[GGML_CUDA_MAX_DEVICES]; // device scratch for copy-engine path + cudaStream_t streams[GGML_CUDA_MAX_DEVICES]; // non-blocking + ggml_cuda_ar_event_slot ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE]; + + // Copy-engine: per-device "I finished reading my peer's host_large" + // event. Indexed by RECORDER device. Recorded same-device on streams[i] + // after stage 2's last H2D from host_large[peer]. Waited cross-device + // by peer's stage-1 stream before the next AR overwrites host_large[peer]. + cudaEvent_t host_large_read_done[GGML_CUDA_MAX_DEVICES]; + bool host_large_read_done_valid; + + // Copy-engine: per-device "my add_kernel is done with dev_tmp" event. + // Recorded on the compute stream after each add_kernel; the AR stream + // waits on it before the next copy_impl's H2D overwrites dev_tmp. Lets us + // single-buffer dev_tmp despite add_kernel running on a separate stream. + cudaEvent_t dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES]; + bool dev_tmp_kernel_done_valid; + + // Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints. Mapped pinned + // memory; CPU never reads/writes -- only the kernel and cudaMemset. + // Use ggml_cuda_ar_arrival_ptr() to index. + ggml_cuda_ar_host_mapping arrival; +}; + +// Base pointer for the (slot, rank) per-block token block. The kernel adds +// blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot. +static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) { + const size_t offset = ((size_t)slot * p->n_devices + rank) * + GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE; + return reinterpret_cast(p->arrival.dev + offset); +} + +static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) { + const char * value = getenv(name); + if (value == nullptr || value[0] == '\0') { + return default_value; + } + + char * end = nullptr; + const unsigned long long parsed = strtoull(value, &end, 10); + return end != value ? (uint64_t) parsed : default_value; +} + +struct ggml_cuda_ar_slot_info { + int slot; + int token; +}; + +static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) { + const int slot = static_cast(p->call_count % GGML_CUDA_AR_POOL_SIZE); + const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE; + p->call_count++; + + if (pool_lapped) { + for (int i = 0; i < p->n_devices; ++i) { + ggml_cuda_set_device(p->devices[i]); + CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker)); + } + } + + return { slot, (int) p->call_count }; +} + +// Per-AR copy-engine chunk size: env-var override if set, else heuristic +// (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)). +static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) { + if (p->copy_chunk_bytes > 0) { + return p->copy_chunk_bytes; + } + return std::min(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX, + std::max(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN, nbytes / 4)); +} + +static void ggml_cuda_ar_wait_for_compute( + ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) { + ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot]; + CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream())); + CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], ev.app)); +} + +// --------------------------------------------------------------------------- +// Init / free +// --------------------------------------------------------------------------- + +ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) { + + if (n_devices != 2) { + GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); " + "falling back\n", __func__, n_devices); + return nullptr; + } + + // The chunked kernel uses __nanosleep, which is sm70+ (Volta+). + for (size_t i = 0; i < n_devices; ++i) { + const int cc = ggml_cuda_info().devices[devices[i]].cc; + if (cc < GGML_CUDA_CC_VOLTA) { + GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d " + "(device %d has cc=%d); falling back\n", + __func__, GGML_CUDA_CC_VOLTA, devices[i], cc); + return nullptr; + } + } + + auto * p = new ggml_cuda_ar_pipeline{}; + p->n_devices = n_devices; + p->copy_bytes = GGML_CUDA_AR_COPY_MAX_BYTES; + p->copy_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT); + // 0 = use the per-call heuristic (default). Non-zero env value forces a + // fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN. + p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0); + if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) { + GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n", + __func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN); + p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN; + } + // Default 1: BF16 round-trip is always on for F32 inputs (any non-zero + // ne). Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger + // byte threshold to opt out for small tensors. + p->bf16_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1); + for (size_t i = 0; i < n_devices; ++i) { + p->devices[i] = devices[i]; + } + + // Per-device streams and event pools. + for (size_t i = 0; i < n_devices; ++i) { + ggml_cuda_set_device(p->devices[i]); + + cudaStream_t stream = nullptr; + if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) { + GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n", + __func__, p->devices[i]); + ggml_cuda_ar_pipeline_free(p); + return nullptr; + } + p->streams[i] = stream; + + for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) { + bool ok = + cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess && + cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess && + cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess; + for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) { + ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess; + } + if (!ok) { + GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n", + __func__, p->devices[i], s); + ggml_cuda_ar_pipeline_free(p); + return nullptr; + } + } + + if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) { + GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n", + __func__, p->devices[i]); + ggml_cuda_ar_pipeline_free(p); + return nullptr; + } + if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) { + GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n", + __func__, p->devices[i]); + ggml_cuda_ar_pipeline_free(p); + return nullptr; + } + } + + // Arrival ring: cache-line padded so each GPU's int is on its own line. + const size_t arrival_bytes = + (size_t)GGML_CUDA_AR_POOL_SIZE * n_devices * + GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE; + if (p->arrival.alloc(arrival_bytes) != cudaSuccess) { + GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n", + __func__, arrival_bytes); + ggml_cuda_ar_pipeline_free(p); + return nullptr; + } + ggml_cuda_set_device(p->devices[0]); + if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) { + GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n", + __func__, arrival_bytes); + ggml_cuda_ar_pipeline_free(p); + return nullptr; + } + + // Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked- + // kernel can write the next slot's data while the peer is still reading + // the previous slot's. Indexed by (slot * buf_bytes) at the call site. + p->buf_bytes = GGML_CUDA_AR_MAX_BYTES; + const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes; + for (size_t i = 0; i < n_devices; ++i) { + if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) { + GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n", + __func__, host_buf_total); + ggml_cuda_ar_pipeline_free(p); + return nullptr; + } + } + + // Copy-engine path: pinned host staging + device scratch, sized for the + // largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES). + // dev_tmp is single-buffered; cross-AR safety is enforced by an explicit + // cross-stream wait in copy_impl on the prior AR's add_kernel-done event. + for (size_t i = 0; i < n_devices; ++i) { + ggml_cuda_set_device(p->devices[i]); + if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) { + GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n", + __func__, p->copy_bytes); + ggml_cuda_ar_pipeline_free(p); + return nullptr; + } + if (cudaMalloc(reinterpret_cast(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) { + GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n", + __func__, p->copy_bytes, p->devices[i]); + ggml_cuda_ar_pipeline_free(p); + return nullptr; + } + } + + GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, " + "%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n", + __func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20); + + return p; +} + +void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) { + if (!p) { + return; + } + + // Drain all in-flight kernels before tearing down resources. + for (int i = 0; i < p->n_devices; ++i) { + if (p->streams[i]) { + ggml_cuda_set_device(p->devices[i]); + cudaStreamSynchronize(p->streams[i]); + } + } + + for (int i = 0; i < p->n_devices; ++i) { + p->host_buf[i].free(); + p->host_large[i].free(); + if (p->dev_tmp[i]) { + ggml_cuda_set_device(p->devices[i]); + cudaFree(p->dev_tmp[i]); + } + ggml_cuda_set_device(p->devices[i]); + for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) { + if (p->ev_pool[i][s].app) { cudaEventDestroy(p->ev_pool[i][s].app); } + for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) { + if (p->ev_pool[i][s].cpy[c]) { cudaEventDestroy(p->ev_pool[i][s].cpy[c]); } + } + if (p->ev_pool[i][s].h2d) { cudaEventDestroy(p->ev_pool[i][s].h2d); } + if (p->ev_pool[i][s].ker) { cudaEventDestroy(p->ev_pool[i][s].ker); } + } + if (p->host_large_read_done[i]) { + ggml_cuda_set_device(p->devices[i]); + cudaEventDestroy(p->host_large_read_done[i]); + } + if (p->dev_tmp_kernel_done[i]) { + ggml_cuda_set_device(p->devices[i]); + cudaEventDestroy(p->dev_tmp_kernel_done[i]); + } + if (p->streams[i]) { + ggml_cuda_set_device(p->devices[i]); + cudaStreamDestroy(p->streams[i]); + } + } + p->arrival.free(); + delete p; +} + +// --------------------------------------------------------------------------- +// Dispatch +// --------------------------------------------------------------------------- + +// Asymmetric copy_impl: data sent over PCIe in T_src precision (one element of +// nbytes per ne element); accumulated locally into a T_dst buffer. When +// T_src == T_dst this is the original homogeneous reduction. When they differ +// (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src +// for bit-equivalence between GPUs and we skip the otherwise-needed +// post-conversion entirely. +template +static bool ggml_cuda_ar_allreduce_copy_impl( + ggml_cuda_ar_pipeline * p, + ggml_backend_t * backends, + T_src * const src_buf[GGML_CUDA_MAX_DEVICES], + T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES], + const bool compute[GGML_CUDA_MAX_DEVICES], + int64_t ne, + size_t nbytes) { + GGML_ASSERT(p->n_devices == 2); + GGML_ASSERT(nbytes <= p->copy_bytes); + GGML_ASSERT(ne <= std::numeric_limits::max()); + + const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes); + GGML_ASSERT(chunk_bytes > 0); + + const int slot = ggml_cuda_ar_acquire_slot(p).slot; + const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes; + GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS); + + ggml_backend_cuda_context * cuda_ctx[2] = {}; + + // Stage 1: both GPUs copy their local contribution to pinned host memory. + for (int i = 0; i < 2; ++i) { + ggml_cuda_set_device(p->devices[i]); + cuda_ctx[i] = static_cast(backends[i]->context); + GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]); + + ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot); + + // Wait for peer's H2D from our host_large[i] (recorded in the + // previous AR's stage 2) to complete before we overwrite host_large[i]. + // host_large_read_done[peer] = peer finished reading host_large[i]. + // No-op on the first AR -- no prior record exists. + if (p->host_large_read_done_valid) { + const int peer = 1 - i; + CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer])); + } + + if (!compute[i]) { + CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i])); + } + + for (size_t c = 0; c < copy_chunks; ++c) { + const size_t offset = c * chunk_bytes; + const size_t this_bytes = (nbytes - offset) < chunk_bytes ? + (nbytes - offset) : chunk_bytes; + + CUDA_CHECK(cudaMemcpyAsync( + p->host_large[i].host + offset, reinterpret_cast(src_buf[i]) + offset, this_bytes, + cudaMemcpyDeviceToHost, p->streams[i])); + CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i])); + } + } + + // Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to + // local device scratch (dev_tmp), then performs one device-local add over + // the assembled peer tensor. The H2Ds run on the AR stream (copy engine) + // and the add_kernel runs on the caller's compute stream, so the AR stream + // stays pure-copy and avoids an in-stream copy->compute engine switch every + // AR. dev_tmp is single-buffered: the AR stream waits cross-stream on the + // prior AR's add_kernel-done event before overwriting it. + for (int i = 0; i < 2; ++i) { + const int peer = 1 - i; + ggml_cuda_set_device(p->devices[i]); + + // Wait for the previous AR's add_kernel (on the compute stream) to + // finish reading dev_tmp before our H2D overwrites it. No-op on the + // first copy_impl call. + if (p->dev_tmp_kernel_done_valid) { + CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i])); + } + + for (size_t c = 0; c < copy_chunks; ++c) { + const size_t offset = c * chunk_bytes; + const size_t this_bytes = (nbytes - offset) < chunk_bytes ? + (nbytes - offset) : chunk_bytes; + + CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c])); + CUDA_CHECK(cudaMemcpyAsync( + p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes, + cudaMemcpyHostToDevice, p->streams[i])); + } + + // Mark our reads of host_large[peer] complete so peer's next AR can + // safely overwrite it. + CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i])); + + // Hand off from AR stream (copy engine) to compute stream: compute + // stream waits for all H2Ds to finish, then runs the add_kernel. + CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i])); + CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d)); + + const int block_size = 256; + int n_blocks = (int) ((ne + block_size - 1) / block_size); + if (n_blocks > 1024) { + n_blocks = 1024; + } + ggml_cuda_ar_add_kernel<<stream()>>>( + dst_buf[i], + reinterpret_cast(p->dev_tmp[i]), + (int) ne); + CUDA_CHECK(cudaGetLastError()); + + // Record dev_tmp-released on the compute stream so the next copy_impl + // can wait for the kernel to finish before overwriting dev_tmp. Also + // record AR-done as ev.ker for acquire_slot's pool-wraparound sync. + CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream())); + CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream())); + } + p->host_large_read_done_valid = true; + p->dev_tmp_kernel_done_valid = true; + + return true; +} + +// Outer-level chunker: copy_impl handles up to copy_bytes per call (limited by +// the host_large / dev_tmp allocation size). When the full AR exceeds that, +// slice the tensor into copy_bytes-sized pieces and call copy_impl repeatedly. +// Each slice goes through its own stage 1 -> stage 2 cycle and acquires its own +// slot, so cross-AR fences and pool wraparound work the same way as for any +// other sequence of small ARs. +template +static bool ggml_cuda_ar_allreduce_copy_outer( + ggml_cuda_ar_pipeline * p, + ggml_backend_t * backends, + T_src * const src_buf[GGML_CUDA_MAX_DEVICES], + T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES], + const bool compute[GGML_CUDA_MAX_DEVICES], + int64_t ne) { + const int64_t outer_max_elems = (int64_t) (p->copy_bytes / sizeof(T_src)); + GGML_ASSERT(outer_max_elems > 0); + + bool ok = true; + for (int64_t outer_start = 0; outer_start < ne && ok; outer_start += outer_max_elems) { + const int64_t outer_ne = std::min(outer_max_elems, ne - outer_start); + const size_t outer_nbytes = (size_t) outer_ne * sizeof(T_src); + + T_src * src[GGML_CUDA_MAX_DEVICES] = {}; + T_dst * dst[GGML_CUDA_MAX_DEVICES] = {}; + for (int i = 0; i < p->n_devices; ++i) { + src[i] = src_buf[i] + outer_start; + dst[i] = dst_buf[i] + outer_start; + } + ok = ggml_cuda_ar_allreduce_copy_impl( + p, backends, src, dst, compute, outer_ne, outer_nbytes); + } + return ok; +} + +bool ggml_cuda_ar_allreduce( + ggml_cuda_ar_pipeline * p, + ggml_backend_t * backends, + ggml_tensor ** tensors) { + GGML_ASSERT(p != nullptr); + + const int n = p->n_devices; + GGML_ASSERT(n == 2); + + const ggml_type input_type = tensors[0]->type; + GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16); + + const int64_t ne = ggml_nelements(tensors[0]); + GGML_ASSERT(ne > 0); + + const size_t input_nbytes = ggml_nbytes(tensors[0]); + + // BF16 round-trip: F32 inputs >= bf16_threshold are converted to BF16 for + // the reduction (chunked or copy-engine), halving on-wire bytes. Matches + // NCCL's behaviour. The pre-conversion zeroes inactive shards so the + // inner paths see them as already-prepared compute tensors. + const bool use_bf16 = + input_type == GGML_TYPE_F32 && + p->bf16_threshold > 0 && + input_nbytes >= p->bf16_threshold; + + const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type; + const size_t type_size = ggml_type_size(kernel_type); + GGML_ASSERT(p->buf_bytes >= type_size); + const size_t nbytes = (size_t) ne * type_size; + + bool compute_flag[GGML_CUDA_MAX_DEVICES] = {}; + for (int i = 0; i < n; ++i) { + compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0; + } + + // Decide between copy-engine and chunked kernel paths based on the working + // type's actual byte count. No upper bound: copy_outer slices reductions + // larger than copy_bytes into copy_bytes-sized pieces. + const bool use_copy_engine = + p->copy_threshold > 0 && + nbytes >= p->copy_threshold; + + // BF16 inactive-shard zeroing: when use_bf16 is on, the combined kernel + // (chunked kernel path) and the combined add kernel (copy_engine path) + // both accumulate into the F32 tensor data directly, so an inactive + // shard's accumulator must start at zero. + if (use_bf16) { + for (int i = 0; i < n; ++i) { + if (!compute_flag[i]) { + auto * cuda_ctx = static_cast(backends[i]->context); + GGML_ASSERT(cuda_ctx->device == p->devices[i]); + ggml_cuda_set_device(p->devices[i]); + CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream())); + } + } + } + + // Pre-convert F32 -> BF16 into bf16_tmp ONLY for the copy_engine + use_bf16 + // path; the chunked kernel path's combined kernel does the conversion + // inline as it writes to host_buf. + ggml_cuda_pool_alloc bf16_tmp[GGML_CUDA_MAX_DEVICES]; + void * copy_src_ptr[GGML_CUDA_MAX_DEVICES] = {}; + + if (use_copy_engine && use_bf16) { + to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32); + for (int i = 0; i < n; ++i) { + auto * cuda_ctx = static_cast(backends[i]->context); + GGML_ASSERT(cuda_ctx->device == p->devices[i]); + bf16_tmp[i].pool = &cuda_ctx->pool(); + bf16_tmp[i].alloc(ne); + ggml_cuda_set_device(p->devices[i]); + if (compute_flag[i]) { + to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream()); + CUDA_CHECK(cudaGetLastError()); + } else { + CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream())); + } + copy_src_ptr[i] = bf16_tmp[i].get(); + } + } + + bool ok = true; + if (use_copy_engine) { + // After up-front BF16 conversion, the tmp buffers already hold the + // (possibly zeroed-for-inactive) data, so the inner path can treat + // every shard as compute. + bool inner_compute[GGML_CUDA_MAX_DEVICES]; + for (int i = 0; i < n; ++i) { + inner_compute[i] = use_bf16 ? true : compute_flag[i]; + } + + // Dispatch into copy_impl with explicit src/dst types. When use_bf16 + // is on, the wire type is BF16 (src = bf16_tmp) and the accumulator + // is F32 (dst = tensors[i]->data); the combined add kernel rounds dst + // through BF16 for bit-equivalence and writes F32 directly, so no + // post-conversion is needed. Otherwise src == dst (same native type). + if (use_bf16) { + GGML_ASSERT(kernel_type == GGML_TYPE_BF16); + nv_bfloat16 * src[GGML_CUDA_MAX_DEVICES] = {}; + float * dst[GGML_CUDA_MAX_DEVICES] = {}; + for (int i = 0; i < n; ++i) { + src[i] = static_cast(copy_src_ptr[i]); + dst[i] = static_cast(tensors[i]->data); + } + ok = ggml_cuda_ar_allreduce_copy_outer( + p, backends, src, dst, inner_compute, ne); + } else { + switch (kernel_type) { + case GGML_TYPE_F32: { + float * buf[GGML_CUDA_MAX_DEVICES] = {}; + for (int i = 0; i < n; ++i) { + buf[i] = static_cast(tensors[i]->data); + } + ok = ggml_cuda_ar_allreduce_copy_outer( + p, backends, buf, buf, inner_compute, ne); + break; + } + case GGML_TYPE_BF16: { + nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {}; + for (int i = 0; i < n; ++i) { + buf[i] = static_cast(tensors[i]->data); + } + ok = ggml_cuda_ar_allreduce_copy_outer( + p, backends, buf, buf, inner_compute, ne); + break; + } + case GGML_TYPE_F16: { + half * buf[GGML_CUDA_MAX_DEVICES] = {}; + for (int i = 0; i < n; ++i) { + buf[i] = static_cast(tensors[i]->data); + } + ok = ggml_cuda_ar_allreduce_copy_outer( + p, backends, buf, buf, inner_compute, ne); + break; + } + default: + GGML_ASSERT(false); + } + } + } else { + // host_buf carries T_wire-typed data; max_chunk_elems is the count that + // fits in one host_buf at the wire size. + const size_t max_chunk_elems = p->buf_bytes / type_size; + const size_t input_type_size = ggml_type_size(input_type); + + // Chunked kernel path runs entirely on the caller's compute stream: + // since AR is a barrier here, same-stream ordering subsumes any + // cross-stream event handshake that the copy-engine path needs, and + // skips the cross-stream scheduling overhead that was hurting the + // small-tensor (tg) latency on the AR-stream variant. Only ev.ker is + // still recorded at end-of-AR for acquire_slot's pool-wraparound check. + for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) { + const size_t remaining_elems = (size_t) (ne - chunk_start); + const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems; + const size_t chunk_dst_bytes = chunk_elems * input_type_size; + + const auto [slot, token] = ggml_cuda_ar_acquire_slot(p); + const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne; + + for (int i = 0; i < n; ++i) { + const int peer = 1 - i; // valid for n == 2 only + ggml_cuda_set_device(p->devices[i]); + auto * cuda_ctx = static_cast(backends[i]->context); + GGML_ASSERT(cuda_ctx->device == p->devices[i]); + cudaStream_t stream = cuda_ctx->stream(); + + char * data = static_cast(tensors[i]->data) + chunk_start * (int64_t) input_type_size; + + // Match NCCL/meta-backend semantics: inactive shards contribute + // zeros. On the BF16 path the F32 tensor data was already + // zeroed up-front (above), so per-chunk zeroing isn't needed. + if (!compute_flag[i] && !use_bf16) { + CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream)); + } + +#define LAUNCH_AR_KERNEL(T_dst, T_wire) \ + ggml_cuda_ar_kernel<<>>( \ + reinterpret_cast(data), \ + reinterpret_cast(data), \ + reinterpret_cast(p->host_buf[i].dev + (size_t) slot * p->buf_bytes), \ + reinterpret_cast(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes), \ + static_cast(chunk_elems), \ + ggml_cuda_ar_arrival_ptr(p, slot, i), \ + ggml_cuda_ar_arrival_ptr(p, slot, peer), \ + token) + + if (use_bf16) { + GGML_ASSERT(input_type == GGML_TYPE_F32); + LAUNCH_AR_KERNEL(float, nv_bfloat16); + } else { + switch (input_type) { + case GGML_TYPE_F32: LAUNCH_AR_KERNEL(float, float); break; + case GGML_TYPE_F16: LAUNCH_AR_KERNEL(half, half); break; + case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break; + default: GGML_ASSERT(false); + } + } + +#undef LAUNCH_AR_KERNEL + CUDA_CHECK(cudaGetLastError()); + + if (last_chunk) { + CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream)); + } + } + } + } + + return ok; +} + +#else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) + +// HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable +// / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this +// implementation relies on, so the internal AllReduce is a CUDA-only feature. +// The dispatcher in ggml-cuda.cu treats a nullptr pipeline as "init failed" +// and silently falls back to the meta backend's generic AllReduce. +ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int *, size_t) { + return nullptr; +} +void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline *) { +} +bool ggml_cuda_ar_allreduce(ggml_cuda_ar_pipeline *, ggml_backend_t *, ggml_tensor **) { + return false; +} + +#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) diff --git a/ggml/src/ggml-cuda/allreduce.cuh b/ggml/src/ggml-cuda/allreduce.cuh new file mode 100644 index 000000000..0f2c9518d --- /dev/null +++ b/ggml/src/ggml-cuda/allreduce.cuh @@ -0,0 +1,29 @@ +#pragma once + +#include "common.cuh" +#include "ggml-backend-impl.h" + +#include + +// Opaque pipeline context -- owns all pinned buffers, streams, and events. +struct ggml_cuda_ar_pipeline; + +// Allocate a pipeline for n_devices GPUs. +// devices[] holds the CUDA device IDs in rank order. +// Returns nullptr on allocation failure. +ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init( + const int * devices, size_t n_devices); + +// Release all resources owned by the pipeline. +void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline); + +// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1]. +// tensors[i] must live on the device managed by backends[i] and be +// contiguous F32, F16, or BF16. +// Preconditions are checked by the CUDA comm dispatcher before calling this. +// Returns true once the reduction work has been enqueued successfully. +bool ggml_cuda_ar_allreduce( + ggml_cuda_ar_pipeline * pipeline, + ggml_backend_t * backends, + ggml_tensor ** tensors); + diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 4df1b9308..b92a20870 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2,6 +2,7 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-cuda/allreduce.cuh" #include "ggml-cuda/common.cuh" #include "ggml-cuda/acc.cuh" #include "ggml-cuda/add-id.cuh" @@ -86,6 +87,9 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); +#define GGML_LOG_WARN_ONCE(str) \ + { static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); } + [[noreturn]] void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { int id = -1; // in case cudaGetDevice fails @@ -1139,70 +1143,46 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; -#ifdef GGML_USE_NCCL +// Communication context for multi-GPU AllReduce during tensor parallelism. +// +// Created once per meta backend instance. Resources for the selected mode +// (NCCL communicators or the internal AllReduce pipeline) are initialised +// eagerly during comm_init so any init failure surfaces at startup rather +// than mid-run. struct ggml_backend_cuda_comm_context { + using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **); + std::vector backends; - std::vector comms; + std::vector dev_ids; + + // Set by the init chain (comm_init_{nccl, internal, none}) to one of + // try_allreduce_{nccl, internal, butterfly}. nccl needs `comms`, + // internal needs `ar_pipeline`, butterfly needs nothing. Per-call + // failures return false; the meta backend's generic implementation then + // handles that call. + try_allreduce_fn try_allreduce = nullptr; + + ggml_cuda_ar_pipeline * ar_pipeline = nullptr; + +#ifdef GGML_USE_NCCL + std::vector comms; +#endif // GGML_USE_NCCL ~ggml_backend_cuda_comm_context() { +#ifdef GGML_USE_NCCL for (ncclComm_t comm : comms) { NCCL_CHECK(ncclCommDestroy(comm)); } +#endif // GGML_USE_NCCL + ggml_cuda_ar_pipeline_free(ar_pipeline); } }; -#endif // GGML_USE_NCCL -static void ggml_backend_cuda_comm_free(void * comm_ctx_v) { -#ifdef GGML_USE_NCCL - if (comm_ctx_v == nullptr) { - return; - } - ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v; - delete comm_ctx; -#else - GGML_UNUSED(comm_ctx_v); -#endif // GGML_USE_NCCL -} - -static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) { -#ifdef GGML_USE_NCCL - for (size_t i = 0; i < n_backends; i++) { - if (!ggml_backend_is_cuda(backends[i])) { - return nullptr; - } - } - ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context; - std::vector dev_ids; - ret->backends.reserve(n_backends); - dev_ids.reserve(n_backends); - for (size_t i = 0; i < n_backends; i++) { - ret->backends.push_back(backends[i]); - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context; - dev_ids.push_back(cuda_ctx->device); - } - - ret->comms.resize(n_backends); - NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data())); - return ret; -#else - // If NCCL is installed it is used by default for optimal performance. - // However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package. - // RCCL is disabled by default, users are explicitly opting in. - // Therefore print no warning for RCCL. -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - static bool warning_printed = false; - if (!warning_printed) { - GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__); - warning_printed = true; - } -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - GGML_UNUSED_VARS(backends, n_backends); - return nullptr; -#endif // GGML_USE_NCCL -} - -static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) { #ifdef GGML_USE_NCCL +// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large +// tensors (bandwidth-bound), then converts back to FP32. +static bool ggml_backend_cuda_comm_allreduce_nccl( + ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) { const int64_t ne = ggml_nelements(tensors[0]); // FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0 // This then causes a crash in this function @@ -1210,8 +1190,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg return true; } - GGML_ASSERT(comm_ctx_v != nullptr); - ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v; const size_t n_backends = comm_ctx->backends.size(); for (size_t i = 0; i < n_backends; ++i) { @@ -1236,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream())); } NCCL_CHECK(ncclGroupEnd()); - return true; } @@ -1275,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg } return true; -#else - GGML_UNUSED_VARS(comm_ctx_v, tensors); - return false; +} #endif // GGML_USE_NCCL + +// Run the internal AR pipeline. Returns false on unsupported / failed input +// -- the caller decides whether to abort (env-forced) or fall back silently. +static bool ggml_backend_cuda_comm_allreduce_internal( + ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) { + GGML_ASSERT(comm_ctx->ar_pipeline != nullptr); + + const size_t n_backends = comm_ctx->backends.size(); + GGML_ASSERT(n_backends == 2); + GGML_ASSERT(tensors[0] != nullptr); + + const int64_t ne = ggml_nelements(tensors[0]); + const ggml_type type = tensors[0]->type; + + if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) { + GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type); + return false; + } + + if (ne == 0) { + return true; + } + + for (size_t i = 0; i < n_backends; ++i) { + if (tensors[i] == nullptr) { + GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i); + return false; + } + if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) { + GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n", + __func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type); + return false; + } + if (!ggml_is_contiguously_allocated(tensors[i])) { + GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n", + __func__, i, ne, ggml_nbytes(tensors[i]), + (size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type); + return false; + } + if (((uintptr_t) tensors[i]->data & 0xF) != 0) { + GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n", + __func__, i, tensors[i]->data, (int) type, ne); + return false; + } + GGML_ASSERT((ggml_nbytes(tensors[i]) & 0xF) == 0); + } + + return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors); +} + +// --------------------------------------------------------------------------- +// Per-call dispatch -- three variants, one per backend. Each is set as +// comm_ctx->try_allreduce by the matching init step. Per-call failure +// returns false; the meta backend's generic implementation handles that call. +// --------------------------------------------------------------------------- + +#ifdef GGML_USE_NCCL +static bool ggml_backend_cuda_comm_try_allreduce_nccl( + ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) { + return ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors); +} +#endif // GGML_USE_NCCL + +static bool ggml_backend_cuda_comm_try_allreduce_internal( + ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) { + return ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors); +} + +static bool ggml_backend_cuda_comm_try_allreduce_butterfly( + ggml_backend_cuda_comm_context *, struct ggml_tensor **) { + return false; +} + +static void ggml_backend_cuda_comm_free(void * comm_ctx_v) { + if (comm_ctx_v == nullptr) { + return; + } + delete static_cast(comm_ctx_v); +} + +// --------------------------------------------------------------------------- +// Init -- chained nccl -> internal -> none. Each step tries to bring up its +// resource; on failure it warns and recurses into the next step. +// --------------------------------------------------------------------------- +static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) { + ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly; +} + +static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) { + ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), ret->dev_ids.size()); + if (ret->ar_pipeline) { + ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal; + return; + } + + // Clear sticky CUDA error from the failed init. + (void) cudaGetLastError(); + GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); " + "falling back to meta-backend butterfly\n"); + ggml_backend_cuda_comm_init_none(ret); +} + +static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) { +#ifdef GGML_USE_NCCL + const size_t n = ret->dev_ids.size(); + ret->comms.resize(n); + ncclResult_t rc = ncclCommInitAll(ret->comms.data(), (int) n, ret->dev_ids.data()); + if (rc == ncclSuccess) { + ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl; + return; + } + + ret->comms.clear(); + GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n", + ncclGetErrorString(rc)); +#else // GGML_USE_NCCL +#ifndef GGML_USE_HIP + GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce. " + "Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n"); +#endif // !GGML_USE_HIP +#endif // GGML_USE_NCCL + + ggml_backend_cuda_comm_init_internal(ret); +} + +// Top-level init. Picks one of the three init paths based on +// GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle +// any fallback. Unrecognised env values warn and fall through to the +// platform default. +static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) { + for (size_t i = 0; i < n_backends; i++) { + if (!ggml_backend_is_cuda(backends[i])) { + return nullptr; + } + } + + auto * ret = new ggml_backend_cuda_comm_context; + ret->backends.assign(backends, backends + n_backends); + ret->dev_ids.reserve(n_backends); + for (size_t i = 0; i < n_backends; i++) { + ret->dev_ids.push_back(static_cast(backends[i]->context)->device); + } + + const char * env = getenv("GGML_CUDA_ALLREDUCE"); + if (!env) { + // Platform default: Linux uses NCCL, otherwise (generally Windows) internal +#if defined(__linux__) + ggml_backend_cuda_comm_init_nccl(ret); +#else + ggml_backend_cuda_comm_init_internal(ret); +#endif // defined(__linux__) + } else { + std::string env_str(env); + if (env_str == "nccl") { + ggml_backend_cuda_comm_init_nccl(ret); + } else if (env_str == "internal") { + ggml_backend_cuda_comm_init_internal(ret); + } else if (env_str == "none") { + ggml_backend_cuda_comm_init_none(ret); + } else { + GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env); + ggml_backend_cuda_comm_init_none(ret); + } + } + + return ret; +} + +// Top-level dispatch -- calls the function pointer chosen by comm_init. +// Returns false to let the meta-backend's butterfly run. +static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) { + if (comm_ctx_v == nullptr) { + return false; + } + auto * comm_ctx = static_cast(comm_ctx_v); + return comm_ctx->try_allreduce(comm_ctx, tensors); } ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {