Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/intel.Dockerfile
#	.devops/nix/package.nix
#	.gitignore
#	docs/backend/SYCL.md
#	docs/ops.md
#	docs/ops/SYCL.csv
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cuda/fattn.cu
#	ggml/src/ggml-cuda/ggml-cuda.cu
#	ggml/src/ggml-sycl/CMakeLists.txt
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/convert.cpp
#	ggml/src/ggml-sycl/dequantize.hpp
#	ggml/src/ggml-sycl/fattn-common.hpp
#	ggml/src/ggml-sycl/getrows.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/im2col.cpp
#	ggml/src/ggml-sycl/im2col.hpp
#	ggml/src/ggml-sycl/mmvq.cpp
#	ggml/src/ggml-sycl/quants.hpp
#	ggml/src/ggml-sycl/vecdotq.hpp
#	ggml/src/ggml-virtgpu/ggml-backend-device.cpp
#	scripts/sync-ggml.last
#	scripts/sync_vendor.py
#	tests/test-backend-ops.cpp
Concedo 2026-05-11 16:18:28 +08:00
commit 2771e16fbc
31 changed files with 1487 additions and 106 deletions

View file

@ -547,6 +547,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
auto & chain = gsmpl->chain;
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
gsmpl->set_logits(ctx, idx);
// Check if a backend sampler has already sampled a token, in which case we
// return that token id directly.
{
@ -558,17 +560,17 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
// TODO: simplify
gsmpl->cur.resize(1);
gsmpl->cur[0] = { id, 0.0f, 1.0f };
cur_p = { gsmpl->cur.data(), gsmpl->cur.size(), 0, true };
for (size_t i = 0; i < cur_p.size; ++i) {
if (cur_p.data[i].id == id) {
cur_p.selected = i;
break;
}
}
return id;
}
}
gsmpl->set_logits(ctx, idx);
// apply reasoning budget first
llama_sampler_apply(rbudget, &cur_p);

View file

@ -1570,6 +1570,9 @@ class TextModel(ModelBase):
if chkhsh == "862f827721df956049dff5ca81a57f29e575280bc622e290d3bf4e35eca29015":
# ref: https://huggingface.co/codefuse-ai/F2LLM-v2-4B
res = "f2llmv2"
if chkhsh == "62f6fb0a6fd5098caeabb19b07a5c1099cafc8b9c40eab6ea89ece4ec02fbc57":
# ref: https://huggingface.co/sarvamai/sarvam-30b
res = "sarvam-moe"
if res is None:
logger.warning("\n")
@ -11591,6 +11594,34 @@ class BailingMoeV2Model(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
class SarvamMoEModel(BailingMoeV2Model):
    model_arch = gguf.MODEL_ARCH.BAILINGMOE2

    # Sarvam-MoE shares the BailingMoeV2 architecture; only differences:
    # - full rotary (no partial_rotary_factor)
    # - expert bias is zero-mean normalized at load time

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
        self.gguf_writer.add_rope_dimension_count(rope_dim)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        name, gen = item
        if name.endswith(".expert_bias"):
            # Sarvam normalizes expert bias to zero mean
            inner = gen

            def gen():
                t = inner()
                return t - t.mean()

        return super().filter_tensors((name, gen))
@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
class GroveMoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GROVEMOE

View file

@ -155,6 +155,7 @@ models = [
{"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
{"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
{"name": "f2llmv2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/codefuse-ai/F2LLM-v2-4B", },
{"name": "sarvam-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sarvamai/sarvam-30b", },
]
# some models are known to be broken upstream, so we will skip them as exceptions

View file

@ -0,0 +1,968 @@
#include "allreduce.cuh"
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#include "convert.cuh"
#include "ggml-impl.h"
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <limits>
// ---------------------------------------------------------------------------
// CUDA AllReduce for tensor-parallel inference across two GPUs.
//
// Provides an in-place sum reduction over matching tensors on two CUDA
// devices in the same process. Used by the tensor-split path alongside
// NCCL; targets setups without NVLink, where data is exchanged between the
// GPUs by staging it through pinned host memory over PCIe.
//
// Two reduction strategies are selected per call by tensor size:
//
// * Chunked kernel path (small reductions): a single CUDA kernel both
// stages data through pinned host memory and performs the local sum.
// Cross-GPU synchronization happens *inside the kernel* (busy-wait on
// a host-memory flag), which keeps launch overhead low for the
// latency-sensitive token-generation case.
//
// * Copy-engine path (large reductions): the transfer is split into
// D2H + H2D cudaMemcpyAsync chunks driven by the GPU's copy engine,
// followed by a small device-side add kernel. Cross-GPU
// synchronization happens *outside the kernel*, via CUDA events
// between streams. This keeps the compute engine free while large
// transfers are in flight, which matters for prefill-sized tensors.
// Reductions larger than the per-call inner cap are processed by an
// outer chunker that issues sequential inner calls.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Cross-GPU signal mechanism
//
// One int per (slot, rank) pair in pinned host memory. Each AR call writes a
// strictly increasing token (= the AR call number) into its own arrival int.
// The peer spins until its read of the other's arrival int equals the token
// it expects for this call -- a mismatch means the peer hasn't arrived yet.
// Tokens never repeat over realistic call rates (32-bit int wraps in tens of
// days at thousands of ARs/sec), so arrival ints don't need to be reset
// between calls; we initialize once at pipeline init and let the values
// accumulate.
//
// There is exactly one writer (the owning GPU) and one reader (the peer), so
// we don't need atomics. A volatile store paired with __threadfence_system()
// provides the release ordering that makes the D2H writes visible system-wide
// before the arrival token is observed.
//
// atomicAdd_system() requires hostNativeAtomicSupported, which is unavailable
// on PCIe-attached consumer GPUs without NVLink, so the volatile path is the
// portable choice.
// ---------------------------------------------------------------------------
static __device__ __forceinline__ void ggml_cuda_ar_signal_set(int * p, int token) {
*(volatile int *)p = token;
}
static __device__ __forceinline__ int ggml_cuda_ar_signal_get(const int * p) {
return *(const volatile int *)p;
}
// Byte spacing between adjacent arrival ints. 64 bytes (one cache line)
// ensures each GPU/block's arrival slot lives on its own line, preventing
// false-sharing stalls on the polling GPU.
static constexpr size_t GGML_CUDA_AR_ARRIVAL_STRIDE = 64;
// Number of blocks the chunked kernel launches with. Each block stripes a
// disjoint slice of the data and synchronizes through its own arrival-token
// slot so multiple SMs can pump PCIe stores in parallel.
static constexpr int GGML_CUDA_AR_KERNEL_BLOCKS = 8;
// ---------------------------------------------------------------------------
// Chunked kernel AllReduce -- 2 GPUs, supports float, half, and bfloat16.
//
// Both GPUs run this kernel simultaneously on independent streams. sendbuf
// and recvbuf live in T_dst (the caller's tensor type); host_mine / host_other
// carry data in T_wire (the on-wire type, possibly narrower than T_dst -- e.g.
// T_dst=F32 with T_wire=BF16 halves the bytes pushed across PCIe). When
// T_dst == T_wire the casts below are no-ops.
//
// Each GPU runs three phases:
//
// Phase 1 (all threads): cast sendbuf (T_dst) -> T_wire and store as
// single-instruction-width vectors into host_mine.
// __threadfence_system() commits these writes to host
// memory.
// Phase 2 (thread 0): write token to arrival_mine; spin until
// arrival_other == token.
// Phase 3 (all threads): read T_wire vectors from host_other, cast
// each element to T_dst, and sum with the local
// sendbuf value (also rounded through T_wire so that
// both GPUs truncate identically -- this guarantees
// bit-equivalent results across the two devices).
//
// Multi-block: blocks stripe vectors across (gridDim.x * blockDim.x) global
// threads to keep multiple SMs issuing PCIe stores in parallel. Each block
// has its own arrival-token slot (offset by blockIdx.x * ARRIVAL_STRIDE);
// thread 0 of each block signals/spins on that slot independently of other
// blocks. Tail elements (the leftover < ELEMS_PER_VEC at the end) are
// handled only by block 0 to avoid cross-block writes to the same slots.
// ---------------------------------------------------------------------------
template <typename T_dst, typename T_wire>
static __global__ void ggml_cuda_ar_kernel(
const T_dst * sendbuf,
T_dst * recvbuf,
T_wire * __restrict__ host_mine,
const T_wire * __restrict__ host_other,
int count,
int * arrival_mine,
int * arrival_other,
int token) {
// Vector unit for the wire type, sized to the arch's widest single-instruction
// copy (16 B on Volta+). Each phase-1 iter writes one vector to host memory;
// each phase-3 iter reads one and produces ELEMS_PER_VEC sums.
constexpr int ELEMS_PER_VEC = ggml_cuda_get_max_cpy_bytes() / sizeof(T_wire);
constexpr int ARRIVAL_INTS = (int)(GGML_CUDA_AR_ARRIVAL_STRIDE / sizeof(int));
const int tid = threadIdx.x;
const int nt = blockDim.x;
const int bid = blockIdx.x;
const int gtid = bid * nt + tid;
const int gnt = gridDim.x * nt;
const int count_vec = count / ELEMS_PER_VEC;
const int tail = count_vec * ELEMS_PER_VEC;
// Phase 1: cast sendbuf (T_dst) -> host_mine (T_wire) and store as vectors.
{
for (int i = gtid; i < count_vec; i += gnt) {
const int off = i * ELEMS_PER_VEC;
T_wire wire[ELEMS_PER_VEC];
#pragma unroll
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
wire[k] = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
}
ggml_cuda_memcpy_1<sizeof(wire)>(&host_mine[off], wire);
}
if (bid == 0 && tid < count - tail) {
host_mine[tail + tid] = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
}
}
// Commit this block's host writes before signalling.
__threadfence_system();
__syncthreads();
// Phase 2: thread 0 of each block signals on its own arrival slot, then
// spins for the matching slot from peer. Per-block tokens mean blocks
// proceed independently -- no inter-block barrier needed.
if (tid == 0) {
int * my_slot = arrival_mine + bid * ARRIVAL_INTS;
const int * other_slot = arrival_other + bid * ARRIVAL_INTS;
ggml_cuda_ar_signal_set(my_slot, token);
__threadfence_system(); // make our signal visible system-wide
while (ggml_cuda_ar_signal_get(other_slot) != token) {
#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
__nanosleep(100);
#else
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
}
}
__syncthreads();
// Acquire peer's host_other writes (this block's stripe of them).
__threadfence_system();
// Phase 3: read peer's T_wire vector, cast both sides through T_wire for
// bit-equivalence, sum in T_dst precision, and write back to recvbuf.
{
for (int i = gtid; i < count_vec; i += gnt) {
const int off = i * ELEMS_PER_VEC;
T_wire wire[ELEMS_PER_VEC];
ggml_cuda_memcpy_1<sizeof(wire)>(wire, &host_other[off]);
#pragma unroll
for (int k = 0; k < ELEMS_PER_VEC; ++k) {
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
}
}
if (bid == 0 && tid < count - tail) {
const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
recvbuf[tail + tid] =
ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
}
}
}
// Combined load-convert-add kernel. The peer's contribution arrives as T_src
// (which may be a lower-precision type than T_dst when the BF16 round-trip is
// active). For bit-equivalence between the two GPUs, dst is first rounded
// through T_src's precision via ggml_cuda_cast -- peer already truncated its
// own value the same way before sending -- so both sides perform identical
// arithmetic. When T_dst == T_src the round-trip cast is a no-op.
template <typename T_dst, typename T_src>
static __global__ void ggml_cuda_ar_add_kernel(
T_dst * __restrict__ dst,
const T_src * __restrict__ src,
int count) {
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int nt = gridDim.x * blockDim.x;
for (int i = tid; i < count; i += nt) {
const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
}
}
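The bit-equivalence argument used by both kernels (round each operand through the wire type, then add) can be checked in isolation. The sketch below is a standalone host-side example, not part of this diff; bf16_trunc() is a simplified stand-in for ggml_cuda_cast (it truncates instead of rounding to nearest), which is enough to show why both GPUs produce the same bits no matter which side is "local".
#include <cstdint>
#include <cstdio>
#include <cstring>
// Simplified stand-in for the FP32 -> BF16 -> FP32 round-trip: keep the top
// 16 bits of the FP32 encoding (truncation; the real cast rounds to nearest).
static float bf16_trunc(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits &= 0xFFFF0000u;
    std::memcpy(&x, &bits, sizeof(bits));
    return x;
}
int main() {
    const float a = 1.2345678f; // GPU 0's local value
    const float b = 2.7182818f; // GPU 1's local value
    // Each GPU rounds its own value and the peer's wire value through BF16
    // before adding, so the sum is symmetric and bit-identical on both sides.
    const float sum_gpu0 = bf16_trunc(a) + bf16_trunc(b);
    const float sum_gpu1 = bf16_trunc(b) + bf16_trunc(a);
    std::printf("bit-identical: %s\n",
                std::memcmp(&sum_gpu0, &sum_gpu1, sizeof(float)) == 0 ? "yes" : "no");
    return 0;
}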
// ---------------------------------------------------------------------------
// Pipeline structure
// ---------------------------------------------------------------------------
// Number of slots in the event / arrival ring. Two slots is sufficient:
// lockstep guarantees the two GPUs are at most one AR (or chunk) apart, so
// slot[N%2] is always safe to reuse -- peer has already consumed slot[N%2]
// from AR N-2 by the time we get to AR N. acquire_slot's
// cudaEventSynchronize on ev.ker for both devices makes that consumption
// explicit before we overwrite host_buf[slot] for the new AR.
static constexpr int GGML_CUDA_AR_POOL_SIZE = 2;
// Maximum chunk size (bytes per GPU) handled by one chunked kernel launch.
// Larger tensors are reduced by issuing multiple chunked launches.
static constexpr size_t GGML_CUDA_AR_MAX_BYTES = 1024 * 1024; // 1 MB
// Copy-engine path: largest tensor accepted on this path; sets host_large /
// dev_tmp allocation size.
static constexpr size_t GGML_CUDA_AR_COPY_MAX_BYTES = 32 * 1024 * 1024; // 32 MB
// AR wire size at which the copy-engine path takes over from the chunked-
// kernel path. Override via GGML_CUDA_AR_COPY_THRESHOLD.
static constexpr size_t GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT = 1024 * 1024; // 1 MB
// Per-call CE chunk-size heuristic: chunk_bytes = clamp(nbytes / 4, MIN, MAX).
// The /4 keeps ~4 chunks in flight at any moment (good D2H/H2D overlap with
// the peer); the clamps cover the cases where nbytes/4 is too small (per-
// memcpy fixed cost dominates) or too large (chunk-level pipelining stalls).
// Env var GGML_CUDA_AR_COPY_CHUNK_BYTES can override with a fixed value.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN = 512 * 1024; // 512 KB
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX = 2 * 1024 * 1024; // 2 MB
// Absolute floor that an env-var override is allowed to set; this caps the
// per-slot copy-event array. 256 KB -> up to 128 chunks per 32 MB tensor.
static constexpr size_t GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN = 256 * 1024;
static constexpr int GGML_CUDA_AR_COPY_MAX_CHUNKS =
static_cast<int>((GGML_CUDA_AR_COPY_MAX_BYTES + GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN - 1) /
GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
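For concreteness, the sizing these constants imply can be worked out directly. The snippet below is a standalone check, not part of the diff; it reproduces the "up to 128 chunks per 32 MB tensor" figure and the arrival-ring allocation size computed later in pipeline init.
#include <cstddef>
#include <cstdio>
int main() {
    constexpr size_t COPY_MAX_BYTES  = 32 * 1024 * 1024; // GGML_CUDA_AR_COPY_MAX_BYTES
    constexpr size_t CHUNK_BYTES_MIN = 256 * 1024;       // GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN
    constexpr size_t POOL_SIZE       = 2;                // GGML_CUDA_AR_POOL_SIZE
    constexpr size_t N_DEVICES       = 2;
    constexpr size_t KERNEL_BLOCKS   = 8;                // GGML_CUDA_AR_KERNEL_BLOCKS
    constexpr size_t ARRIVAL_STRIDE  = 64;               // GGML_CUDA_AR_ARRIVAL_STRIDE
    const size_t max_chunks    = (COPY_MAX_BYTES + CHUNK_BYTES_MIN - 1) / CHUNK_BYTES_MIN;
    const size_t arrival_bytes = POOL_SIZE * N_DEVICES * KERNEL_BLOCKS * ARRIVAL_STRIDE;
    std::printf("copy-event slots per pool entry: %zu\n", max_chunks);  // 128
    std::printf("arrival ring size: %zu bytes\n", arrival_bytes);       // 2048
    return 0;
}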
struct ggml_cuda_ar_event_slot {
cudaEvent_t app = nullptr; // upstream computation complete
cudaEvent_t cpy[GGML_CUDA_AR_COPY_MAX_CHUNKS] = {}; // copy-engine D2H chunks complete
cudaEvent_t h2d = nullptr; // copy-engine H2Ds complete (handoff AR stream -> compute stream)
cudaEvent_t ker = nullptr; // AllReduce kernel complete
};
// Mapped pinned host allocation: cudaHostAlloc + cudaHostGetDevicePointer
// in one place, with the host handle preserved for cudaFreeHost. Used where
// the CPU never touches the buffer -- only the device reads/writes via the
// mapped device pointer. Required on systems where cudaDevAttrCanUseHost-
// PointerForRegisteredMem is 0 and the host pointer can't be used as a
// device pointer.
struct ggml_cuda_ar_host_mapping {
uint8_t * host = nullptr; // cudaFreeHost handle; also the H-side ptr for cudaMemcpyAsync
uint8_t * dev = nullptr; // device-side pointer for kernels / cudaMemset
cudaError_t alloc(size_t bytes) {
cudaError_t rc = cudaHostAlloc(reinterpret_cast<void **>(&host), bytes,
cudaHostAllocPortable | cudaHostAllocMapped);
if (rc != cudaSuccess) {
host = nullptr;
return rc;
}
rc = cudaHostGetDevicePointer(reinterpret_cast<void **>(&dev), host, 0);
if (rc != cudaSuccess) {
cudaFreeHost(host);
host = nullptr;
dev = nullptr;
}
return rc;
}
void free() {
if (host) {
cudaFreeHost(host);
host = nullptr;
dev = nullptr;
}
}
};
struct ggml_cuda_ar_pipeline {
int n_devices;
int devices[GGML_CUDA_MAX_DEVICES];
size_t buf_bytes; // bytes per device in host_buf[]
size_t copy_bytes; // bytes per device in host_large[] / dev_tmp[]
size_t copy_threshold;
size_t copy_chunk_bytes;
size_t bf16_threshold; // tensors >= this size (bytes) are reduced via FP32->BF16 round-trip; 0 disables
uint64_t call_count;
// Per-device resources.
ggml_cuda_ar_host_mapping host_buf[GGML_CUDA_MAX_DEVICES]; // pinned staging (chunked kernel)
ggml_cuda_ar_host_mapping host_large[GGML_CUDA_MAX_DEVICES]; // pinned staging (copy-engine)
char * dev_tmp[GGML_CUDA_MAX_DEVICES]; // device scratch for copy-engine path
cudaStream_t streams[GGML_CUDA_MAX_DEVICES]; // non-blocking
ggml_cuda_ar_event_slot ev_pool[GGML_CUDA_MAX_DEVICES][GGML_CUDA_AR_POOL_SIZE];
// Copy-engine: per-device "I finished reading my peer's host_large"
// event. Indexed by RECORDER device. Recorded same-device on streams[i]
// after stage 2's last H2D from host_large[peer]. Waited cross-device
// by peer's stage-1 stream before the next AR overwrites host_large[peer].
cudaEvent_t host_large_read_done[GGML_CUDA_MAX_DEVICES];
bool host_large_read_done_valid;
// Copy-engine: per-device "my add_kernel is done with dev_tmp" event.
// Recorded on the compute stream after each add_kernel; the AR stream
// waits on it before the next copy_impl's H2D overwrites dev_tmp. Lets us
// single-buffer dev_tmp despite add_kernel running on a separate stream.
cudaEvent_t dev_tmp_kernel_done[GGML_CUDA_MAX_DEVICES];
bool dev_tmp_kernel_done_valid;
// Arrival ring: ARRIVAL_STRIDE bytes between adjacent ints. Mapped pinned
// memory; CPU never reads/writes -- only the kernel and cudaMemset.
// Use ggml_cuda_ar_arrival_ptr() to index.
ggml_cuda_ar_host_mapping arrival;
};
// Base pointer for the (slot, rank) per-block token block. The kernel adds
// blockIdx.x * (ARRIVAL_STRIDE/sizeof(int)) internally to land on its own slot.
static int * ggml_cuda_ar_arrival_ptr(const ggml_cuda_ar_pipeline * p, int slot, int rank) {
const size_t offset = ((size_t)slot * p->n_devices + rank) *
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
return reinterpret_cast<int *>(p->arrival.dev + offset);
}
static uint64_t ggml_cuda_ar_env_u64(const char * name, uint64_t default_value) {
const char * value = getenv(name);
if (value == nullptr || value[0] == '\0') {
return default_value;
}
char * end = nullptr;
const unsigned long long parsed = strtoull(value, &end, 10);
return end != value ? (uint64_t) parsed : default_value;
}
struct ggml_cuda_ar_slot_info {
int slot;
int token;
};
static ggml_cuda_ar_slot_info ggml_cuda_ar_acquire_slot(ggml_cuda_ar_pipeline * p) {
const int slot = static_cast<int>(p->call_count % GGML_CUDA_AR_POOL_SIZE);
const bool pool_lapped = p->call_count >= GGML_CUDA_AR_POOL_SIZE;
p->call_count++;
if (pool_lapped) {
for (int i = 0; i < p->n_devices; ++i) {
ggml_cuda_set_device(p->devices[i]);
CUDA_CHECK(cudaEventSynchronize(p->ev_pool[i][slot].ker));
}
}
return { slot, (int) p->call_count };
}
// Per-AR copy-engine chunk size: env-var override if set, else heuristic
// (clamp(nbytes/4, HEURISTIC_MIN, HEURISTIC_MAX)).
static size_t ggml_cuda_ar_chunk_bytes(const ggml_cuda_ar_pipeline * p, size_t nbytes) {
if (p->copy_chunk_bytes > 0) {
return p->copy_chunk_bytes;
}
return std::min(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX,
std::max(GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN, nbytes / 4));
}
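A few concrete inputs make the heuristic's behaviour easier to see. This is a standalone sketch, not part of the diff; it mirrors the clamp above using the HEURISTIC_MIN/MAX constants defined earlier in the file.
#include <algorithm>
#include <cstddef>
#include <cstdio>
int main() {
    constexpr size_t MIN = 512 * 1024;      // GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MIN
    constexpr size_t MAX = 2 * 1024 * 1024; // GGML_CUDA_AR_COPY_CHUNK_BYTES_HEURISTIC_MAX
    const size_t sizes[] = { 1u << 20, 6u << 20, 32u << 20 }; // 1 MiB, 6 MiB, 32 MiB
    for (size_t nbytes : sizes) {
        const size_t chunk = std::min(MAX, std::max(MIN, nbytes / 4));
        std::printf("nbytes = %2zu MiB -> chunk = %4zu KiB -> %2zu chunks\n",
                    nbytes >> 20, chunk >> 10, (nbytes + chunk - 1) / chunk);
    }
    return 0;
}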
static void ggml_cuda_ar_wait_for_compute(
ggml_cuda_ar_pipeline * p, ggml_backend_cuda_context * cuda_ctx, int rank, int slot) {
ggml_cuda_ar_event_slot & ev = p->ev_pool[rank][slot];
CUDA_CHECK(cudaEventRecord(ev.app, cuda_ctx->stream()));
CUDA_CHECK(cudaStreamWaitEvent(p->streams[rank], ev.app));
}
// ---------------------------------------------------------------------------
// Init / free
// ---------------------------------------------------------------------------
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int * devices, size_t n_devices) {
if (n_devices != 2) {
GGML_LOG_DEBUG("%s: internal AllReduce only supports n_devices=2 (got %zu); "
"falling back\n", __func__, n_devices);
return nullptr;
}
// The chunked kernel uses __nanosleep, which is sm70+ (Volta+).
for (size_t i = 0; i < n_devices; ++i) {
const int cc = ggml_cuda_info().devices[devices[i]].cc;
if (cc < GGML_CUDA_CC_VOLTA) {
GGML_LOG_DEBUG("%s: internal AllReduce requires compute capability >= %d "
"(device %d has cc=%d); falling back\n",
__func__, GGML_CUDA_CC_VOLTA, devices[i], cc);
return nullptr;
}
}
auto * p = new ggml_cuda_ar_pipeline{};
p->n_devices = n_devices;
p->copy_bytes = GGML_CUDA_AR_COPY_MAX_BYTES;
p->copy_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_THRESHOLD", GGML_CUDA_AR_COPY_THRESHOLD_DEFAULT);
// 0 = use the per-call heuristic (default). Non-zero env value forces a
// fixed chunk size for diagnostics, with a floor at COPY_CHUNK_BYTES_MIN.
p->copy_chunk_bytes = ggml_cuda_ar_env_u64("GGML_CUDA_AR_COPY_CHUNK_BYTES", 0);
if (p->copy_chunk_bytes > 0 && p->copy_chunk_bytes < GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN) {
GGML_LOG_WARN("%s: GGML_CUDA_AR_COPY_CHUNK_BYTES=%zu below minimum %zu; clamping\n",
__func__, p->copy_chunk_bytes, GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN);
p->copy_chunk_bytes = GGML_CUDA_AR_COPY_CHUNK_BYTES_MIN;
}
// Default 1: BF16 round-trip is always on for F32 inputs (any non-zero
// ne). Set GGML_CUDA_AR_BF16_THRESHOLD=0 to disable, or to a larger
// byte threshold to opt out for small tensors.
p->bf16_threshold = ggml_cuda_ar_env_u64("GGML_CUDA_AR_BF16_THRESHOLD", 1);
for (size_t i = 0; i < n_devices; ++i) {
p->devices[i] = devices[i];
}
// Per-device streams and event pools.
for (size_t i = 0; i < n_devices; ++i) {
ggml_cuda_set_device(p->devices[i]);
cudaStream_t stream = nullptr;
if (cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaStreamCreateWithFlags failed for device %d\n",
__func__, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
p->streams[i] = stream;
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
bool ok =
cudaEventCreateWithFlags(&p->ev_pool[i][s].app, cudaEventDisableTiming) == cudaSuccess &&
cudaEventCreateWithFlags(&p->ev_pool[i][s].h2d, cudaEventDisableTiming) == cudaSuccess &&
cudaEventCreateWithFlags(&p->ev_pool[i][s].ker, cudaEventDisableTiming) == cudaSuccess;
for (int c = 0; ok && c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
ok = cudaEventCreateWithFlags(&p->ev_pool[i][s].cpy[c], cudaEventDisableTiming) == cudaSuccess;
}
if (!ok) {
GGML_LOG_ERROR("%s: cudaEventCreate failed for device %d slot %d\n",
__func__, p->devices[i], s);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
if (cudaEventCreateWithFlags(&p->host_large_read_done[i], cudaEventDisableTiming) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaEventCreate for host_large_read_done failed for device %d\n",
__func__, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
if (cudaEventCreateWithFlags(&p->dev_tmp_kernel_done[i], cudaEventDisableTiming) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaEventCreate for dev_tmp_kernel_done failed for device %d\n",
__func__, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
// Arrival ring: cache-line padded so each GPU's int is on its own line.
const size_t arrival_bytes =
(size_t)GGML_CUDA_AR_POOL_SIZE * n_devices *
GGML_CUDA_AR_KERNEL_BLOCKS * GGML_CUDA_AR_ARRIVAL_STRIDE;
if (p->arrival.alloc(arrival_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: alloc for arrival ring failed (%zu bytes)\n",
__func__, arrival_bytes);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
ggml_cuda_set_device(p->devices[0]);
if (cudaMemset(p->arrival.dev, 0, arrival_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaMemset for arrival ring failed (%zu bytes)\n",
__func__, arrival_bytes);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
// Per-device pinned staging buffers -- POOL_SIZE-deep ring so the chunked-
// kernel can write the next slot's data while the peer is still reading
// the previous slot's. Indexed by (slot * buf_bytes) at the call site.
p->buf_bytes = GGML_CUDA_AR_MAX_BYTES;
const size_t host_buf_total = (size_t) GGML_CUDA_AR_POOL_SIZE * p->buf_bytes;
for (size_t i = 0; i < n_devices; ++i) {
if (p->host_buf[i].alloc(host_buf_total) != cudaSuccess) {
GGML_LOG_ERROR("%s: alloc for staging failed (%zu bytes)\n",
__func__, host_buf_total);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
// Copy-engine path: pinned host staging + device scratch, sized for the
// largest tensor we accept on this path (GGML_CUDA_AR_COPY_MAX_BYTES).
// dev_tmp is single-buffered; cross-AR safety is enforced by an explicit
// cross-stream wait in copy_impl on the prior AR's add_kernel-done event.
for (size_t i = 0; i < n_devices; ++i) {
ggml_cuda_set_device(p->devices[i]);
if (p->host_large[i].alloc(p->copy_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: alloc for large staging failed (%zu bytes)\n",
__func__, p->copy_bytes);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
if (cudaMalloc(reinterpret_cast<void **>(&p->dev_tmp[i]), p->copy_bytes) != cudaSuccess) {
GGML_LOG_ERROR("%s: cudaMalloc for copy scratch failed (%zu bytes) on device %d\n",
__func__, p->copy_bytes, p->devices[i]);
ggml_cuda_ar_pipeline_free(p);
return nullptr;
}
}
GGML_LOG_INFO("%s: initialized AllReduce pipeline: %zu GPUs, "
"%zu KB chunked kernel staging + %zu MB copy-engine staging per GPU\n",
__func__, n_devices, p->buf_bytes >> 10, p->copy_bytes >> 20);
return p;
}
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * p) {
if (!p) {
return;
}
// Drain all in-flight kernels before tearing down resources.
for (int i = 0; i < p->n_devices; ++i) {
if (p->streams[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaStreamSynchronize(p->streams[i]);
}
}
for (int i = 0; i < p->n_devices; ++i) {
p->host_buf[i].free();
p->host_large[i].free();
if (p->dev_tmp[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaFree(p->dev_tmp[i]);
}
ggml_cuda_set_device(p->devices[i]);
for (int s = 0; s < GGML_CUDA_AR_POOL_SIZE; ++s) {
if (p->ev_pool[i][s].app) { cudaEventDestroy(p->ev_pool[i][s].app); }
for (int c = 0; c < GGML_CUDA_AR_COPY_MAX_CHUNKS; ++c) {
if (p->ev_pool[i][s].cpy[c]) { cudaEventDestroy(p->ev_pool[i][s].cpy[c]); }
}
if (p->ev_pool[i][s].h2d) { cudaEventDestroy(p->ev_pool[i][s].h2d); }
if (p->ev_pool[i][s].ker) { cudaEventDestroy(p->ev_pool[i][s].ker); }
}
if (p->host_large_read_done[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaEventDestroy(p->host_large_read_done[i]);
}
if (p->dev_tmp_kernel_done[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaEventDestroy(p->dev_tmp_kernel_done[i]);
}
if (p->streams[i]) {
ggml_cuda_set_device(p->devices[i]);
cudaStreamDestroy(p->streams[i]);
}
}
p->arrival.free();
delete p;
}
// ---------------------------------------------------------------------------
// Dispatch
// ---------------------------------------------------------------------------
// Asymmetric copy_impl: data sent over PCIe in T_src precision (one element of
// nbytes per ne element); accumulated locally into a T_dst buffer. When
// T_src == T_dst this is the original homogeneous reduction. When they differ
// (e.g. BF16 wire / F32 accumulator) the add kernel rounds dst through T_src
// for bit-equivalence between GPUs and we skip the otherwise-needed
// post-conversion entirely.
template <typename T_src, typename T_dst>
static bool ggml_cuda_ar_allreduce_copy_impl(
ggml_cuda_ar_pipeline * p,
ggml_backend_t * backends,
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
const bool compute[GGML_CUDA_MAX_DEVICES],
int64_t ne,
size_t nbytes) {
GGML_ASSERT(p->n_devices == 2);
GGML_ASSERT(nbytes <= p->copy_bytes);
GGML_ASSERT(ne <= std::numeric_limits<int>::max());
const size_t chunk_bytes = ggml_cuda_ar_chunk_bytes(p, nbytes);
GGML_ASSERT(chunk_bytes > 0);
const int slot = ggml_cuda_ar_acquire_slot(p).slot;
const size_t copy_chunks = (nbytes + chunk_bytes - 1) / chunk_bytes;
GGML_ASSERT(copy_chunks <= GGML_CUDA_AR_COPY_MAX_CHUNKS);
ggml_backend_cuda_context * cuda_ctx[2] = {};
// Stage 1: both GPUs copy their local contribution to pinned host memory.
for (int i = 0; i < 2; ++i) {
ggml_cuda_set_device(p->devices[i]);
cuda_ctx[i] = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx[i]->device == p->devices[i]);
ggml_cuda_ar_wait_for_compute(p, cuda_ctx[i], i, slot);
// Wait for peer's H2D from our host_large[i] (recorded in the
// previous AR's stage 2) to complete before we overwrite host_large[i].
// host_large_read_done[peer] = peer finished reading host_large[i].
// No-op on the first AR -- no prior record exists.
if (p->host_large_read_done_valid) {
const int peer = 1 - i;
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->host_large_read_done[peer]));
}
if (!compute[i]) {
CUDA_CHECK(cudaMemsetAsync(src_buf[i], 0, nbytes, p->streams[i]));
}
for (size_t c = 0; c < copy_chunks; ++c) {
const size_t offset = c * chunk_bytes;
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
(nbytes - offset) : chunk_bytes;
CUDA_CHECK(cudaMemcpyAsync(
p->host_large[i].host + offset, reinterpret_cast<char *>(src_buf[i]) + offset, this_bytes,
cudaMemcpyDeviceToHost, p->streams[i]));
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].cpy[c], p->streams[i]));
}
}
// Stage 2: each GPU waits for each peer D2H chunk, pulls that chunk back to
// local device scratch (dev_tmp), then performs one device-local add over
// the assembled peer tensor. The H2Ds run on the AR stream (copy engine)
// and the add_kernel runs on the caller's compute stream, so the AR stream
// stays pure-copy and avoids an in-stream copy->compute engine switch every
// AR. dev_tmp is single-buffered: the AR stream waits cross-stream on the
// prior AR's add_kernel-done event before overwriting it.
for (int i = 0; i < 2; ++i) {
const int peer = 1 - i;
ggml_cuda_set_device(p->devices[i]);
// Wait for the previous AR's add_kernel (on the compute stream) to
// finish reading dev_tmp before our H2D overwrites it. No-op on the
// first copy_impl call.
if (p->dev_tmp_kernel_done_valid) {
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->dev_tmp_kernel_done[i]));
}
for (size_t c = 0; c < copy_chunks; ++c) {
const size_t offset = c * chunk_bytes;
const size_t this_bytes = (nbytes - offset) < chunk_bytes ?
(nbytes - offset) : chunk_bytes;
CUDA_CHECK(cudaStreamWaitEvent(p->streams[i], p->ev_pool[peer][slot].cpy[c]));
CUDA_CHECK(cudaMemcpyAsync(
p->dev_tmp[i] + offset, p->host_large[peer].host + offset, this_bytes,
cudaMemcpyHostToDevice, p->streams[i]));
}
// Mark our reads of host_large[peer] complete so peer's next AR can
// safely overwrite it.
CUDA_CHECK(cudaEventRecord(p->host_large_read_done[i], p->streams[i]));
// Hand off from AR stream (copy engine) to compute stream: compute
// stream waits for all H2Ds to finish, then runs the add_kernel.
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].h2d, p->streams[i]));
CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx[i]->stream(), p->ev_pool[i][slot].h2d));
const int block_size = 256;
int n_blocks = (int) ((ne + block_size - 1) / block_size);
if (n_blocks > 1024) {
n_blocks = 1024;
}
ggml_cuda_ar_add_kernel<T_dst, T_src><<<n_blocks, block_size, 0, cuda_ctx[i]->stream()>>>(
dst_buf[i],
reinterpret_cast<const T_src *>(p->dev_tmp[i]),
(int) ne);
CUDA_CHECK(cudaGetLastError());
// Record dev_tmp-released on the compute stream so the next copy_impl
// can wait for the kernel to finish before overwriting dev_tmp. Also
// record AR-done as ev.ker for acquire_slot's pool-wraparound sync.
CUDA_CHECK(cudaEventRecord(p->dev_tmp_kernel_done[i], cuda_ctx[i]->stream()));
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, cuda_ctx[i]->stream()));
}
p->host_large_read_done_valid = true;
p->dev_tmp_kernel_done_valid = true;
return true;
}
// Outer-level chunker: copy_impl handles up to copy_bytes per call (limited by
// the host_large / dev_tmp allocation size). When the full AR exceeds that,
// slice the tensor into copy_bytes-sized pieces and call copy_impl repeatedly.
// Each slice goes through its own stage 1 -> stage 2 cycle and acquires its own
// slot, so cross-AR fences and pool wraparound work the same way as for any
// other sequence of small ARs.
template <typename T_src, typename T_dst>
static bool ggml_cuda_ar_allreduce_copy_outer(
ggml_cuda_ar_pipeline * p,
ggml_backend_t * backends,
T_src * const src_buf[GGML_CUDA_MAX_DEVICES],
T_dst * const dst_buf[GGML_CUDA_MAX_DEVICES],
const bool compute[GGML_CUDA_MAX_DEVICES],
int64_t ne) {
const int64_t outer_max_elems = (int64_t) (p->copy_bytes / sizeof(T_src));
GGML_ASSERT(outer_max_elems > 0);
bool ok = true;
for (int64_t outer_start = 0; outer_start < ne && ok; outer_start += outer_max_elems) {
const int64_t outer_ne = std::min(outer_max_elems, ne - outer_start);
const size_t outer_nbytes = (size_t) outer_ne * sizeof(T_src);
T_src * src[GGML_CUDA_MAX_DEVICES] = {};
T_dst * dst[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < p->n_devices; ++i) {
src[i] = src_buf[i] + outer_start;
dst[i] = dst_buf[i] + outer_start;
}
ok = ggml_cuda_ar_allreduce_copy_impl<T_src, T_dst>(
p, backends, src, dst, compute, outer_ne, outer_nbytes);
}
return ok;
}
bool ggml_cuda_ar_allreduce(
ggml_cuda_ar_pipeline * p,
ggml_backend_t * backends,
ggml_tensor ** tensors) {
GGML_ASSERT(p != nullptr);
const int n = p->n_devices;
GGML_ASSERT(n == 2);
const ggml_type input_type = tensors[0]->type;
GGML_ASSERT(input_type == GGML_TYPE_F32 || input_type == GGML_TYPE_F16 || input_type == GGML_TYPE_BF16);
const int64_t ne = ggml_nelements(tensors[0]);
GGML_ASSERT(ne > 0);
const size_t input_nbytes = ggml_nbytes(tensors[0]);
// BF16 round-trip: F32 inputs >= bf16_threshold are converted to BF16 for
// the reduction (chunked or copy-engine), halving on-wire bytes. Matches
// NCCL's behaviour. The pre-conversion zeroes inactive shards so the
// inner paths see them as already-prepared compute tensors.
const bool use_bf16 =
input_type == GGML_TYPE_F32 &&
p->bf16_threshold > 0 &&
input_nbytes >= p->bf16_threshold;
const ggml_type kernel_type = use_bf16 ? GGML_TYPE_BF16 : input_type;
const size_t type_size = ggml_type_size(kernel_type);
GGML_ASSERT(p->buf_bytes >= type_size);
const size_t nbytes = (size_t) ne * type_size;
bool compute_flag[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
compute_flag[i] = (tensors[i]->flags & GGML_TENSOR_FLAG_COMPUTE) != 0;
}
// Decide between copy-engine and chunked kernel paths based on the working
// type's actual byte count. No upper bound: copy_outer slices reductions
// larger than copy_bytes into copy_bytes-sized pieces.
const bool use_copy_engine =
p->copy_threshold > 0 &&
nbytes >= p->copy_threshold;
// BF16 inactive-shard zeroing: when use_bf16 is on, the combined kernel
// (chunked kernel path) and the combined add kernel (copy_engine path)
// both accumulate into the F32 tensor data directly, so an inactive
// shard's accumulator must start at zero.
if (use_bf16) {
for (int i = 0; i < n; ++i) {
if (!compute_flag[i]) {
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
ggml_cuda_set_device(p->devices[i]);
CUDA_CHECK(cudaMemsetAsync(tensors[i]->data, 0, (size_t) ne * sizeof(float), cuda_ctx->stream()));
}
}
}
// Pre-convert F32 -> BF16 into bf16_tmp ONLY for the copy_engine + use_bf16
// path; the chunked kernel path's combined kernel does the conversion
// inline as it writes to host_buf.
ggml_cuda_pool_alloc<nv_bfloat16> bf16_tmp[GGML_CUDA_MAX_DEVICES];
void * copy_src_ptr[GGML_CUDA_MAX_DEVICES] = {};
if (use_copy_engine && use_bf16) {
to_bf16_cuda_t to_bf16 = ggml_get_to_bf16_cuda(GGML_TYPE_F32);
for (int i = 0; i < n; ++i) {
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
bf16_tmp[i].pool = &cuda_ctx->pool();
bf16_tmp[i].alloc(ne);
ggml_cuda_set_device(p->devices[i]);
if (compute_flag[i]) {
to_bf16(tensors[i]->data, bf16_tmp[i].get(), ne, cuda_ctx->stream());
CUDA_CHECK(cudaGetLastError());
} else {
CUDA_CHECK(cudaMemsetAsync(bf16_tmp[i].get(), 0, nbytes, cuda_ctx->stream()));
}
copy_src_ptr[i] = bf16_tmp[i].get();
}
}
bool ok = true;
if (use_copy_engine) {
// After up-front BF16 conversion, the tmp buffers already hold the
// (possibly zeroed-for-inactive) data, so the inner path can treat
// every shard as compute.
bool inner_compute[GGML_CUDA_MAX_DEVICES];
for (int i = 0; i < n; ++i) {
inner_compute[i] = use_bf16 ? true : compute_flag[i];
}
// Dispatch into copy_impl with explicit src/dst types. When use_bf16
// is on, the wire type is BF16 (src = bf16_tmp) and the accumulator
// is F32 (dst = tensors[i]->data); the combined add kernel rounds dst
// through BF16 for bit-equivalence and writes F32 directly, so no
// post-conversion is needed. Otherwise src == dst (same native type).
if (use_bf16) {
GGML_ASSERT(kernel_type == GGML_TYPE_BF16);
nv_bfloat16 * src[GGML_CUDA_MAX_DEVICES] = {};
float * dst[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
src[i] = static_cast<nv_bfloat16 *>(copy_src_ptr[i]);
dst[i] = static_cast<float *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, float>(
p, backends, src, dst, inner_compute, ne);
} else {
switch (kernel_type) {
case GGML_TYPE_F32: {
float * buf[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
buf[i] = static_cast<float *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<float, float>(
p, backends, buf, buf, inner_compute, ne);
break;
}
case GGML_TYPE_BF16: {
nv_bfloat16 * buf[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
buf[i] = static_cast<nv_bfloat16 *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<nv_bfloat16, nv_bfloat16>(
p, backends, buf, buf, inner_compute, ne);
break;
}
case GGML_TYPE_F16: {
half * buf[GGML_CUDA_MAX_DEVICES] = {};
for (int i = 0; i < n; ++i) {
buf[i] = static_cast<half *>(tensors[i]->data);
}
ok = ggml_cuda_ar_allreduce_copy_outer<half, half>(
p, backends, buf, buf, inner_compute, ne);
break;
}
default:
GGML_ASSERT(false);
}
}
} else {
// host_buf carries T_wire-typed data; max_chunk_elems is the count that
// fits in one host_buf at the wire size.
const size_t max_chunk_elems = p->buf_bytes / type_size;
const size_t input_type_size = ggml_type_size(input_type);
// Chunked kernel path runs entirely on the caller's compute stream:
// since AR is a barrier here, same-stream ordering subsumes any
// cross-stream event handshake that the copy-engine path needs, and
// skips the cross-stream scheduling overhead that was hurting the
// small-tensor (tg) latency on the AR-stream variant. Only ev.ker is
// still recorded at end-of-AR for acquire_slot's pool-wraparound check.
for (int64_t chunk_start = 0; chunk_start < ne; chunk_start += (int64_t) max_chunk_elems) {
const size_t remaining_elems = (size_t) (ne - chunk_start);
const size_t chunk_elems = remaining_elems < max_chunk_elems ? remaining_elems : max_chunk_elems;
const size_t chunk_dst_bytes = chunk_elems * input_type_size;
const auto [slot, token] = ggml_cuda_ar_acquire_slot(p);
const bool last_chunk = chunk_start + (int64_t) chunk_elems == ne;
for (int i = 0; i < n; ++i) {
const int peer = 1 - i; // valid for n == 2 only
ggml_cuda_set_device(p->devices[i]);
auto * cuda_ctx = static_cast<ggml_backend_cuda_context *>(backends[i]->context);
GGML_ASSERT(cuda_ctx->device == p->devices[i]);
cudaStream_t stream = cuda_ctx->stream();
char * data = static_cast<char *>(tensors[i]->data) + chunk_start * (int64_t) input_type_size;
// Match NCCL/meta-backend semantics: inactive shards contribute
// zeros. On the BF16 path the F32 tensor data was already
// zeroed up-front (above), so per-chunk zeroing isn't needed.
if (!compute_flag[i] && !use_bf16) {
CUDA_CHECK(cudaMemsetAsync(data, 0, chunk_dst_bytes, stream));
}
#define LAUNCH_AR_KERNEL(T_dst, T_wire) \
ggml_cuda_ar_kernel<T_dst, T_wire><<<dim3(GGML_CUDA_AR_KERNEL_BLOCKS), dim3(256), 0, stream>>>( \
reinterpret_cast<const T_dst *>(data), \
reinterpret_cast<T_dst *>(data), \
reinterpret_cast<T_wire *>(p->host_buf[i].dev + (size_t) slot * p->buf_bytes), \
reinterpret_cast<const T_wire *>(p->host_buf[peer].dev + (size_t) slot * p->buf_bytes), \
static_cast<int>(chunk_elems), \
ggml_cuda_ar_arrival_ptr(p, slot, i), \
ggml_cuda_ar_arrival_ptr(p, slot, peer), \
token)
if (use_bf16) {
GGML_ASSERT(input_type == GGML_TYPE_F32);
LAUNCH_AR_KERNEL(float, nv_bfloat16);
} else {
switch (input_type) {
case GGML_TYPE_F32: LAUNCH_AR_KERNEL(float, float); break;
case GGML_TYPE_F16: LAUNCH_AR_KERNEL(half, half); break;
case GGML_TYPE_BF16: LAUNCH_AR_KERNEL(nv_bfloat16, nv_bfloat16); break;
default: GGML_ASSERT(false);
}
}
#undef LAUNCH_AR_KERNEL
CUDA_CHECK(cudaGetLastError());
if (last_chunk) {
CUDA_CHECK(cudaEventRecord(p->ev_pool[i][slot].ker, stream));
}
}
}
}
return ok;
}
#else // defined(GGML_USE_HIP) || defined(GGML_USE_MUSA)
// HIP and MUSA lack the host-mapped pinned-memory APIs (cudaHostAllocPortable
// / cudaHostAllocMapped / cudaHostGetDevicePointer) and __nanosleep that this
// implementation relies on, so the internal AllReduce is a CUDA-only feature.
// The dispatcher in ggml-cuda.cu treats a nullptr pipeline as "init failed"
// and silently falls back to the meta backend's generic AllReduce.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(const int *, size_t) {
return nullptr;
}
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline *) {
}
bool ggml_cuda_ar_allreduce(ggml_cuda_ar_pipeline *, ggml_backend_t *, ggml_tensor **) {
return false;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)

View file

@ -0,0 +1,29 @@
#pragma once
#include "common.cuh"
#include "ggml-backend-impl.h"
#include <cstddef>
// Opaque pipeline context -- owns all pinned buffers, streams, and events.
struct ggml_cuda_ar_pipeline;
// Allocate a pipeline for n_devices GPUs.
// devices[] holds the CUDA device IDs in rank order.
// Returns nullptr on allocation failure.
ggml_cuda_ar_pipeline * ggml_cuda_ar_pipeline_init(
const int * devices, size_t n_devices);
// Release all resources owned by the pipeline.
void ggml_cuda_ar_pipeline_free(ggml_cuda_ar_pipeline * pipeline);
// Execute an in-place AllReduce (sum) across tensors[0..n_devices-1].
// tensors[i] must live on the device managed by backends[i] and be
// contiguous F32, F16, or BF16.
// Preconditions are checked by the CUDA comm dispatcher before calling this.
// Returns true once the reduction work has been enqueued successfully.
bool ggml_cuda_ar_allreduce(
ggml_cuda_ar_pipeline * pipeline,
ggml_backend_t * backends,
ggml_tensor ** tensors);
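To show how the three entry points fit together, here is a minimal usage sketch. It assumes a two-GPU setup with backends and matching contiguous tensors already prepared by the caller; the function name and device IDs are illustrative, and the real call site is the CUDA comm dispatcher in ggml-cuda.cu, which creates the pipeline once and reuses it across reductions.
#include "allreduce.cuh"
// Hypothetical driver: init once, reuse for many reductions, free at teardown.
static void run_allreduce_example(ggml_backend_t backends[2], ggml_tensor * tensors[2]) {
    const int devices[2] = { 0, 1 };  // CUDA device IDs in rank order
    ggml_cuda_ar_pipeline * p = ggml_cuda_ar_pipeline_init(devices, 2);
    if (p == nullptr) {
        // init failed (pre-Volta GPU, HIP/MUSA build, allocation failure):
        // the caller falls back to NCCL or the meta backend's generic AllReduce.
        return;
    }
    if (!ggml_cuda_ar_allreduce(p, backends, tensors)) {
        // per-call failure: fall back for this reduction only
    }
    ggml_cuda_ar_pipeline_free(p);    // drains the AR streams, releases buffers
}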

View file

@ -61,6 +61,11 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 32, 128, 2, 64, 64, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(128, 128, 64, 128, 2, 64, 64, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 8, 64, 4, 64, 96, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 16, 64, 4, 32, 96, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 32, 128, 2, 32, 96, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(192, 128, 64, 128, 2, 32, 96, 64, 64, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 8, 64, 4, 64, 128, 128, 128, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 16, 64, 4, 32, 128, 128, 128, 2, true);
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 2, true);
@ -1561,6 +1566,10 @@ static __global__ void flash_attn_ext_f16(
NO_DEVICE_CODE;
return;
}
if (DKQ == 192 && ncols2 != 8 && ncols2 != 16) {
NO_DEVICE_CODE;
return;
}
#ifdef VOLTA_MMA_AVAILABLE
if (ncols1*ncols2 < 32) {
NO_DEVICE_CODE;

View file

@ -34,6 +34,10 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case<128, 128>(ctx, dst);
} break;
case 192: {
GGML_ASSERT(V->ne[0] == 128);
ggml_cuda_flash_attn_ext_tile_case<192, 128>(ctx, dst);
} break;
case 256: {
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case<256, 256>(ctx, dst);

View file

@ -62,6 +62,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 64, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 64, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 64, 64)
@ -124,6 +130,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 16, 128, 3, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 128, 3, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 128, 3, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 128, 3, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 3, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 32, 256)
@ -193,6 +205,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 2, 64, 32)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 256, 2, 128, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 256, 2, 128, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 256, 2, 64, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 256, 2, 64, 128)
@ -264,6 +282,12 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 32, 256, 3, 128, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(128, 128, 64, 256, 3, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 2, 64, 8, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 4, 128, 6, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 8, 128, 6, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 16, 256, 5, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(192, 128, 32, 256, 3, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 2, 64, 8, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 4, 128, 6, 32, 256)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 8, 128, 6, 32, 256)
@ -1250,7 +1274,20 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
}
}
if constexpr (DKQ <= 512 && DKQ != 320) {
if constexpr (DKQ == 192) {
// MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
if (use_gqa_opt && gqa_ratio % 16 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
return;
}
if (use_gqa_opt && gqa_ratio % 8 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
return;
}
GGML_ABORT("flash-attn tile (192/128): expected GQA ratio multiple of 8");
}
if constexpr (DKQ <= 512 && DKQ != 320 && DKQ != 192) {
if (use_gqa_opt && gqa_ratio % 8 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
return;
@ -1303,6 +1340,7 @@ extern DECL_FATTN_TILE_CASE( 80, 80);
extern DECL_FATTN_TILE_CASE( 96, 96);
extern DECL_FATTN_TILE_CASE(112, 112);
extern DECL_FATTN_TILE_CASE(128, 128);
extern DECL_FATTN_TILE_CASE(192, 128);
extern DECL_FATTN_TILE_CASE(256, 256);
extern DECL_FATTN_TILE_CASE(320, 256);
extern DECL_FATTN_TILE_CASE(512, 512);

View file

@ -139,6 +139,22 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
GGML_ASSERT(V->ne[0] == 128);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst);
break;
case 192: {
// MiMo-V2.5 / V2.5-Pro / V2-Flash: gqa_ratio is 8 (SWA) or 16 (full attn)
GGML_ASSERT(V->ne[0] == 128);
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
const bool use_gqa_opt = mask && max_bias == 0.0f;
GGML_ASSERT(use_gqa_opt);
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
const int gqa_ratio = Q->ne[2] / K->ne[2];
if (gqa_ratio % 16 == 0) {
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128, 16>(ctx, dst);
} else {
GGML_ASSERT(gqa_ratio % 8 == 0);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<192, 128, 8>(ctx, dst);
}
} break;
case 256:
GGML_ASSERT(V->ne[0] == 256);
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
@ -369,6 +385,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
return BEST_FATTN_KERNEL_NONE;
}
break;
case 192:
if (V->ne[0] != 128 || !gqa_opt_applies) {
return BEST_FATTN_KERNEL_NONE;
}
if (gqa_ratio % 8 != 0) {
return BEST_FATTN_KERNEL_NONE;
}
break;
case 320:
if (V->ne[0] != 256 || !gqa_opt_applies) {
return BEST_FATTN_KERNEL_NONE;
@ -426,7 +450,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
}
// For small batch sizes the vector kernel may be preferable over the kernels optimized for large batch sizes:
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;
// 192 satisfies % 64 == 0 but has no vec instance (DKQ != DV); force it onto the MMA path.
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && Q->ne[0] != 192 && K->ne[1] % FATTN_KQ_STRIDE == 0;
// If Turing tensor cores are available, use them:
if (turing_mma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72) {
@ -455,7 +480,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
// Use the WMMA kernel if possible but only for HIP
#if defined(GGML_HIP_ROCWMMA_FATTN)
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 512 && Q->ne[0] != 576) {
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 512 && Q->ne[0] != 576) {
if (can_use_vector_kernel && Q->ne[1] <= 2) {
return BEST_FATTN_KERNEL_VEC;
}
@ -489,7 +514,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
}
// Use MFMA flash attention for CDNA (MI100+):
if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
if (amd_mfma_available(cc) && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 192 && Q->ne[0] != 256 && Q->ne[0] != 512 && Q->ne[0] != 576) {
const int64_t eff_nq = Q->ne[1] * (gqa_opt_applies ? gqa_ratio : 1);
// MMA vs tile crossover benchmarked on MI300X @ d32768:
// hsk=64 (gqa=4): MMA wins at eff >= 128 (+11%)

View file

@ -4,6 +4,7 @@
bool g_mul_mat_q = true;
#include "ggml-cuda/allreduce.cuh"
#include "ggml-cuda/common.cuh"
#include "ggml-cuda/acc.cuh"
#include "ggml-cuda/add-id.cuh"
@ -88,6 +89,9 @@ bool g_mul_mat_q = true;
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#define GGML_LOG_WARN_ONCE(str) \
{ static std::once_flag warn_flag; std::call_once(warn_flag, []() { GGML_LOG_WARN(str); }); }
[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails
@ -1139,70 +1143,46 @@ static const ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_inte
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
#ifdef GGML_USE_NCCL
// Communication context for multi-GPU AllReduce during tensor parallelism.
//
// Created once per meta backend instance. Resources for the selected mode
// (NCCL communicators or the internal AllReduce pipeline) are initialised
// eagerly during comm_init so any init failure surfaces at startup rather
// than mid-run.
struct ggml_backend_cuda_comm_context {
using try_allreduce_fn = bool(*)(ggml_backend_cuda_comm_context *, struct ggml_tensor **);
std::vector<ggml_backend_t> backends;
std::vector<ncclComm_t> comms;
std::vector<int> dev_ids;
// Set by the init chain (comm_init_{nccl, internal, none}) to one of
// try_allreduce_{nccl, internal, butterfly}. nccl needs `comms`,
// internal needs `ar_pipeline`, butterfly needs nothing. Per-call
// failures return false; the meta backend's generic implementation then
// handles that call.
try_allreduce_fn try_allreduce = nullptr;
ggml_cuda_ar_pipeline * ar_pipeline = nullptr;
#ifdef GGML_USE_NCCL
std::vector<ncclComm_t> comms;
#endif // GGML_USE_NCCL
~ggml_backend_cuda_comm_context() {
#ifdef GGML_USE_NCCL
for (ncclComm_t comm : comms) {
NCCL_CHECK(ncclCommDestroy(comm));
}
#endif // GGML_USE_NCCL
ggml_cuda_ar_pipeline_free(ar_pipeline);
}
};
#endif // GGML_USE_NCCL
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
#ifdef GGML_USE_NCCL
if (comm_ctx_v == nullptr) {
return;
}
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
delete comm_ctx;
#else
GGML_UNUSED(comm_ctx_v);
#endif // GGML_USE_NCCL
}
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
#ifdef GGML_USE_NCCL
for (size_t i = 0; i < n_backends; i++) {
if (!ggml_backend_is_cuda(backends[i])) {
return nullptr;
}
}
ggml_backend_cuda_comm_context * ret = new ggml_backend_cuda_comm_context;
std::vector<int> dev_ids;
ret->backends.reserve(n_backends);
dev_ids.reserve(n_backends);
for (size_t i = 0; i < n_backends; i++) {
ret->backends.push_back(backends[i]);
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backends[i]->context;
dev_ids.push_back(cuda_ctx->device);
}
ret->comms.resize(n_backends);
NCCL_CHECK(ncclCommInitAll(ret->comms.data(), n_backends, dev_ids.data()));
return ret;
#else
// If NCCL is installed it is used by default for optimal performance.
// However, NVIDIA does not distribute NCCL with CUDA so users may be unwittingly missing this package.
// RCCL is disabled by default, users are explicitly opting in.
// Therefore print no warning for RCCL.
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
static bool warning_printed = false;
if (!warning_printed) {
GGML_LOG_WARN("%s: NVIDIA Collective Communications Library (NCCL) is unavailable, multi GPU performance will be suboptimal\n", __func__);
warning_printed = true;
}
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
GGML_UNUSED_VARS(backends, n_backends);
return nullptr;
#endif // GGML_USE_NCCL
}
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
#ifdef GGML_USE_NCCL
// AllReduce via NCCL. Reduces as FP32 for small tensors and BF16 for large
// tensors (bandwidth-bound), then converts back to FP32.
static bool ggml_backend_cuda_comm_allreduce_nccl(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
const int64_t ne = ggml_nelements(tensors[0]);
// FIXME the input of llm_graph_context::build_in_out_ids can produce a tensor with 0 elements if n_outputs == 0
// This then causes a crash in this function
@ -1210,8 +1190,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
return true;
}
GGML_ASSERT(comm_ctx_v != nullptr);
ggml_backend_cuda_comm_context * comm_ctx = (ggml_backend_cuda_comm_context *) comm_ctx_v;
const size_t n_backends = comm_ctx->backends.size();
for (size_t i = 0; i < n_backends; ++i) {
@ -1236,7 +1214,6 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
NCCL_CHECK(ncclAllReduce(tensors[i]->data, tensors[i]->data, ne, ncclFloat, ncclSum, comm_ctx->comms[i], cuda_ctx->stream()));
}
NCCL_CHECK(ncclGroupEnd());
return true;
}
@@ -1275,10 +1252,184 @@ static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct gg
}
return true;
#else
GGML_UNUSED_VARS(comm_ctx_v, tensors);
return false;
}
#endif // GGML_USE_NCCL
// Run the internal AR pipeline. Returns false on unsupported / failed input
// -- the caller decides whether to abort (env-forced) or fall back silently.
static bool ggml_backend_cuda_comm_allreduce_internal(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
GGML_ASSERT(comm_ctx->ar_pipeline != nullptr);
const size_t n_backends = comm_ctx->backends.size();
GGML_ASSERT(n_backends == 2);
GGML_ASSERT(tensors[0] != nullptr);
const int64_t ne = ggml_nelements(tensors[0]);
const ggml_type type = tensors[0]->type;
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16 && type != GGML_TYPE_BF16) {
GGML_LOG_DEBUG("%s: internal unsupported: type=%d\n", __func__, (int) type);
return false;
}
if (ne == 0) {
return true;
}
for (size_t i = 0; i < n_backends; ++i) {
if (tensors[i] == nullptr) {
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] is null\n", __func__, i);
return false;
}
if (ggml_nelements(tensors[i]) != ne || tensors[i]->type != type) {
GGML_LOG_ERROR("%s: internal failed: tensor[%zu] ne=%" PRId64 " type=%d expected ne=%" PRId64 " type=%d\n",
__func__, i, ggml_nelements(tensors[i]), (int) tensors[i]->type, ne, (int) type);
return false;
}
if (!ggml_is_contiguously_allocated(tensors[i])) {
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] is not contiguously allocated: ne=%" PRId64 " nbytes=%zu packed=%zu type=%d\n",
__func__, i, ne, ggml_nbytes(tensors[i]),
(size_t) ne * ggml_type_size(type) / ggml_blck_size(type), (int) type);
return false;
}
if (((uintptr_t) tensors[i]->data & 0xF) != 0) {
GGML_LOG_DEBUG("%s: internal unsupported: tensor[%zu] data pointer is not 16-byte aligned: %p type=%d ne=%" PRId64 "\n",
__func__, i, tensors[i]->data, (int) type, ne);
return false;
}
GGML_ASSERT((ggml_nbytes(tensors[i]) & 0xF) == 0);
}
return ggml_cuda_ar_allreduce(comm_ctx->ar_pipeline, comm_ctx->backends.data(), tensors);
}
// ---------------------------------------------------------------------------
// Per-call dispatch -- three variants, one per allreduce path (NCCL,
// internal, butterfly). Each is set as comm_ctx->try_allreduce by the
// matching init step. A per-call failure returns false; the meta backend's
// generic implementation then handles that call.
// ---------------------------------------------------------------------------
#ifdef GGML_USE_NCCL
static bool ggml_backend_cuda_comm_try_allreduce_nccl(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
return ggml_backend_cuda_comm_allreduce_nccl(comm_ctx, tensors);
}
#endif // GGML_USE_NCCL
static bool ggml_backend_cuda_comm_try_allreduce_internal(
ggml_backend_cuda_comm_context * comm_ctx, struct ggml_tensor ** tensors) {
return ggml_backend_cuda_comm_allreduce_internal(comm_ctx, tensors);
}
static bool ggml_backend_cuda_comm_try_allreduce_butterfly(
ggml_backend_cuda_comm_context *, struct ggml_tensor **) {
return false;
}
static void ggml_backend_cuda_comm_free(void * comm_ctx_v) {
if (comm_ctx_v == nullptr) {
return;
}
delete static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
}
// ---------------------------------------------------------------------------
// Init -- chained nccl -> internal -> none. Each step tries to bring up its
// resource; on failure it warns and recurses into the next step.
// ---------------------------------------------------------------------------
static void ggml_backend_cuda_comm_init_none(ggml_backend_cuda_comm_context * ret) {
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_butterfly;
}
static void ggml_backend_cuda_comm_init_internal(ggml_backend_cuda_comm_context * ret) {
ret->ar_pipeline = ggml_cuda_ar_pipeline_init(ret->dev_ids.data(), ret->dev_ids.size());
if (ret->ar_pipeline) {
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_internal;
return;
}
// Clear sticky CUDA error from the failed init.
(void) cudaGetLastError();
GGML_LOG_WARN("internal AllReduce init failed (n_devices != 2?); "
"falling back to meta-backend butterfly\n");
ggml_backend_cuda_comm_init_none(ret);
}
static void ggml_backend_cuda_comm_init_nccl(ggml_backend_cuda_comm_context * ret) {
#ifdef GGML_USE_NCCL
const size_t n = ret->dev_ids.size();
ret->comms.resize(n);
ncclResult_t rc = ncclCommInitAll(ret->comms.data(), (int) n, ret->dev_ids.data());
if (rc == ncclSuccess) {
ret->try_allreduce = ggml_backend_cuda_comm_try_allreduce_nccl;
return;
}
ret->comms.clear();
GGML_LOG_WARN("NCCL init failed (%s); falling back to internal AllReduce\n",
ncclGetErrorString(rc));
#else // GGML_USE_NCCL
#ifndef GGML_USE_HIP
GGML_LOG_WARN("NCCL not compiled in; falling back to internal AllReduce. "
"Recompile with -DGGML_CUDA_NCCL=ON for best multi-GPU performance.\n");
#endif // !GGML_USE_HIP
#endif // GGML_USE_NCCL
ggml_backend_cuda_comm_init_internal(ret);
}
// Top-level init. Picks one of the three init paths based on
// GGML_CUDA_ALLREDUCE (or the platform default) and lets the chain handle
// any fallback. Unrecognised env values warn and disable the CUDA comm
// path ("none"), leaving AllReduce to the meta backend's butterfly.
static void * ggml_backend_cuda_comm_init(ggml_backend_t * backends, size_t n_backends) {
for (size_t i = 0; i < n_backends; i++) {
if (!ggml_backend_is_cuda(backends[i])) {
return nullptr;
}
}
auto * ret = new ggml_backend_cuda_comm_context;
ret->backends.assign(backends, backends + n_backends);
ret->dev_ids.reserve(n_backends);
for (size_t i = 0; i < n_backends; i++) {
ret->dev_ids.push_back(static_cast<ggml_backend_cuda_context *>(backends[i]->context)->device);
}
const char * env = getenv("GGML_CUDA_ALLREDUCE");
if (!env) {
// Platform default: Linux uses NCCL, otherwise (generally Windows) internal
#if defined(__linux__)
ggml_backend_cuda_comm_init_nccl(ret);
#else
ggml_backend_cuda_comm_init_internal(ret);
#endif // defined(__linux__)
} else {
std::string env_str(env);
if (env_str == "nccl") {
ggml_backend_cuda_comm_init_nccl(ret);
} else if (env_str == "internal") {
ggml_backend_cuda_comm_init_internal(ret);
} else if (env_str == "none") {
ggml_backend_cuda_comm_init_none(ret);
} else {
GGML_LOG_WARN("unknown GGML_CUDA_ALLREDUCE value: %s\n", env);
ggml_backend_cuda_comm_init_none(ret);
}
}
return ret;
}
// Top-level dispatch -- calls the function pointer chosen by comm_init.
// Returns false to let the meta-backend's butterfly run.
static bool ggml_backend_cuda_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) {
if (comm_ctx_v == nullptr) {
return false;
}
auto * comm_ctx = static_cast<ggml_backend_cuda_comm_context *>(comm_ctx_v);
return comm_ctx->try_allreduce(comm_ctx, tensors);
}
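// Illustrative usage sketch (not part of this patch; the host-side call sites
// are assumptions -- only the functions and the GGML_CUDA_ALLREDUCE values
// "nccl"/"internal"/"none" come from this file):
//
//   setenv("GGML_CUDA_ALLREDUCE", "internal", /*overwrite=*/1); // optional override
//   void * comm_ctx = ggml_backend_cuda_comm_init(backends, n_backends);
//   // for each tensor replicated across the devices:
//   if (!ggml_backend_cuda_comm_allreduce_tensor(comm_ctx, tensors)) {
//       // false -> the meta backend runs its generic butterfly AllReduce instead
//   }
//   ggml_backend_cuda_comm_free(comm_ctx);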
ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {

View file

@@ -2,4 +2,5 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(192, 128, 1, 16);
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);

View file

@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
DECL_FATTN_MMA_F16_CASE(192, 128, 1, 8);
DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
DECL_FATTN_MMA_F16_CASE(512, 512, 1, 8);

View file

@@ -2,4 +2,5 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(192, 128, 2, 16);
DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);

View file

@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
DECL_FATTN_MMA_F16_CASE(192, 128, 2, 8);
DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
DECL_FATTN_MMA_F16_CASE(512, 512, 2, 8);

View file

@@ -2,4 +2,5 @@
#include "../fattn-mma-f16.cuh"
DECL_FATTN_MMA_F16_CASE(192, 128, 4, 16);
DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);

View file

@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8);
DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8);
DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8);
DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8);
DECL_FATTN_MMA_F16_CASE(192, 128, 4, 8);
DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8);
DECL_FATTN_MMA_F16_CASE(512, 512, 4, 8);

View file

@@ -7,5 +7,6 @@ DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
DECL_FATTN_MMA_F16_CASE(192, 128, 8, 8);
DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);
DECL_FATTN_MMA_F16_CASE(512, 512, 8, 8);

View file

@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../fattn-tile.cuh"
DECL_FATTN_TILE_CASE(192, 128);

View file

@@ -3,7 +3,10 @@
from glob import glob
import os
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 320, 512, 576]
HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 192, 256, 320, 512, 576]
# DKQ -> DV override for asymmetric head dims.
HEAD_SIZES_V_OVERRIDE = {576: 512, 320: 256, 192: 128}
TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_BF16"]
@@ -62,7 +65,7 @@ for filename in glob("*.cu"):
os.remove(filename)
for head_size_kq in HEAD_SIZES_KQ:
head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
with open(f"fattn-tile-instance-dkq{head_size_kq}-dv{head_size_v}.cu", "w") as f:
f.write(SOURCE_FATTN_TILE.format(head_size_kq=head_size_kq, head_size_v=head_size_v))
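# Illustrative lookups for the override above (an aside for clarity, not part
# of the generator script):
#   HEAD_SIZES_V_OVERRIDE.get(192, 192)  # -> 128 (asymmetric head, MiMo-V2.5)
#   HEAD_SIZES_V_OVERRIDE.get(576, 576)  # -> 512 (DeepSeek-style MLA)
#   HEAD_SIZES_V_OVERRIDE.get(128, 128)  # -> 128 (symmetric heads fall through)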
@@ -85,15 +88,17 @@ for ncols in [8, 16, 32, 64]:
if head_size_kq == 72:
continue
# Skip compilation of unused ncols2 values for niche head sizes:
if head_size_kq == 192 and ncols2 not in (8, 16): # MiMo-V2.5
continue
if head_size_kq == 320 and ncols2 != 32: # Mistral Small 4
continue
if head_size_kq == 512 and ncols2 not in (4, 8): # Gemma 4
continue
if head_size_kq == 576 and ncols2 not in (4, 16, 32): # Deepseek, GLM 4.7 Flash
continue
if head_size_kq not in (320, 576) and ncols2 in (16, 32):
if head_size_kq not in (192, 320, 576) and ncols2 in (16, 32):
continue
head_size_v = 256 if head_size_kq == 320 else (head_size_kq if head_size_kq != 576 else 512)
head_size_v = HEAD_SIZES_V_OVERRIDE.get(head_size_kq, head_size_kq)
f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))
for type in TYPES_MMQ:

View file

@@ -1261,10 +1261,6 @@ void llama_model_base::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
}
// for differentiating model types
uint32_t n_vocab = 0;
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
// for classifier models
ml.get_arr(LLM_KV_CLASSIFIER_OUTPUT_LABELS, classifier_labels, false);
if (!classifier_labels.empty()) {

View file

@@ -728,6 +728,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
};
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
break;
case LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE:
// Sarvam uses SPM-style BPE (same shape as Gemma4): spaces replaced with U+2581
// by the normalizer, BPE merges over the whole text on raw UTF-8.
regex_exprs = {
"[^\\n]+|[\\n]+",
};
byte_encode = false;
break;
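// Illustrative split under the regex above (an assumption for clarity, not
// part of the patch): after whitespace escaping, "Hello world\n\nnext"
// becomes "Hello\u2581world\n\nnext", and the regex yields the fragments
// { "Hello\u2581world", "\n\n", "next" }, each of which is then BPE-merged
// over its raw UTF-8 bytes.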
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
@@ -2241,6 +2249,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "gemma4") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
escape_whitespaces = true;
} else if (
tokenizer_pre == "sarvam-moe") {
pre_type = LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE;
escape_whitespaces = true;
clean_spaces = false;
} else if (
tokenizer_pre == "jina-v1-en" ||
tokenizer_pre == "jina-v2-code" ||

View file

@@ -60,6 +60,7 @@ enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
LLAMA_VOCAB_PRE_TYPE_SARVAM_MOE = 51,
};
struct LLM_KV;

View file

@@ -1,7 +1,8 @@
#include "models.h"
void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) {
const auto n_vocab = vocab.n_tokens();
uint32_t n_vocab = 0;
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
// lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));

View file

@@ -1,7 +1,8 @@
#include "models.h"
void llama_model_llama::load_arch_hparams(llama_model_loader & ml) {
const auto n_vocab = vocab.n_tokens();
uint32_t n_vocab = 0;
ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

View file

@@ -1317,7 +1317,7 @@ private:
return false;
}
const bool need_logits = task.params.sampling.n_probs > 0;
const bool need_pre_sample_logits = task.params.sampling.n_probs > 0 && !task.params.post_sampling_probs;
bool backend_sampling = true;
@@ -1326,8 +1326,8 @@ private:
// TODO: speculative decoding requires multiple samples per batch - not supported yet
backend_sampling &= !(slot.can_speculate() && common_speculative_n_max(slot.spec.get(), task.params.speculative) > 0);
// TODO: getting post/pre sampling logits is not yet supported with backend sampling
backend_sampling &= !need_logits;
// TODO: getting pre sampling logits is not yet supported with backend sampling
backend_sampling &= !need_pre_sample_logits;
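// Illustrative cases for the flag above (for clarity, not part of the patch):
//   n_probs > 0, post_sampling_probs == true  -> pre-sample logits not needed -> backend sampling still allowed
//   n_probs > 0, post_sampling_probs == false -> pre-sample logits needed     -> backend sampling disabled
//   n_probs == 0                              -> unaffected by this check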
// TODO: tmp until backend sampling is fully implemented
if (backend_sampling) {
@@ -1504,6 +1504,12 @@ private:
// set probability for top n_probs tokens
result.probs.reserve(n_probs);
for (size_t i = 0; i < n_probs; i++) {
// Some samplers do return 0.0 probabilities, others don't.
// Filter out 0.0 probabilities to ensure the behavior is consistent.
if (cur_p->data[i].p == 0.0) {
break;
}
result.probs.push_back({
cur_p->data[i].id,
common_token_to_piece(ctx, cur_p->data[i].id, special),

View file

@@ -381,7 +381,8 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
if (result == nullptr) {
// timeout, check stop condition
if (should_stop()) {
SRV_DBG("%s", "stopping wait for next result due to should_stop condition\n");
SRV_WRN("%s", "stopping wait for next result due to should_stop condition (adjust the --timeout argument if needed)\n");
SRV_WRN("%s", "ref: https://github.com/ggml-org/llama.cpp/pull/22907\n");
return nullptr;
}
} else {

View file

@@ -491,29 +491,82 @@ def test_n_probs_post_sampling():
global server
server.start()
res = server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
"prompt": "Today was the day. Today I would finally become a",
"n_probs": 10,
"temperature": 0.0,
"temperature": 1.0,
"n_predict": 5,
"post_sampling_probs": True,
})
assert res.status_code == 200
assert "completion_probabilities" in res.body
assert len(res.body["completion_probabilities"]) == 5
for tok in res.body["completion_probabilities"]:
for (i, tok) in enumerate(res.body["completion_probabilities"]):
assert "id" in tok and tok["id"] > 0
assert "token" in tok and type(tok["token"]) == str
assert "prob" in tok and 0.0 < tok["prob"] <= 1.0
assert "bytes" in tok and type(tok["bytes"]) == list
assert len(tok["top_probs"]) == 10
assert "top_probs" in tok and type(tok["top_probs"]) == list
for prob in tok["top_probs"]:
assert "id" in prob and prob["id"] > 0
assert "token" in prob and type(prob["token"]) == str
assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0
# 0.0 probability tokens should never be returned by the server
assert "prob" in prob and 0.0 < prob["prob"] <= 1.0
assert "bytes" in prob and type(prob["bytes"]) == list
# because the test model usually outputs tokens with either 100% or 0% probability, we need to check all the top_probs
assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
if i == 0:
# The prompt is vague enough that we should get at least 10 possibilities
# for the first token.
assert len(tok["top_probs"]) == 10
if len(tok["top_probs"]) < 10:
# Getting less than the requested number of probabilities should only happen
# if the ones we did get already sum to 1.0.
assert sum(p["prob"] for p in tok["top_probs"]) == pytest.approx(1.0)
def test_n_probs_post_backend_sampling():
"""Verify that the same probabilities are returned with and without backend sampling."""
global server
server.backend_sampling = True
server.start()
def make_request(backend_sampling):
n_predict = 20
res = server.make_request("POST", "/completion", data={
"prompt": "The countries of Europe, in random order, are:",
"n_probs": 10,
"n_predict": n_predict,
"post_sampling_probs": True,
"seed": 4242,
"backend_sampling": backend_sampling,
})
assert res.status_code == 200
total_probs = 0
completions = res.body["completion_probabilities"]
assert len(completions) == n_predict
for tok in completions:
# Handling of 0.0 probabilities differs between samplers and backend sampling. Filter them to normalize the
# data.
tok["top_probs"] = [x for x in tok["top_probs"] if x["prob"] > 0.0]
total_probs += len(tok["top_probs"])
# Verify that we got at least two top probs on average, to ensure the effectiveness of the test.
assert total_probs >= 2 * n_predict
return completions
def verify_token(a, b):
assert a["id"] == b["id"]
assert a["token"] == b["token"]
assert a["bytes"] == b["bytes"]
assert a["prob"] == pytest.approx(b["prob"], abs=0.01)
for (a, b) in zip(make_request(True), make_request(False)):
verify_token(a, b)
assert len(a["top_probs"]) == len(b["top_probs"])
for (aa, bb) in zip(a["top_probs"], b["top_probs"]):
verify_token(aa, bb)
@pytest.mark.parametrize("tokenize,openai_style", [(False, False), (False, True), (True, False), (True, True)])
def test_logit_bias(tokenize, openai_style):

View file

@@ -108,6 +108,7 @@ class ServerProcess:
no_cache_idle_slots: bool = False
log_path: str | None = None
webui_mcp_proxy: bool = False
backend_sampling: bool = False
gcp_compat: bool = False
# session variables
@@ -252,6 +253,8 @@ class ServerProcess:
server_args.append("--no-cache-idle-slots")
if self.webui_mcp_proxy:
server_args.append("--webui-mcp-proxy")
if self.backend_sampling:
server_args.append("--backend_sampling")
if self.gcp_compat:
env["AIP_MODE"] = "PREDICTION"

View file

@@ -41,7 +41,7 @@ if (LLAMA_BUILD_BORINGSSL)
set(FIPS OFF CACHE BOOL "Enable FIPS (BoringSSL)")
set(BORINGSSL_GIT "https://boringssl.googlesource.com/boringssl" CACHE STRING "BoringSSL git repository")
set(BORINGSSL_VERSION "0.20260413.0" CACHE STRING "BoringSSL version")
set(BORINGSSL_VERSION "0.20260508.0" CACHE STRING "BoringSSL version")
message(STATUS "Fetching BoringSSL version ${BORINGSSL_VERSION}")

View file

@@ -1161,12 +1161,11 @@ bool parse_header(const char *beg, const char *end, T fn) {
if (!detail::fields::is_field_value(val)) { return false; }
if (case_ignore::equal(key, "Location") ||
case_ignore::equal(key, "Referer")) {
fn(key, val);
} else {
fn(key, decode_path_component(val));
}
// RFC 9110 §5.5: header field values are opaque octets and MUST NOT be
// percent-decoded by the recipient. Applications that need to interpret a
// value as a URI component should call httplib::decode_uri_component()
// (or decode_path_component()) explicitly.
fn(key, val);
return true;
}
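// Application-side decoding sketch (an assumption, not part of this patch;
// decode_uri_component()/decode_path_component() are the helpers named in the
// comment above, with a string-in/string-out signature assumed):
//
//   auto location = res.get_header_value("Location");        // raw octets, as received
//   auto decoded  = httplib::decode_uri_component(location); // explicit opt-in decode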
@@ -6192,9 +6191,29 @@
#endif
max_thread_count_ = max_n == 0 ? n : max_n;
threads_.reserve(base_thread_count_);
for (size_t i = 0; i < base_thread_count_; i++) {
threads_.emplace_back(std::thread([this]() { worker(false); }));
#ifndef CPPHTTPLIB_NO_EXCEPTIONS
try {
#endif
for (size_t i = 0; i < base_thread_count_; i++) {
threads_.emplace_back(std::thread([this]() { worker(false); }));
}
#ifndef CPPHTTPLIB_NO_EXCEPTIONS
} catch (...) {
// If thread creation fails partway (e.g., pthread_create returns EAGAIN),
// signal the workers we already spawned to exit and join them so the
// vector destructor does not see joinable threads (which would call
// std::terminate). Then rethrow so the caller learns of the failure.
{
std::unique_lock<std::mutex> lock(mutex_);
shutdown_ = true;
}
cond_.notify_all();
for (auto &t : threads_) {
if (t.joinable()) { t.join(); }
}
throw;
}
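// Concrete failure path handled above (illustrative numbers, an assumption
// for clarity): with base_thread_count_ == 8, if pthread_create fails for
// thread 5 with EAGAIN, std::thread's constructor throws std::system_error;
// the shutdown_ + notify_all + join sequence tears down threads 0..4 before
// the exception propagates, so the vector destructor never sees a joinable
// thread.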
#endif
}
bool ThreadPool::enqueue(std::function<void()> fn) {
@@ -8980,10 +8999,22 @@ ssize_t ChunkedDecoder::read_payload(char *buf, size_t len,
stream_line_reader lr(strm, line_buf, sizeof(line_buf));
if (!lr.getline()) { return -1; }
char *endptr = nullptr;
unsigned long chunk_len = std::strtoul(lr.ptr(), &endptr, 16);
if (endptr == lr.ptr()) { return -1; }
if (chunk_len == ULONG_MAX) { return -1; }
// RFC 9112 §7.1: chunk-size = 1*HEXDIG
const char *p = lr.ptr();
int v = 0;
if (!is_hex(*p, v)) { return -1; }
size_t chunk_len = 0;
constexpr size_t chunk_len_max = (std::numeric_limits<size_t>::max)();
for (; is_hex(*p, v); ++p) {
if (chunk_len > (chunk_len_max >> 4)) { return -1; }
chunk_len = (chunk_len << 4) | static_cast<size_t>(v);
}
while (is_space_or_tab(*p)) {
++p;
}
if (*p != '\0' && *p != ';' && *p != '\r' && *p != '\n') { return -1; }
if (chunk_len == 0) {
chunk_remaining = 0;
@@ -8993,7 +9024,7 @@
return 0;
}
chunk_remaining = static_cast<size_t>(chunk_len);
chunk_remaining = chunk_len;
last_chunk_total = chunk_remaining;
last_chunk_offset = 0;
}
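// Worked examples for the chunk-size parsing above (illustrative only):
//   "1e\r\n"         -> chunk_len = 30
//   "A ; ext=x\r\n"  -> chunk_len = 10 (optional whitespace plus a chunk extension)
//   "zz\r\n"         -> rejected: the first character is not a HEXDIG
//   any value above SIZE_MAX -> rejected by the (chunk_len_max >> 4) overflow guard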

View file

@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.43.3"
#define CPPHTTPLIB_VERSION_NUM "0x002b03"
#define CPPHTTPLIB_VERSION "0.44.0"
#define CPPHTTPLIB_VERSION_NUM "0x002c00"
#ifdef _WIN32
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00