From 6ed481eea4cf4ed40777db2fa29e8d08eb712b3b Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Fri, 29 May 2026 12:28:18 +0200 Subject: [PATCH] CUDA: Check PTX version on host side to guard PDL dispatch (#23530) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CUDA: Check PTX version on host side to guard PDL dispatch Checking on `__CUDA_ARCH_LIST__` alone is insufficient for JIT, as this variable doesn't differentiate between compiling for say sm_90, sm_90a or sm_90f (so forward-jittable PTX vs. arch/family-specific PTX). Thus, one can have a bug when compiling with `DCMAKE_CUDA_ARCHITECTURES="89;90a"`, where current code would wrongly dispatch to PDL on sm_90/sm_120 in forward-JIT mode. This PR fixes this issue by checking `cudaFuncAttributes::ptxVersion` of the incoming kernel at runtime. A check on ptxVersion alone is sufficient, as device-codes will always be >= ptxVersion (and any violation of this would be a severe bug in CUDA/nvcc), see: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-code-code-code * Implement MurmurHash3 mixer for better hash distribution Magic constants were taken from boost: https://github.com/boostorg/container_hash/blob/2698b43803c012601e6bb1a6116e83767b97986c/include/boost/container_hash/detail/hash_mix.hpp#L19-L65 * Update ggml/src/ggml-cuda/common.cuh Co-authored-by: Johannes Gäßler * Address review comments, make seed non-zero * Apply code-formatting * Replace std::size_t -> size_t for consistency --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/common.cuh | 60 +++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 50d7763dc..560fab0b1 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -7,6 +7,7 @@ #include #include #include +#include #if defined(GGML_USE_HIP) #define GGML_COMMON_DECL_HIP @@ -1552,6 +1553,62 @@ struct ggml_cuda_pdl_config { ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete; }; + +static bool ggml_cuda_kernel_can_use_pdl(const void * kernel) { + const int device = ggml_cuda_get_device(); + + struct cache_key { + int device; + const void * kernel; + + bool operator==(const cache_key & other) const { return device == other.device && kernel == other.kernel; } + }; + + struct cache_key_hash { + // MurmurHash3 mixing function for better hash distribution (vs. just std::hash which in some implementations simply returns the identity) + static size_t hash_mix(size_t x) { + std::uint64_t y = x; + const std::uint64_t m = 0xe9846af9b1a615d; + + y ^= y >> 32; + y *= m; + y ^= y >> 32; + y *= m; + y ^= y >> 28; + + return static_cast(y); + } + + size_t operator()(const cache_key & key) const { + // Use a nonzero seed to avoid mapping all-zero keys to zero + size_t h = 42; + h = hash_mix(h + key.device); + h = hash_mix(h + reinterpret_cast(key.kernel)); + return h; + } + }; + + static std::mutex cache_mutex; + static std::unordered_map cache; + + const cache_key key = { device, kernel }; + std::lock_guard lock(cache_mutex); + const auto it = cache.find(key); + if (it != cache.end()) { + return it->second; + } + + cudaFuncAttributes attr = {}; + CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel)); + + // PDL device-side primitives are emitted only for PTX versions >= 90. + // We have to guard on a loaded kernel's PTX version so a kernel forward-JIT'ed + // from pre-Hopper PTX to a Hopper-or-newer GPU does not opt into PDL. + const bool can_use_pdl = attr.ptxVersion >= 90; + cache.emplace(key, can_use_pdl); + return can_use_pdl; +} + #endif //defined(GGML_CUDA_USE_PDL) @@ -1564,8 +1621,7 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke return env == nullptr || std::atoi(env) != 0; }(); - const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) { + if (env_pdl_enabled && ggml_cuda_kernel_can_use_pdl(reinterpret_cast(kernel))) { auto pdl_cfg = ggml_cuda_pdl_config(launch_params); CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward(args)... ));