mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-31 21:39:42 +00:00
CUDA: Check PTX version on host side to guard PDL dispatch (#23530)
* CUDA: Check PTX version on host side to guard PDL dispatch
Checking on `__CUDA_ARCH_LIST__` alone is insufficient for JIT, as this
variable doesn't differentiate between compiling for say sm_90, sm_90a
or sm_90f (so forward-jittable PTX vs. arch/family-specific PTX).
Thus, one can have a bug when compiling with
`DCMAKE_CUDA_ARCHITECTURES="89;90a"`, where current code would wrongly
dispatch to PDL on sm_90/sm_120 in forward-JIT mode.
This PR fixes this issue by checking `cudaFuncAttributes::ptxVersion` of
the incoming kernel at runtime. A check on ptxVersion alone is
sufficient, as device-codes will always be >= ptxVersion (and any
violation of this would be a severe bug in CUDA/nvcc), see:
https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-code-code-code
* Implement MurmurHash3 mixer for better hash distribution
Magic constants were taken from boost:
2698b43803/include/boost/container_hash/detail/hash_mix.hpp (L19-L65)
* Update ggml/src/ggml-cuda/common.cuh
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Address review comments, make seed non-zero
* Apply code-formatting
* Replace std::size_t -> size_t for consistency
---------
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
parent
cb47092b00
commit
6ed481eea4
1 changed files with 58 additions and 2 deletions
|
|
@ -7,6 +7,7 @@
|
|||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
#if defined(GGML_USE_HIP)
|
||||
#define GGML_COMMON_DECL_HIP
|
||||
|
|
@ -1552,6 +1553,62 @@ struct ggml_cuda_pdl_config {
|
|||
ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete;
|
||||
|
||||
};
|
||||
|
||||
static bool ggml_cuda_kernel_can_use_pdl(const void * kernel) {
|
||||
const int device = ggml_cuda_get_device();
|
||||
|
||||
struct cache_key {
|
||||
int device;
|
||||
const void * kernel;
|
||||
|
||||
bool operator==(const cache_key & other) const { return device == other.device && kernel == other.kernel; }
|
||||
};
|
||||
|
||||
struct cache_key_hash {
|
||||
// MurmurHash3 mixing function for better hash distribution (vs. just std::hash which in some implementations simply returns the identity)
|
||||
static size_t hash_mix(size_t x) {
|
||||
std::uint64_t y = x;
|
||||
const std::uint64_t m = 0xe9846af9b1a615d;
|
||||
|
||||
y ^= y >> 32;
|
||||
y *= m;
|
||||
y ^= y >> 32;
|
||||
y *= m;
|
||||
y ^= y >> 28;
|
||||
|
||||
return static_cast<size_t>(y);
|
||||
}
|
||||
|
||||
size_t operator()(const cache_key & key) const {
|
||||
// Use a nonzero seed to avoid mapping all-zero keys to zero
|
||||
size_t h = 42;
|
||||
h = hash_mix(h + key.device);
|
||||
h = hash_mix(h + reinterpret_cast<size_t>(key.kernel));
|
||||
return h;
|
||||
}
|
||||
};
|
||||
|
||||
static std::mutex cache_mutex;
|
||||
static std::unordered_map<cache_key, bool, cache_key_hash> cache;
|
||||
|
||||
const cache_key key = { device, kernel };
|
||||
std::lock_guard<std::mutex> lock(cache_mutex);
|
||||
const auto it = cache.find(key);
|
||||
if (it != cache.end()) {
|
||||
return it->second;
|
||||
}
|
||||
|
||||
cudaFuncAttributes attr = {};
|
||||
CUDA_CHECK(cudaFuncGetAttributes(&attr, kernel));
|
||||
|
||||
// PDL device-side primitives are emitted only for PTX versions >= 90.
|
||||
// We have to guard on a loaded kernel's PTX version so a kernel forward-JIT'ed
|
||||
// from pre-Hopper PTX to a Hopper-or-newer GPU does not opt into PDL.
|
||||
const bool can_use_pdl = attr.ptxVersion >= 90;
|
||||
cache.emplace(key, can_use_pdl);
|
||||
return can_use_pdl;
|
||||
}
|
||||
|
||||
#endif //defined(GGML_CUDA_USE_PDL)
|
||||
|
||||
|
||||
|
|
@ -1564,8 +1621,7 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
|
|||
return env == nullptr || std::atoi(env) != 0;
|
||||
}();
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
|
||||
if (env_pdl_enabled && ggml_cuda_kernel_can_use_pdl(reinterpret_cast<const void *>(kernel))) {
|
||||
auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
|
||||
|
||||
CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue