mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-13 02:19:41 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # .github/workflows/docker.yml # CMakeLists.txt # Makefile # README.md # flake.lock # tests/test-backend-ops.cpp
This commit is contained in:
commit
4ed9ba7352
58 changed files with 2404 additions and 361 deletions
33
ggml-cuda.cu
33
ggml-cuda.cu
|
@ -121,6 +121,20 @@ int ggml_cuda_get_device() {
|
|||
return id;
|
||||
}
|
||||
|
||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||
ggml_cuda_set_device(device);
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
|
||||
auto res = hipMallocManaged(ptr, size);
|
||||
if (res == hipSuccess) {
|
||||
// if error we "need" to know why...
|
||||
CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
||||
}
|
||||
return res;
|
||||
#else
|
||||
return cudaMalloc(ptr, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
|
@ -275,7 +289,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
|||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
ggml_cuda_set_device(device);
|
||||
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
||||
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
return ptr;
|
||||
|
@ -537,7 +551,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
|
|||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||
|
||||
void * dev_ptr;
|
||||
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
||||
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
cudaGetLastError();
|
||||
|
@ -798,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
|
|||
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||
ggml_cuda_set_device(id);
|
||||
char * buf;
|
||||
CUDA_CHECK(cudaMalloc(&buf, size));
|
||||
CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
|
||||
|
||||
// set padding to 0 to avoid possible NaN values
|
||||
if (size > original_size) {
|
||||
|
@ -2516,9 +2530,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||
|
||||
bool use_cuda_graph = true;
|
||||
bool cuda_graph_update_required = false;
|
||||
// pointer to CUDA cpy kernel, which is required to identify
|
||||
// vector of pointers to CUDA cpy kernels, which are required to identify
|
||||
// kernel parameters which need updated in the graph for each token
|
||||
void * ggml_cuda_cpy_fn_ptr = nullptr;
|
||||
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
|
||||
|
||||
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
||||
|
@ -2594,9 +2608,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||
if (node->op == GGML_OP_CPY) {
|
||||
// store the copy op parameter which changes with each token.
|
||||
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||
if (ggml_cuda_cpy_fn_ptr == nullptr) {
|
||||
// store a pointer to the copy op CUDA kernel to identify it later
|
||||
ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
// store a pointer to each copy op CUDA kernel to identify it later
|
||||
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2726,7 +2741,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||
if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
|
||||
int k = 0;
|
||||
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||
if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
|
||||
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
|
||||
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
|
||||
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
|
||||
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue