From 5126c41c1cae810a28e89dd424f729ef59417341 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Mon, 5 Jan 2026 01:37:09 +0800
Subject: [PATCH 1/3] ggml-cuda: remove unused params in ggml_cuda_graph
 (#18579)

---
 ggml/src/ggml-cuda/common.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 302065ce9..78502057a 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1058,7 +1058,6 @@ struct ggml_cuda_graph {
     cudaGraphExec_t instance = nullptr;
     size_t num_nodes = 0;
     std::vector<cudaGraphNode_t> nodes;
-    std::vector<cudaKernelNodeParams> params;
     bool disable_due_to_gpu_arch = false;
     bool disable_due_to_too_many_updates = false;
     bool disable_due_to_failed_graph_capture = false;

From 908a9e5a1eaaff345f05087beafdf43d31e3f00a Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Mon, 5 Jan 2026 01:37:48 +0800
Subject: [PATCH 2/3] CUDA: disable cuda graph when using n-cpu-moe (#18593)

* CUDA: disable cuda graph when using n-cpu-moe

* call ggml_cuda_set_device
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index f05d5562b..80d983f9e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3696,6 +3696,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 }
 
 static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
+
 #ifdef USE_CUDA_GRAPH
     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
 
@@ -3736,17 +3737,15 @@
 static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
 
+    ggml_cuda_set_device(cuda_ctx->device);
+
     bool use_cuda_graph = false;
     bool cuda_graph_update_required = false;
 
     // graph_optimize calls set_cuda_graph_enabled, in-case it not called (i.e. graph_compute is directly called)
     // we call it here instead.
 #ifdef USE_CUDA_GRAPH
-    if (!cuda_ctx->cuda_graph) {
-        use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
-    } else {
-        use_cuda_graph = cuda_ctx->cuda_graph && cuda_ctx->cuda_graph->cuda_graphs_enabled;
-    }
+    use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
 
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
@@ -3762,6 +3761,7 @@
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
+            cuda_ctx->cuda_graph->cuda_graphs_enabled = false;
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }

From 9a4eeafbfc9223a7922d6c2c3b2d1fa81eed5cec Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 5 Jan 2026 15:24:21 +0800
Subject: [PATCH 3/3] hotfix 1.105.3

---
 koboldcpp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index 604ad6b03..15fcebd14 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -67,7 +67,7 @@ dry_seq_break_max = 128
 extra_images_max = 4 # for kontext/qwen img
 
 # global vars
-KcppVersion = "1.105.2"
+KcppVersion = "1.105.3"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False, "restart_override_config_target":""}