From fd0d0cab03375df419aac94de70bf96bd93e0614 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 11 Dec 2025 21:03:41 +0800
Subject: [PATCH] move pipeline parallelism to a --pipelineparallel launch flag

---
 expose.h                  | 1 +
 ggml/src/ggml-backend.cpp | 4 ++--
 gpttype_adapter.cpp       | 1 +
 koboldcpp.py              | 9 ++++++++-
 src/llama-context.cpp     | 8 ++++++++
 5 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/expose.h b/expose.h
index eaa5d4ab7..67b428867 100644
--- a/expose.h
+++ b/expose.h
@@ -76,6 +76,7 @@ struct load_model_inputs
     const bool highpriority = false;
     const bool swa_support = false;
     const bool smartcache = false;
+    const bool pipelineparallel = false;
     const float lora_multiplier = 1.0f;
     const bool quiet = false;
     const int debugmode = 0;
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 84041d00f..d77658d97 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -662,8 +662,8 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES
-//kcpp reduced from 4 to 2 to try make buffer sizes smaller on multigpu
-#define GGML_SCHED_MAX_COPIES 2
+//kcpp can reduce this if you want to try to make buffer sizes smaller on multigpu
+#define GGML_SCHED_MAX_COPIES 4
 #endif
 
 struct ggml_backend_sched_split {
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index ddd8eb97d..0274c7763 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -2006,6 +2006,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     kcpp_data->use_contextshift = inputs.use_contextshift;
     kcpp_data->use_fastforward = inputs.use_fastforward;
     kcpp_data->smartcache = inputs.smartcache;
+    kcpp_pipeline_parallelism = inputs.pipelineparallel;
     if(!kcpp_data->use_fastforward && kcpp_data->smartcache)
     {
         kcpp_data->smartcache = false;
diff --git a/koboldcpp.py b/koboldcpp.py
index 9d407a801..0edbfe797 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -218,6 +218,7 @@ class load_model_inputs(ctypes.Structure):
                 ("highpriority", ctypes.c_bool),
                 ("swa_support", ctypes.c_bool),
                 ("smartcache", ctypes.c_bool),
+                ("pipelineparallel", ctypes.c_bool),
                 ("lora_multiplier", ctypes.c_float),
                 ("quiet", ctypes.c_bool),
                 ("debugmode", ctypes.c_int)]
@@ -1521,6 +1522,7 @@ def load_model(model_filename):
     inputs.highpriority = args.highpriority
     inputs.swa_support = args.useswa
     inputs.smartcache = args.smartcache
+    inputs.pipelineparallel = args.pipelineparallel
     inputs = set_backend_props(inputs)
     ret = handle.load_model(inputs)
     return ret
@@ -5146,6 +5148,7 @@ def show_gui():
    debugmode = ctk.IntVar()
    keepforeground = ctk.IntVar()
    terminalonly = ctk.IntVar()
+   pipelineparallel = ctk.IntVar()
 
    quietmode = ctk.IntVar(value=0)
    nocertifymode = ctk.IntVar(value=0)
@@ -5828,7 +5831,8 @@ def show_gui():
        "Use mlock": [usemlock, "Enables mlock, preventing the RAM used to load the model from being paged out."],
        "Debug Mode": [debugmode, "Enables debug mode, with extra info printed to the terminal."],
        "Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."],
-       "CLI Terminal Only": [terminalonly, "Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal."]
+       "CLI Terminal Only": [terminalonly, "Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal."],
+       "Pipeline Parallel": [pipelineparallel, "Enables pipeline parallelism for faster multi-GPU speeds at the cost of more memory. Only takes effect when multiple GPUs are used."],
        }
 
        for idx, (name, properties) in enumerate(hardware_boxes.items()):
@@ -6155,6 +6159,7 @@ def show_gui():
        args.remotetunnel = remotetunnel_var.get()==1
        args.foreground = keepforeground.get()==1
        args.cli = terminalonly.get()==1
+       args.pipelineparallel = pipelineparallel.get()==1
        args.quiet = quietmode.get()==1
        args.nocertify = nocertifymode.get()==1
        args.nomodel = nomodel.get()==1
@@ -6377,6 +6382,7 @@ def show_gui():
        remotetunnel_var.set(1 if "remotetunnel" in dict and dict["remotetunnel"] else 0)
        keepforeground.set(1 if "foreground" in dict and dict["foreground"] else 0)
        terminalonly.set(1 if "cli" in dict and dict["cli"] else 0)
+       pipelineparallel.set(1 if "pipelineparallel" in dict and dict["pipelineparallel"] else 0)
        quietmode.set(1 if "quiet" in dict and dict["quiet"] else 0)
        nocertifymode.set(1 if "nocertify" in dict and dict["nocertify"] else 0)
        nomodel.set(1 if "nomodel" in dict and dict["nomodel"] else 0)
@@ -8404,6 +8410,7 @@ if __name__ == '__main__':
     compatgroup2.add_argument("--showgui", help="Always show the GUI instead of launching the model right away when loading settings from a .kcpps file.", action='store_true')
     compatgroup2.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher. Overrides showgui.", action='store_true')
     advparser.add_argument("--singleinstance", help="Allows this KoboldCpp instance to be shut down by any new instance requesting the same port, preventing duplicate servers from clashing on a port.", action='store_true')
+    advparser.add_argument("--pipelineparallel", help="Enables pipeline parallelism for faster multi-GPU speeds at the cost of more memory. Only takes effect when multiple GPUs are used.", action='store_true')
 
     hordeparsergroup = parser.add_argument_group('Horde Worker Commands')
     hordeparsergroup.add_argument("--hordemodelname", metavar=('[name]'), help="Sets your AI Horde display model name.", default="")
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 839c1ba84..fed4b5cf5 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -17,6 +17,9 @@
 // llama_context
 //
 
+//kcpp: use a global flag to toggle pipeline parallelism to avoid messing with ctx params
+static bool kcpp_pipeline_parallelism = false;
+
 llama_context::llama_context(
         const llama_model & model,
               llama_context_params params) :
@@ -264,6 +267,11 @@ llama_context::llama_context(
             cparams.offload_kqv &&
             !model.has_tensor_overrides();
 
+        if(!kcpp_pipeline_parallelism)
+        {
+            pipeline_parallel = false;
+        }
+
         // pipeline parallelism requires support for async compute and events in all devices
         if (pipeline_parallel) {
             for (auto & backend : backends) {
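
A note on the paired struct edits above: the new field must be inserted at the
same position in expose.h's load_model_inputs and in the Python ctypes mirror,
because ctypes maps fields by declaration order, not by name. A minimal sketch
of the failure mode, using a hypothetical four-field reduction of the struct
(not the real layout):

    import ctypes

    class Matched(ctypes.Structure):
        # mirrors a C struct that added pipelineparallel between smartcache and quiet
        _fields_ = [("smartcache", ctypes.c_bool),
                    ("pipelineparallel", ctypes.c_bool),
                    ("quiet", ctypes.c_bool),
                    ("lora_multiplier", ctypes.c_float)]

    class Stale(ctypes.Structure):
        # what the Python side would look like if the new field were forgotten
        _fields_ = [("smartcache", ctypes.c_bool),
                    ("quiet", ctypes.c_bool),
                    ("lora_multiplier", ctypes.c_float)]

    src = Matched(smartcache=True, pipelineparallel=False, quiet=True, lora_multiplier=1.0)
    stale = Stale.from_buffer_copy(bytes(src))
    print(stale.quiet)  # False - silently reads the pipelineparallel byte instead

Adding the field at the same offset on both sides of the C/Python boundary, as
this patch does, avoids that class of silent misread.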