move pipeline parallelism to a --pipelineparallel launch flag

Author: Concedo
Date:   2025-12-11 21:03:41 +08:00
Commit: fd0d0cab03 (parent b7428048fc)
5 changed files with 20 additions and 3 deletions
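With this change, pipeline parallelism is no longer applied automatically on multi-GPU setups; it has to be requested at launch. A typical invocation (model path and any other flags are placeholders, use whatever you normally pass):

    python koboldcpp.py --model yourmodel.gguf --pipelineparallel

The flag has no effect on single-GPU or CPU-only runs.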

@@ -76,6 +76,7 @@ struct load_model_inputs
     const bool highpriority = false;
     const bool swa_support = false;
     const bool smartcache = false;
+    const bool pipelineparallel = false;
     const float lora_multiplier = 1.0f;
     const bool quiet = false;
     const int debugmode = 0;
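Note that load_model_inputs is shared with the Python launcher through ctypes, so the new member must appear at the same relative position in the Python _fields_ list (see the koboldcpp.py hunk further down): ctypes maps struct members by declaration order, and a mismatch would silently shift every field that follows.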

@@ -662,8 +662,8 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 #ifndef GGML_SCHED_MAX_COPIES
-//kcpp reduced from 4 to 2 to try make buffer sizes smaller on multigpu
-#define GGML_SCHED_MAX_COPIES 2
+//kcpp can reduce this if you want to try make buffer sizes smaller on multigpu
+#define GGML_SCHED_MAX_COPIES 4
 #endif
 struct ggml_backend_sched_split {
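GGML_SCHED_MAX_COPIES controls how many copies of each split's inputs the backend scheduler keeps so that consecutive micro-batches can be pipelined across devices. Restoring the upstream default of 4 should be harmless when the new flag is left off, because (to the best of my reading of the scheduler) the extra copies are only allocated when the scheduler is created with pipeline parallelism enabled. A minimal sketch of that relationship, with a made-up helper name rather than the actual ggml code:

    // Sketch only: sched_n_copies() is an illustrative stand-in; the real
    // scheduler stores this count as a field when it is created.
    #ifndef GGML_SCHED_MAX_COPIES
    #define GGML_SCHED_MAX_COPIES 4
    #endif

    static int sched_n_copies(bool pipeline_parallel) {
        // with pipeline parallelism off, a single copy is used, so the larger
        // default does not inflate buffer sizes in the common case
        return pipeline_parallel ? GGML_SCHED_MAX_COPIES : 1;
    }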

@@ -2006,6 +2006,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     kcpp_data->use_contextshift = inputs.use_contextshift;
     kcpp_data->use_fastforward = inputs.use_fastforward;
     kcpp_data->smartcache = inputs.smartcache;
+    kcpp_pipeline_parallelism = inputs.pipelineparallel;
     if(!kcpp_data->use_fastforward && kcpp_data->smartcache)
     {
         kcpp_data->smartcache = false;

@@ -218,6 +218,7 @@ class load_model_inputs(ctypes.Structure):
                 ("highpriority", ctypes.c_bool),
                 ("swa_support", ctypes.c_bool),
                 ("smartcache", ctypes.c_bool),
+                ("pipelineparallel", ctypes.c_bool),
                 ("lora_multiplier", ctypes.c_float),
                 ("quiet", ctypes.c_bool),
                 ("debugmode", ctypes.c_int)]
@@ -1521,6 +1522,7 @@ def load_model(model_filename):
     inputs.highpriority = args.highpriority
     inputs.swa_support = args.useswa
     inputs.smartcache = args.smartcache
+    inputs.pipelineparallel = args.pipelineparallel
     inputs = set_backend_props(inputs)
     ret = handle.load_model(inputs)
     return ret
@@ -5146,6 +5148,7 @@ def show_gui():
     debugmode = ctk.IntVar()
     keepforeground = ctk.IntVar()
     terminalonly = ctk.IntVar()
+    pipelineparallel = ctk.IntVar()
     quietmode = ctk.IntVar(value=0)
     nocertifymode = ctk.IntVar(value=0)
@@ -5828,7 +5831,8 @@ def show_gui():
         "Use mlock": [usemlock, "Enables mlock, preventing the RAM used to load the model from being paged out."],
         "Debug Mode": [debugmode, "Enables debug mode, with extra info printed to the terminal."],
         "Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."],
-        "CLI Terminal Only": [terminalonly, "Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal."]
+        "CLI Terminal Only": [terminalonly, "Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal."],
+        "Pipeline Parallel": [pipelineparallel, "Enables pipeline parallelism for faster multi-GPU generation at the cost of extra memory. Only takes effect when more than one GPU is in use."],
     }
     for idx, (name, properties) in enumerate(hardware_boxes.items()):
@@ -6155,6 +6159,7 @@ def show_gui():
     args.remotetunnel = remotetunnel_var.get()==1
     args.foreground = keepforeground.get()==1
     args.cli = terminalonly.get()==1
+    args.pipelineparallel = pipelineparallel.get()==1
     args.quiet = quietmode.get()==1
     args.nocertify = nocertifymode.get()==1
     args.nomodel = nomodel.get()==1
@@ -6377,6 +6382,7 @@ def show_gui():
     remotetunnel_var.set(1 if "remotetunnel" in dict and dict["remotetunnel"] else 0)
     keepforeground.set(1 if "foreground" in dict and dict["foreground"] else 0)
     terminalonly.set(1 if "cli" in dict and dict["cli"] else 0)
+    pipelineparallel.set(1 if "pipelineparallel" in dict and dict["pipelineparallel"] else 0)
     quietmode.set(1 if "quiet" in dict and dict["quiet"] else 0)
     nocertifymode.set(1 if "nocertify" in dict and dict["nocertify"] else 0)
     nomodel.set(1 if "nomodel" in dict and dict["nomodel"] else 0)
@@ -8404,6 +8410,7 @@ if __name__ == '__main__':
     compatgroup2.add_argument("--showgui", help="Always show the GUI instead of launching the model right away when loading settings from a .kcpps file.", action='store_true')
     compatgroup2.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher. Overrides showgui.", action='store_true')
     advparser.add_argument("--singleinstance", help="Allows this KoboldCpp instance to be shut down by any new instance requesting the same port, preventing duplicate servers from clashing on a port.", action='store_true')
+    advparser.add_argument("--pipelineparallel", help="Enables pipeline parallelism for faster multi-GPU generation at the cost of extra memory. Only takes effect when more than one GPU is in use.", action='store_true')
     hordeparsergroup = parser.add_argument_group('Horde Worker Commands')
     hordeparsergroup.add_argument("--hordemodelname", metavar=('[name]'), help="Sets your AI Horde display model name.", default="")

@@ -17,6 +17,9 @@
 // llama_context
 //

+//kcpp: use a global flag to toggle pipeline parallelism to avoid messing with ctx params
+static bool kcpp_pipeline_parallelism = false;
+
 llama_context::llama_context(
         const llama_model & model,
         llama_context_params params) :
@@ -264,6 +267,11 @@ llama_context::llama_context(
             cparams.offload_kqv &&
             !model.has_tensor_overrides();

+        if(!kcpp_pipeline_parallelism)
+        {
+            pipeline_parallel = false;
+        }
+
         // pipeline parallelism requires support for async compute and events in all devices
         if (pipeline_parallel) {
             for (auto & backend : backends) {
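Taken together, the two llama-context hunks implement an "off unless requested" gate: the context still computes its usual heuristic for whether pipeline parallelism is worthwhile, and the new file-scope flag can only veto that result, never force it on. A minimal sketch of the pattern with simplified, made-up names (the real conditions live in the constructor shown above):

    // Sketch of the gating pattern; decide_pipeline_parallel() and its
    // parameters are illustrative stand-ins, not the upstream API.
    static bool kcpp_pipeline_parallelism = false;  // set by the loader before the context is built

    static bool decide_pipeline_parallel(int n_devices, bool split_by_layer, bool offload_kqv) {
        bool heuristic = n_devices > 1 && split_by_layer && offload_kqv;  // simplified heuristic
        return heuristic && kcpp_pipeline_parallelism;  // the flag can only disable, never enable
    }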