Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
allow offloading moe to cpu with --moecpu
commit e40d26b9e7
parent 7590a0ea39
3 changed files with 21 additions and 0 deletions
expose.h (+1)
@@ -61,6 +61,7 @@ struct load_model_inputs
     const float rope_freq_scale = 1.0f;
     const float rope_freq_base = 10000.0f;
     const int moe_experts = -1;
+    const int moecpu = 0;
     const bool no_bos_token = false;
     const bool load_guidance = false;
     const char * override_kv = nullptr;
gpttype_adapter.cpp (+12)
@@ -2293,6 +2293,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     // std::string forced = "per_layer_token_embd.weight=CPU"; //this tensor on gpu is problematic on unsloth q4_0
     // tensoroverrides = (tensoroverrides=="" ? forced: (forced+","+tensoroverrides));
     // }
+    if(tensoroverrides=="" && ggml_backend_dev_count()>1 && inputs.moecpu>0)
+    {
+        for (int i = 0; i < inputs.moecpu; ++i) {
+            std::string tmp = string_format("blk\\.%d\\.ffn_(up|down|gate)_exps=CPU", i);
+            if(i>0)
+            {
+                tmp = "," + tmp;
+            }
+            tensoroverrides += tmp;
+        }
+        printf("Overriding %d MoE layers to CPU...\n", inputs.moecpu);
+    }
     if(tensoroverrides!="" && ggml_backend_dev_count()>1)
     {
         printf("Handling Override Tensors for backends: ");
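
Note: as an illustration (not part of the commit), a minimal Python sketch of the tensor-override string the loop above assembles; the helper name build_moe_overrides is hypothetical. Each comma-separated entry is a regex that pins one layer's expert FFN tensors to the CPU backend:

    # Sketch: reproduce the override string built by the C++ loop above.
    def build_moe_overrides(moecpu: int) -> str:
        # One regex per layer, matching its ffn_up/ffn_down/ffn_gate expert tensors.
        parts = [f"blk\\.{i}\\.ffn_(up|down|gate)_exps=CPU" for i in range(moecpu)]
        return ",".join(parts)

    print(build_moe_overrides(2))
    # blk\.0\.ffn_(up|down|gate)_exps=CPU,blk\.1\.ffn_(up|down|gate)_exps=CPU
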
koboldcpp.py (+8)
@@ -195,6 +195,7 @@ class load_model_inputs(ctypes.Structure):
                 ("rope_freq_scale", ctypes.c_float),
                 ("rope_freq_base", ctypes.c_float),
                 ("moe_experts", ctypes.c_int),
+                ("moecpu", ctypes.c_int),
                 ("no_bos_token", ctypes.c_bool),
                 ("load_guidance", ctypes.c_bool),
                 ("override_kv", ctypes.c_char_p),
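
Note: the position of the new ("moecpu", ctypes.c_int) entry matters, since ctypes lays out _fields_ strictly in declaration order; it must mirror where const int moecpu was inserted in load_model_inputs in expose.h. A minimal sketch of that correspondence, cut down to three assumed fields:

    import ctypes

    # Field order must match the C struct member order exactly,
    # or every later field is read at the wrong offset.
    class demo_inputs(ctypes.Structure):
        _fields_ = [("moe_experts", ctypes.c_int),    # const int moe_experts = -1;
                    ("moecpu", ctypes.c_int),         # const int moecpu = 0;  (new)
                    ("no_bos_token", ctypes.c_bool)]  # const bool no_bos_token = false;

    print(demo_inputs(moe_experts=-1, moecpu=4).moecpu)  # 4
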
@@ -1389,6 +1390,7 @@ def load_model(model_filename):
     inputs.load_guidance = args.enableguidance
     inputs.override_kv = args.overridekv.encode("UTF-8") if args.overridekv else "".encode("UTF-8")
     inputs.override_tensors = args.overridetensors.encode("UTF-8") if args.overridetensors else "".encode("UTF-8")
+    inputs.moecpu = (200 if args.moecpu > 200 else args.moecpu)
     inputs.check_slowness = (not args.highpriority and os.name == 'nt' and 'Intel' in platform.processor())
     inputs.highpriority = args.highpriority
     inputs.swa_support = args.useswa
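
Note: the 200 here caps whatever reaches load_model, including the bare-flag sentinel. Further down, --moecpu is declared with nargs='?' and const=999, so passing the flag with no value yields 999, which this line clamps to 200: per the flag's own help text, that means "applies to all layers", as 200 exceeds the layer count of typical models. A small self-contained demo of the interaction (illustrative, not from the commit):

    import argparse

    p = argparse.ArgumentParser()
    p.add_argument("--moecpu", nargs='?', const=999, type=int, default=0)

    for argv in ([], ["--moecpu"], ["--moecpu", "8"]):
        moecpu = p.parse_args(argv).moecpu
        print(argv, "->", 200 if moecpu > 200 else moecpu)
    # []                -> 0    (flag absent: default)
    # ['--moecpu']      -> 200  (bare flag: const=999, clamped)
    # ['--moecpu', '8'] -> 8
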
@@ -4489,6 +4491,7 @@ def show_gui():
     customrope_base = ctk.StringVar(value="10000")
     chatcompletionsadapter_var = ctk.StringVar(value="AutoGuess")
     moeexperts_var = ctk.StringVar(value=str(-1))
+    moecpu_var = ctk.StringVar(value=str(0))
     defaultgenamt_var = ctk.StringVar(value=str(512))
     nobostoken_var = ctk.IntVar(value=0)
     override_kv_var = ctk.StringVar(value="")
@@ -5163,6 +5166,7 @@ def show_gui():
     makecheckbox(tokens_tab, "No BOS Token", nobostoken_var, 43, tooltiptxt="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.")
     makecheckbox(tokens_tab, "Enable Guidance", enableguidance_var, 43,padx=140, tooltiptxt="Enables the use of Classifier-Free-Guidance, which allows the use of negative prompts. Has performance and memory impact.")
     makelabelentry(tokens_tab, "MoE Experts:", moeexperts_var, row=55, padx=120, singleline=True, tooltip="Override number of MoE experts.")
+    makelabelentry(tokens_tab, "MoE CPU Layers:", moecpu_var, row=55, padx=320, singleline=True, tooltip="Keep Mixture of Experts (MoE) weights of the first N layers in the CPU.", labelpadx=210)
     makelabelentry(tokens_tab, "Override KV:", override_kv_var, row=57, padx=120, singleline=True, width=150, tooltip="Advanced option to override model metadata by key, same as in llama.cpp. Mainly for debugging, not intended for general use. Types: int, float, bool, str")
     makelabelentry(tokens_tab, "Override Tensors:", override_tensors_var, row=59, padx=120, singleline=True, width=150, tooltip="Advanced option to override tensor backend selection, same as in llama.cpp.")
@@ -5454,6 +5458,7 @@ def show_gui():
     else:
         args.ropeconfig = [0.0, 10000.0]
     args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
+    args.moecpu = int(moecpu_var.get()) if moecpu_var.get()!="" else 0
     args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 512
     args.nobostoken = (nobostoken_var.get()==1)
     args.enableguidance = (enableguidance_var.get()==1)
@@ -5663,6 +5668,8 @@ def show_gui():
         customrope_var.set(0)
     if "moeexperts" in dict and dict["moeexperts"]:
         moeexperts_var.set(dict["moeexperts"])
+    if "moecpu" in dict and dict["moecpu"]:
+        moecpu_var.set(dict["moecpu"])
     if "defaultgenamt" in dict and dict["defaultgenamt"]:
         defaultgenamt_var.set(dict["defaultgenamt"])
     nobostoken_var.set(dict["nobostoken"] if ("nobostoken" in dict) else 0)
@@ -7498,6 +7505,7 @@ if __name__ == '__main__':
     advparser.add_argument("--exporttemplate", help="Exports the current selected arguments as a .kcppt template file", metavar=('[filename]'), type=str, default="")
     advparser.add_argument("--nomodel", help="Allows you to launch the GUI alone, without selecting any model.", action='store_true')
     advparser.add_argument("--moeexperts", metavar=('[num of experts]'), help="How many experts to use for MoE models (default=follow gguf)", type=int, default=-1)
+    advparser.add_argument("--moecpu", metavar=('[layers affected]'), help="Keep the Mixture of Experts (MoE) weights of the first N layers in the CPU. If no value is provided, applies to all layers.", nargs='?', const=999, type=int, default=0)
     advparser.add_argument("--defaultgenamt", help="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.", type=check_range(int,64,8192), default=512)
     advparser.add_argument("--nobostoken", help="Prevents BOS token from being added at the start of any prompt. Usually NOT recommended for most models.", action='store_true')
     advparser.add_argument("--enableguidance", help="Enables the use of Classifier-Free-Guidance, which allows the use of negative prompts. Has performance and memory impact.", action='store_true')
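
Note: taken together, --moecpu 10 keeps the expert weights of layers 0 through 9 on the CPU while the rest of the model stays on the GPU backends, and a bare --moecpu (or any value above 200) applies the override to every layer. The flag has no effect when fewer than two backend devices are present, or when --overridetensors is already set, because the new block in gpttype_load_model only generates overrides when none were supplied manually.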